From 044eb2f6afba375a914ac9d8024f8f5142bb912e Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Mon, 18 Dec 2017 20:10:56 +0000 Subject: Vendor import of llvm trunk r321017: https://llvm.org/svn/llvm-project/llvm/trunk@321017 --- lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp | 29 +- lib/Target/PowerPC/CMakeLists.txt | 3 + lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp | 86 +- lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h | 19 +- lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 6 +- .../PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp | 12 +- lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp | 1 + .../PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 14 +- lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 15 +- .../PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp | 8 +- lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h | 16 + lib/Target/PowerPC/P9InstrResources.td | 687 +++-- lib/Target/PowerPC/PPC.h | 10 + lib/Target/PowerPC/PPCAsmPrinter.cpp | 170 +- lib/Target/PowerPC/PPCBranchCoalescing.cpp | 784 ++++++ lib/Target/PowerPC/PPCBranchSelector.cpp | 2 +- lib/Target/PowerPC/PPCCTRLoops.cpp | 56 +- lib/Target/PowerPC/PPCEarlyReturn.cpp | 2 +- lib/Target/PowerPC/PPCExpandISEL.cpp | 87 +- lib/Target/PowerPC/PPCFastISel.cpp | 10 +- lib/Target/PowerPC/PPCFrameLowering.cpp | 18 +- lib/Target/PowerPC/PPCFrameLowering.h | 4 +- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 2755 +++++++++++++------- lib/Target/PowerPC/PPCISelLowering.cpp | 537 +++- lib/Target/PowerPC/PPCISelLowering.h | 73 +- lib/Target/PowerPC/PPCInstr64Bit.td | 27 +- lib/Target/PowerPC/PPCInstrAltivec.td | 38 +- lib/Target/PowerPC/PPCInstrFormats.td | 107 + lib/Target/PowerPC/PPCInstrInfo.cpp | 1417 +++++++++- lib/Target/PowerPC/PPCInstrInfo.h | 74 +- lib/Target/PowerPC/PPCInstrInfo.td | 219 +- lib/Target/PowerPC/PPCInstrVSX.td | 362 ++- lib/Target/PowerPC/PPCLoopPreIncPrep.cpp | 65 + lib/Target/PowerPC/PPCMCInstLower.cpp | 85 +- lib/Target/PowerPC/PPCMIPeephole.cpp | 966 ++++++- lib/Target/PowerPC/PPCMachineBasicBlockUtils.h | 198 ++ lib/Target/PowerPC/PPCMachineFunctionInfo.cpp | 14 + lib/Target/PowerPC/PPCMachineFunctionInfo.h | 18 + lib/Target/PowerPC/PPCPreEmitPeephole.cpp | 95 + lib/Target/PowerPC/PPCQPXLoadSplat.cpp | 8 +- lib/Target/PowerPC/PPCReduceCRLogicals.cpp | 535 ++++ lib/Target/PowerPC/PPCRegisterInfo.cpp | 47 +- lib/Target/PowerPC/PPCRegisterInfo.td | 5 + lib/Target/PowerPC/PPCScheduleP9.td | 108 +- lib/Target/PowerPC/PPCSubtarget.h | 2 +- lib/Target/PowerPC/PPCTLSDynamicCall.cpp | 2 +- lib/Target/PowerPC/PPCTargetMachine.cpp | 37 +- lib/Target/PowerPC/PPCTargetMachine.h | 9 +- lib/Target/PowerPC/PPCTargetObjectFile.h | 2 +- lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 29 +- lib/Target/PowerPC/PPCTargetTransformInfo.h | 7 +- lib/Target/PowerPC/PPCVSXFMAMutate.cpp | 44 +- lib/Target/PowerPC/PPCVSXSwapRemoval.cpp | 6 +- lib/Target/PowerPC/README.txt | 2 +- lib/Target/PowerPC/README_ALTIVEC.txt | 2 +- .../PowerPC/TargetInfo/PowerPCTargetInfo.cpp | 6 +- lib/Target/PowerPC/p9-instrs.txt | 442 ---- 57 files changed, 8116 insertions(+), 2266 deletions(-) create mode 100644 lib/Target/PowerPC/PPCBranchCoalescing.cpp create mode 100644 lib/Target/PowerPC/PPCMachineBasicBlockUtils.h create mode 100644 lib/Target/PowerPC/PPCPreEmitPeephole.cpp create mode 100644 lib/Target/PowerPC/PPCReduceCRLogicals.cpp delete mode 100644 lib/Target/PowerPC/p9-instrs.txt diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 52432a5820fbe..d6db354e02152 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -251,7 +251,6 @@ namespace { struct PPCOperand; class PPCAsmParser : public MCTargetAsmParser { - const MCInstrInfo &MII; bool IsPPC64; bool IsDarwin; @@ -298,7 +297,7 @@ class PPCAsmParser : public MCTargetAsmParser { public: PPCAsmParser(const MCSubtargetInfo &STI, MCAsmParser &, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(Options, STI), MII(MII) { + : MCTargetAsmParser(Options, STI, MII) { // Check for 64-bit vs. 32-bit pointer mode. const Triple &TheTriple = STI.getTargetTriple(); IsPPC64 = (TheTriple.getArch() == Triple::ppc64 || @@ -394,6 +393,10 @@ public: /// getEndLoc - Get the location of the last token of this operand. SMLoc getEndLoc() const override { return EndLoc; } + /// getLocRange - Get the range between the first and last token of this + /// operand. + SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + /// isPPC64 - True if this operand is for an instruction in 64-bit mode. bool isPPC64() const { return IsPPC64; } @@ -1138,6 +1141,15 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, Inst = TmpInst; break; } + case PPC::SUBPCIS: { + MCInst TmpInst; + int64_t N = Inst.getOperand(1).getImm(); + TmpInst.setOpcode(PPC::ADDPCIS); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(MCOperand::createImm(-N)); + Inst = TmpInst; + break; + } case PPC::SRDI: case PPC::SRDIo: { MCInst TmpInst; @@ -1260,6 +1272,9 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, } } +static std::string PPCMnemonicSpellCheck(StringRef S, uint64_t FBS, + unsigned VariantID = 0); + bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, @@ -1275,8 +1290,13 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return false; case Match_MissingFeature: return Error(IDLoc, "instruction use requires an option to be enabled"); - case Match_MnemonicFail: - return Error(IDLoc, "unrecognized instruction mnemonic"); + case Match_MnemonicFail: { + uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + std::string Suggestion = PPCMnemonicSpellCheck( + ((PPCOperand &)*Operands[0]).getToken(), FBS); + return Error(IDLoc, "invalid instruction" + Suggestion, + ((PPCOperand &)*Operands[0]).getLocRange()); + } case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; if (ErrorInfo != ~0ULL) { @@ -1912,6 +1932,7 @@ extern "C" void LLVMInitializePowerPCAsmParser() { #define GET_REGISTER_MATCHER #define GET_MATCHER_IMPLEMENTATION +#define GET_MNEMONIC_SPELL_CHECKER #include "PPCGenAsmMatcher.inc" // Define this matcher function after the auto-generated include so we diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt index 7ca4c1999003a..3f173787114d4 100644 --- a/lib/Target/PowerPC/CMakeLists.txt +++ b/lib/Target/PowerPC/CMakeLists.txt @@ -16,6 +16,7 @@ add_llvm_target(PowerPCCodeGen PPCBoolRetToInt.cpp PPCAsmPrinter.cpp PPCBranchSelector.cpp + PPCBranchCoalescing.cpp PPCCCState.cpp PPCCTRLoops.cpp PPCHazardRecognizers.cpp @@ -38,9 +39,11 @@ add_llvm_target(PowerPCCodeGen PPCTOCRegDeps.cpp PPCTLSDynamicCall.cpp PPCVSXCopy.cpp + PPCReduceCRLogicals.cpp PPCVSXFMAMutate.cpp PPCVSXSwapRemoval.cpp PPCExpandISEL.cpp + PPCPreEmitPeephole.cpp ) add_subdirectory(AsmParser) diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp index 
baf5902ddf584..ea709a73ebf26 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -15,6 +15,7 @@ #include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCPredicates.h" #include "PPCInstrInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" @@ -23,7 +24,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetOpcodes.h" using namespace llvm; #define DEBUG_TYPE "asm-printer" @@ -39,6 +39,12 @@ static cl::opt<bool> ShowVSRNumsAsVR("ppc-vsr-nums-as-vr", cl::Hidden, cl::init(false), cl::desc("Prints full register names with vs{31-63} as v{0-31}")); +// Prints full register names with percent symbol. +static cl::opt<bool> +FullRegNamesWithPercent("ppc-reg-with-percent-prefix", cl::Hidden, + cl::init(false), + cl::desc("Prints full register names with percent")); + #define PRINT_ALIAS_INSTR #include "PPCGenAsmWriter.inc" @@ -84,7 +90,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } } - + if ((MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) && MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { O << "\tmr "; @@ -94,7 +100,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, printAnnotation(O, Annot); return; } - + if (MI->getOpcode() == PPC::RLDICR || MI->getOpcode() == PPC::RLDICR_32) { unsigned char SH = MI->getOperand(2).getImm(); @@ -161,7 +167,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } } - + if (!printAliasInstr(MI, O)) printInstruction(MI, O); printAnnotation(O, Annot); @@ -259,7 +265,7 @@ void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, } llvm_unreachable("Invalid predicate code"); } - + assert(StringRef(Modifier) == "reg" && "Need to specify 'cc', 'pm' or 'reg' as predicate op modifier!"); printOperand(MI, OpNo+1, O); @@ -445,13 +451,57 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo, O << '@' << MCSymbolRefExpr::getVariantKindName(refExp.getKind()); } +/// showRegistersWithPercentPrefix - Check if this register name should be +/// printed with a percentage symbol as prefix. +bool PPCInstPrinter::showRegistersWithPercentPrefix(const char *RegName) const { + if (!FullRegNamesWithPercent || TT.isOSDarwin() || TT.getOS() == Triple::AIX) + return false; + + switch (RegName[0]) { + default: + return false; + case 'r': + case 'f': + case 'q': + case 'v': + case 'c': + return true; + } +} + +/// getVerboseConditionRegName - This method expands the condition register +/// when requested explicitly or targeting Darwin. +const char *PPCInstPrinter::getVerboseConditionRegName(unsigned RegNum, + unsigned RegEncoding) + const { + if (!TT.isOSDarwin() && !FullRegNames) + return nullptr; + if (RegNum < PPC::CR0EQ || RegNum > PPC::CR7UN) + return nullptr; + const char *CRBits[] = { + "lt", "gt", "eq", "un", + "4*cr1+lt", "4*cr1+gt", "4*cr1+eq", "4*cr1+un", + "4*cr2+lt", "4*cr2+gt", "4*cr2+eq", "4*cr2+un", + "4*cr3+lt", "4*cr3+gt", "4*cr3+eq", "4*cr3+un", + "4*cr4+lt", "4*cr4+gt", "4*cr4+eq", "4*cr4+un", + "4*cr5+lt", "4*cr5+gt", "4*cr5+eq", "4*cr5+un", + "4*cr6+lt", "4*cr6+gt", "4*cr6+eq", "4*cr6+un", + "4*cr7+lt", "4*cr7+gt", "4*cr7+eq", "4*cr7+un" + }; + return CRBits[RegEncoding]; +} + +// showRegistersWithPrefix - This method determines whether registers +// should be number-only or include the prefix.
+bool PPCInstPrinter::showRegistersWithPrefix() const { + if (TT.getOS() == Triple::AIX) + return false; + return TT.isOSDarwin() || FullRegNamesWithPercent || FullRegNames; +} /// stripRegisterPrefix - This method strips the character prefix from a -/// register name so that only the number is left. Used by for linux asm. +/// register name so that only the number is left. static const char *stripRegisterPrefix(const char *RegName) { - if (FullRegNames || ShowVSRNumsAsVR) - return RegName; - switch (RegName[0]) { case 'r': case 'f': @@ -462,7 +512,7 @@ static const char *stripRegisterPrefix(const char *RegName) { return RegName + 1; case 'c': if (RegName[1] == 'r') return RegName + 2; } - + return RegName; } @@ -487,20 +537,24 @@ void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, Reg = PPC::VSX32 + (Reg - PPC::VF0); } - const char *RegName = getRegisterName(Reg); - // The linux and AIX assembler does not take register prefixes. - if (!isDarwinSyntax()) + const char *RegName; + RegName = getVerboseConditionRegName(Reg, MRI.getEncodingValue(Reg)); + if (RegName == nullptr) + RegName = getRegisterName(Reg); + if (showRegistersWithPercentPrefix(RegName)) + O << "%"; + if (!showRegistersWithPrefix()) RegName = stripRegisterPrefix(RegName); - + O << RegName; return; } - + if (Op.isImm()) { O << Op.getImm(); return; } - + assert(Op.isExpr() && "unknown operand kind in printOperand"); Op.getExpr()->print(O, &MAI); } diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h index 9c79ffb1176c0..f000fbb98110d 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -14,21 +14,24 @@ #ifndef LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H #define LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H +#include "llvm/ADT/Triple.h" #include "llvm/MC/MCInstPrinter.h" namespace llvm { class PPCInstPrinter : public MCInstPrinter { - bool IsDarwin; + Triple TT; +private: + bool showRegistersWithPercentPrefix(const char *RegName) const; + bool showRegistersWithPrefix() const; + const char *getVerboseConditionRegName(unsigned RegNum, + unsigned RegEncoding) const; + public: PPCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, bool isDarwin) - : MCInstPrinter(MAI, MII, MRI), IsDarwin(isDarwin) {} - - bool isDarwinSyntax() const { - return IsDarwin; - } - + const MCRegisterInfo &MRI, Triple T) - : MCInstPrinter(MAI, MII, MRI), TT(T) {} + void printRegName(raw_ostream &OS, unsigned RegNo) const override; void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) override; diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index bdad2fe8714fd..2a1de244da923 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -204,7 +204,8 @@ namespace { public: DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, false) { } - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + std::unique_ptr<MCObjectWriter> + createObjectWriter(raw_pwrite_stream &OS) const override { bool is64 = getPointerSize() == 8; return createPPCMachObjectWriter( OS, @@ -220,7 +221,8 @@ namespace { ELFPPCAsmBackend(const Target &T, bool IsLittleEndian, uint8_t OSABI) : PPCAsmBackend(T, IsLittleEndian), OSABI(OSABI) { } - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + std::unique_ptr<MCObjectWriter>
+ createObjectWriter(raw_pwrite_stream &OS) const override { bool is64 = getPointerSize() == 8; return createPPCELFObjectWriter(OS, is64, isLittleEndian(), OSABI); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 1488bd5b0be61..44ee9733b16e1 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" @@ -416,10 +417,9 @@ bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, } } -MCObjectWriter *llvm::createPPCELFObjectWriter(raw_pwrite_stream &OS, - bool Is64Bit, - bool IsLittleEndian, - uint8_t OSABI) { - MCELFObjectTargetWriter *MOTW = new PPCELFObjectWriter(Is64Bit, OSABI); - return createELFObjectWriter(MOTW, OS, IsLittleEndian); +std::unique_ptr<MCObjectWriter> +llvm::createPPCELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, + bool IsLittleEndian, uint8_t OSABI) { + auto MOTW = llvm::make_unique<PPCELFObjectWriter>(Is64Bit, OSABI); + return createELFObjectWriter(std::move(MOTW), OS, IsLittleEndian); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index d30bf1a56e8aa..8ac461b96b88c 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -24,6 +24,7 @@ PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) { } IsLittleEndian = false; + SeparatorString = "@"; CommentString = ";"; ExceptionsType = ExceptionHandling::DwarfCFI; diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index e8f220ea54576..a1e4e07b25af4 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -94,15 +94,6 @@ static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM, - CodeModel::Model &CM) { - if (CM == CodeModel::Default) { - if (!TT.isOSDarwin() && - (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le)) - CM = CodeModel::Medium; - } -} - namespace { class PPCTargetAsmStreamer : public PPCTargetStreamer { @@ -248,7 +239,7 @@ static MCInstPrinter *createPPCMCInstPrinter(const Triple &T, const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) { - return new PPCInstPrinter(MAI, MII, MRI, T.isOSDarwin()); + return new PPCInstPrinter(MAI, MII, MRI, T); } extern "C" void LLVMInitializePowerPCTargetMC() { @@ -257,9 +248,6 @@ extern "C" void LLVMInitializePowerPCTargetMC() { // Register the MC asm info. RegisterMCAsmInfoFn C(*T, createPPCMCAsmInfo); - // Register the MC codegen info. - TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts); - // Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createPPCMCInstrInfo); diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index 893233ee2300f..80a74c09a598a 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -19,6 +19,7 @@ #include "llvm/Support/MathExtras.h" #include <cstdint> +#include <memory> namespace llvm { @@ -47,12 +48,15 @@ MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI, const MCTargetOptions &Options); /// Construct an PPC ELF object writer. -MCObjectWriter *createPPCELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, - bool IsLittleEndian, uint8_t OSABI); +std::unique_ptr<MCObjectWriter> createPPCELFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, + bool IsLittleEndian, + uint8_t OSABI); /// Construct a PPC Mach-O object writer. -MCObjectWriter *createPPCMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, - uint32_t CPUType, - uint32_t CPUSubtype); +std::unique_ptr<MCObjectWriter> createPPCMachObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype); /// Returns true iff Val consists of one contiguous run of 1s with any number of /// 0s on either side. The 1s are allowed to wrap from LSB to MSB, so @@ -97,6 +101,7 @@ static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) { // Defines symbolic names for the PowerPC instructions. // #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_SCHED_ENUM #include "PPCGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp index d5506277ca880..4b9055ec70419 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp @@ -374,10 +374,10 @@ void PPCMachObjectWriter::RecordPPCRelocation( Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); } -MCObjectWriter *llvm::createPPCMachObjectWriter(raw_pwrite_stream &OS, - bool Is64Bit, uint32_t CPUType, - uint32_t CPUSubtype) { +std::unique_ptr<MCObjectWriter> +llvm::createPPCMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, + uint32_t CPUType, uint32_t CPUSubtype) { return createMachObjectWriter( - new PPCMachObjectWriter(Is64Bit, CPUType, CPUSubtype), OS, + llvm::make_unique<PPCMachObjectWriter>(Is64Bit, CPUType, CPUSubtype), OS, /*IsLittleEndian=*/false); } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h index acea600fbb0da..603ac960133f9 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h @@ -70,6 +70,22 @@ namespace PPC { /// Assume the condition register is set by MI(a,b), return the predicate if /// we modify the instructions such that condition register is set by MI(b,a). Predicate getSwappedPredicate(Predicate Opcode); + + /// Return the condition without hint bits. + inline unsigned getPredicateCondition(Predicate Opcode) { + return (unsigned)(Opcode & ~BR_HINT_MASK); + } + + /// Return the hint bits of the predicate. + inline unsigned getPredicateHint(Predicate Opcode) { + return (unsigned)(Opcode & BR_HINT_MASK); + } + + /// Return predicate consisting of specified condition and hint bits.
+ inline Predicate getPredicate(unsigned Condition, unsigned Hint) { + return (Predicate)((Condition & ~BR_HINT_MASK) | + (Hint & BR_HINT_MASK)); + } } } diff --git a/lib/Target/PowerPC/P9InstrResources.td b/lib/Target/PowerPC/P9InstrResources.td index aea022f887667..dc6ed16e53ce7 100644 --- a/lib/Target/PowerPC/P9InstrResources.td +++ b/lib/Target/PowerPC/P9InstrResources.td @@ -12,11 +12,29 @@ // is listed here. Instructions in this file belong to itinerary classes that // have instructions with different resource requirements. // +// The makeup of the P9 CPU is modeled as follows: +// - Each CPU is made up of two superslices. +// - Each superslice is made up of two slices. Therefore, there are 4 slices +// for each CPU. +// - Up to 6 instructions can be dispatched to each CPU. Three per superslice. +// - Each CPU has: +// - One CY (Crypto) unit P9_CY_* +// - One DFU (Decimal Floating Point and Quad Precision) unit P9_DFU_* +// - Two PM (Permute) units. One on each superslice. P9_PM_* +// - Two DIV (Fixed Point Divide) units. One on each superslice. P9_DIV_* +// - Four ALU (Fixed Point Arithmetic) units. One on each slice. P9_ALU_* +// - Four DP (Floating Point) units. One on each slice. P9_DP_* +// This also includes fixed point multiply add. +// - Four AGEN (Address Generation) units. One for each slice. P9_AGEN_* +// - Four Load/Store Queues. P9_LS_* +// - Each set of instructions will require a number of these resources. //===----------------------------------------------------------------------===// - +// Two cycle ALU vector operation that uses an entire superslice. +// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines +// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C], + DISP_1C, DISP_1C, DISP_1C], (instrs VADDCUW, VADDUBM, @@ -26,47 +44,41 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, VAND, VANDC, VCMPEQUB, - VCMPEQUBo, VCMPEQUD, - VCMPEQUDo, VCMPEQUH, - VCMPEQUHo, VCMPEQUW, - VCMPEQUWo, - VCMPGTSB, - VCMPGTSBo, - VCMPGTSD, - VCMPGTSDo, - VCMPGTSH, - VCMPGTSHo, - VCMPGTSW, - VCMPGTSWo, - VCMPGTUB, - VCMPGTUBo, - VCMPGTUD, - VCMPGTUDo, - VCMPGTUH, - VCMPGTUHo, - VCMPGTUW, - VCMPGTUWo, VCMPNEB, - VCMPNEBo, VCMPNEH, - VCMPNEHo, VCMPNEW, - VCMPNEWo, VCMPNEZB, - VCMPNEZBo, VCMPNEZH, - VCMPNEZHo, VCMPNEZW, - VCMPNEZWo, VEQV, VEXTSB2D, VEXTSB2W, VEXTSH2D, VEXTSH2W, VEXTSW2D, + VRLB, + VRLD, + VRLDMI, + VRLDNM, + VRLH, + VRLW, + VRLWMI, + VRLWNM, + VSRAB, + VSRAD, + VSRAH, + VSRAW, + VSRB, + VSRD, + VSRH, + VSRW, + VSLB, + VSLD, + VSLH, + VSLW, VMRGEW, VMRGOW, VNAND, @@ -77,9 +89,7 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, VORC, VPOPCNTB, VPOPCNTH, - VPOPCNTW, VSEL, - VSUBCUW, VSUBUBM, VSUBUDM, VSUBUHM, @@ -98,6 +108,8 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, XVNEGDP, XVNEGSP, XVXEXPDP, + XVIEXPSP, + XVXEXPSP, XXLAND, XXLANDC, XXLEQV, @@ -107,28 +119,128 @@ def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, XXLORf, XXLORC, XXLXOR, - XXSEL -)>; - -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], - (instrs + XXSEL, XSABSQP, XSCPSGNQP, XSIEXPQP, XSNABSQP, XSNEGQP, - XSXEXPQP, - XSABSDP, - XSCPSGNDP, - XSIEXPDP, + XSXEXPQP +)>; + +// Restricted Dispatch ALU operation for 3 cycles. The operation runs on a +// single slice. However, since it is Restricted it requires all 3 dispatches +// (DISP) for that superslice.
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + FCMPUS, + FCMPUD, + XSTSTDCDP, + XSTSTDCSP +)>; + +// Standard Dispatch ALU operation for 3 cycles. Only one slice used. +def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C], + (instrs + XSMAXCDP, + XSMAXDP, + XSMAXJDP, + XSMINCDP, + XSMINDP, + XSMINJDP, + XSTDIVDP, + XSTSQRTDP, + XSCMPEQDP, + XSCMPEXPDP, + XSCMPGEDP, + XSCMPGTDP, + XSCMPODP, + XSCMPUDP, + XSXSIGDP, + XSCVSPDPN +)>; + +// Standard Dispatch ALU operation for 2 cycles. Only one slice used. +def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], + (instrs + ADDIStocHA, + ADDItocL, + MCRF, + MCRXRX, + SLD, + SRD, + SRAD, + SRADI, + RLDIC, XSNABSDP, + XSXEXPDP, + XSABSDP, XSNEGDP, - XSXEXPDP + XSCPSGNDP )>; -def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// Restricted Dispatch ALU operation for 2 cycles. The operation runs on a +// single slice. However, since it is Restricted it requires all 3 dispatches +// (DISP) for that superslice. +def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs + RLDCL, + RLDCR, + RLDIMI, + RLDICL, + RLDICR, + RLDICL_32_64, + XSIEXPDP, + FMR, + FABSD, + FABSS, + FNABSD, + FNABSS, + FNEGD, + FNEGS, + FCPSGND, + FCPSGNS +)>; +// Three cycle ALU vector operation that uses an entire superslice. +// Uses both ALU units (the even ALUE and odd ALUO units), two pipelines +// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. +def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], + (instrs + VBPERMD, + VABSDUB, + VABSDUH, + VABSDUW, + VADDUBS, + VADDUHS, + VADDUWS, + VAVGSB, + VAVGSH, + VAVGSW, + VAVGUB, + VAVGUH, + VAVGUW, + VCMPEQFP, + VCMPEQFPo, + VCMPGEFP, + VCMPGEFPo, + VCMPBFP, + VCMPBFPo, + VCMPGTFP, + VCMPGTFPo, + VCLZB, + VCLZD, + VCLZH, + VCLZW, + VCTZB, + VCTZD, + VCTZH, + VCTZW, + VADDSBS, + VADDSHS, + VADDSWS, + VMINFP, VMINSB, VMINSD, VMINSH, @@ -137,55 +249,54 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C VMINUD, VMINUH, VMINUW, + VMAXFP, + VMAXSB, + VMAXSD, + VMAXSH, + VMAXSW, + VMAXUB, + VMAXUD, + VMAXUH, + VMAXUW, + VPOPCNTW, VPOPCNTD, VPRTYBD, VPRTYBW, - VRLB, - VRLD, - VRLDMI, - VRLDNM, - VRLH, - VRLW, - VRLWMI, - VRLWNM, VSHASIGMAD, VSHASIGMAW, - VSLB, - VSLD, - VSLH, - VSLW, - VSRAB, - VSRAD, - VSRAH, - VSRAW, - VSRB, - VSRD, - VSRH, - VSRW, VSUBSBS, VSUBSHS, VSUBSWS, VSUBUBS, VSUBUHS, VSUBUWS, - XSCMPEQDP, - XSCMPEXPDP, - XSCMPGEDP, - XSCMPGTDP, - XSCMPODP, - XSCMPUDP, - XSCVSPDPN, - XSMAXCDP, - XSMAXDP, - XSMAXJDP, - XSMINCDP, - XSMINDP, - XSMINJDP, - XSTDIVDP, - XSTSQRTDP, - XSTSTDCDP, - XSTSTDCSP, - XSXSIGDP, + VSUBCUW, + VCMPGTSB, + VCMPGTSBo, + VCMPGTSD, + VCMPGTSDo, + VCMPGTSH, + VCMPGTSHo, + VCMPGTSW, + VCMPGTSWo, + VCMPGTUB, + VCMPGTUBo, + VCMPGTUD, + VCMPGTUDo, + VCMPGTUH, + VCMPGTUHo, + VCMPGTUW, + VCMPGTUWo, + VCMPNEBo, + VCMPNEHo, + VCMPNEWo, + VCMPNEZBo, + VCMPNEZHo, + VCMPNEZWo, + VCMPEQUBo, + VCMPEQUDo, + VCMPEQUHo, + VCMPEQUWo, XVCMPEQDP, XVCMPEQDPo, XVCMPEQSP, @@ -198,7 +309,6 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C XVCMPGTDPo, XVCMPGTSP, XVCMPGTSPo, - XVIEXPSP, XVMAXDP, XVMAXSP, XVMINDP, @@ -209,58 +319,15 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C XVTSQRTSP, XVTSTDCDP, XVTSTDCSP, - XVXEXPSP, XVXSIGDP, XVXSIGSP )>; -def : InstRW<[P9_ALUE_4C, P9_ALUO_4C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], - (instrs - VABSDUB, - VABSDUH, - VABSDUW, - VADDSBS,
- VADDSHS, - VADDSWS, - VADDUBS, - VADDUHS, - VADDUWS, - VAVGSB, - VAVGSH, - VAVGSW, - VAVGUB, - VAVGUH, - VAVGUW, - VBPERMD, - VCLZB, - VCLZD, - VCLZH, - VCLZW, - VCMPBFP, - VCMPBFPo, - VCMPGTFP, - VCMPGTFPo, - VCTZB, - VCTZD, - VCTZH, - VCTZW, - VMAXFP, - VMAXSB, - VMAXSD, - VMAXSH, - VMAXSW, - VMAXUB, - VMAXUD, - VMAXUH, - VMAXUW, - VMINFP, - VCMPEQFP, - VCMPEQFPo, - VCMPGEFP, - VCMPGEFPo -)>; - -def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 7 cycle DP vector operation that uses an entire superslice. +// Uses both DP units (the even DPE and odd DPO units), two pipelines +// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. +def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs VADDFP, VCTSXS, @@ -367,8 +434,47 @@ def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], VSUMSWS )>; +// 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three +// dispatch units for the superslice. def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs + FRSP, + FRIND, + FRINS, + FRIPD, + FRIPS, + FRIZD, + FRIZS, + FRIMD, + FRIMS, + FRE, + FRES, + FRSQRTE, + FRSQRTES, + FMADDS, + FMADD, + FMSUBS, + FMSUB, + FNMADDS, + FNMADD, + FNMSUBS, + FNMSUB, + FSELD, + FSELS, + FADDS, + FMULS, + FMUL, + FSUBS, + FCFID, + FCTID, + FCTIDZ, + FCFIDU, + FCFIDS, + FCFIDUS, + FCTIDUZ, + FCTIWUZ, + FCTIW, + FCTIWZ, XSMADDADP, XSMADDASP, XSMADDMDP, @@ -389,7 +495,19 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], XSNMSUBMSP )>; +// 7 cycle Restricted DP operation and one 2 cycle ALU operation. +// The DP is restricted so we need a full 5 dispatches. +def : InstRW<[P9_DPOpAndALUOp_9C, IP_EXEC_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + FMULo, + FMADDo, + FMSUBo, + FNMADDo, + FNMSUBo +)>; +// 7 cycle DP operation. One DP unit, one EXEC pipeline and two dispatch units. def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs XSADDDP, @@ -397,8 +515,10 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C], XSCVDPHP, XSCVDPSP, XSCVDPSXDS, + XSCVDPSXDSs, XSCVDPSXWS, XSCVDPUXDS, + XSCVDPUXDSs, XSCVDPUXWS, XSCVHPDP, XSCVSPDP, @@ -421,7 +541,10 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C], XSCVDPSPN )>; -def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C], +// Three Cycle PM operation. Only one PM unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], (instrs VBPERMQ, VCLZLSBB, @@ -469,7 +592,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C], VSLO, VSLV, VSPLTB, + VSPLTBs, VSPLTH, + VSPLTHs, VSPLTISB, VSPLTISH, VSPLTISW, @@ -498,6 +623,9 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C], XXSLDWI, XXSPLTIB, XXSPLTW, + XXSPLTWs, + XXPERMDI, + XXPERMDIs, VADDCUQ, VADDECUQ, VADDEUQM, @@ -517,7 +645,10 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C], XSXSIGQP )>; -def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. 
+def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSADDQP, XSADDQPO, @@ -536,7 +667,10 @@ def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], XSSUBQPO )>; -def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSMADDQP, XSMADDQPO, @@ -550,45 +684,57 @@ def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], XSNMSUBQPO )>; -def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSDIVQP, XSDIVQPO )>; -def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 76 Cycle DFU operation. Only one DFU unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], (instrs XSSQRTQP, XSSQRTQPO )>; -// Load Operation in IIC_LdStLFD - +// 5 Cycle load uses a single slice. def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C], (instrs LXSDX, LXVD2X, LXSIWZX, LXV, - LXSD + LXVX, + LXSD, + DFLOADf64, + XFLOADf64 )>; -def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], +// 4 Cycle load uses a single slice. +def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C], (instrs - LFIWZX, - LFDX, - LFD + COPY )>; -def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +// 4 Cycle Restricted load uses a single slice but the dispatch for the whole +// superslice. +def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - LXSSPX, - LXSIWAX, - LXSSP + LFIWZX, + LFDX, + LFD )>; -def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, +// Cracked Restricted Load instruction. +// Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU +// operations cannot be done at the same time and so their latencies are added. +// Full 6 dispatches are required as this is both cracked and restricted. +def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs LFIWAX, @@ -596,14 +742,38 @@ def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, LFS )>; -def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C], +// Cracked Load instruction. +// Requires consecutive Load and ALU pieces totaling 7 cycles. The Load and ALU +// operations cannot be done at the same time and so their latencies are added. +// Full 4 dispatches are required as this is a cracked instruction. +def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + LXSSPX, + LXSIWAX, + LXSSP, + DFLOADf32, + XFLOADf32, + LIWAX, + LIWZX +)>; + +// Cracked Load that requires the PM resource. +// Since the Load and the PM cannot be done at the same time the latencies are +// added. Requires 8 cycles. +// Since the PM requires the full superslice we need both EXECE, EXECO pipelines +// as well as 3 dispatches for the PM. The Load requires the remaining 2 +// dispatches. 
+def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs LXVDSX, + LXVWSX, LXVW4X )>; -// Store Operations in IIC_LdStSTFD. - +// Single slice Restricted store operation. The restricted operation requires +// all three dispatches for the superslice. def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], (instrs STFS, @@ -613,74 +783,88 @@ def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], STFDX, STXSDX, STXSSPX, - STXSIWX + STXSIWX, + DFSTOREf32, + DFSTOREf64, + XFSTOREf32, + XFSTOREf64, + STIWX )>; -def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C], +// Store operation that requires the whole superslice. +def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs STXVD2X, STXVW4X )>; -// Divide Operations in IIC_IntDivW, IIC_IntDivD. - -def : InstRW<[P9_DIV_16C_8, IP_EXECE_1C, DISP_1C, DISP_1C], +// 16 Cycle DIV operation. Only one DIV unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs DIVW, - DIVWU + DIVWU, + MODSW )>; -def : InstRW<[P9_DIV_24C_8, IP_EXECE_1C, DISP_1C, DISP_1C], +// 24 Cycle DIV operation. Only one DIV unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs DIVWE, DIVD, DIVWEU, - DIVDU + DIVDU, + MODSD, + MODUD, + MODUW )>; -def : InstRW<[P9_DIV_40C_8, IP_EXECE_1C, DISP_1C, DISP_1C], +// 40 Cycle DIV operation. Only one DIV unit per superslice so we use the whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs DIVDE, DIVDEU )>; -def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +// Cracked DIV and ALU operation. Requires one full slice for the ALU operation +// and one full superslice for the DIV operation since there is only one DIV +// per superslice. Latency of DIV plus ALU is 26. +def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs + DIVDo, + DIVDUo, DIVWEo, DIVWEUo )>; -def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +// Cracked DIV and ALU operation. Requires one full slice for the ALU operation +// and one full superslice for the DIV operation since there is only one DIV +// per superslice. Latency of DIV plus ALU is 42. +def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs DIVDEo, DIVDEUo )>; -// Rotate Operations in IIC_IntRotateD, IIC_IntRotateDI -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], - (instrs - SLD, - SRD, - SRAD, - SRADI, - RLDIC -)>; - -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], - (instrs - RLDCL, - RLDCR, - RLDIMI, - RLDICL, - RLDICR, - RLDICL_32_64 -)>; - // CR access instructions in _BrMCR, IIC_BrMCRX. +// Cracked, restricted, ALU operations. +// Here the two ALU ops can actually be done in parallel and therefore the +// latencies are not added together. 
Otherwise this is like having two +// instructions running together on two pipelines and 6 dispatches. +// ALU ops are 2 cycles each. def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs @@ -690,13 +874,12 @@ def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, MTCRF8 )>; -def : InstRW<[P9_ALU_5C, IP_EXEC_1C, DISP_1C, DISP_1C], - (instrs - MCRF, - MCRXRX -)>; - -def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C, +// Cracked, restricted, ALU operations. +// Here the two ALU ops can actually be done in parallel and therefore the +// latencies are not added together. Otherwise this is like having two +// instructions running together on two pipelines and 6 dispatches. +// ALU ops are 3 cycles each. +def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs MCRFS @@ -704,93 +887,71 @@ def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C, // FP Div instructions in IIC_FPDivD and IIC_FPDivS. +// 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - FDIV, - XSDIVDP + FDIV )>; -def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 33 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU. +def : InstRW<[P9_DPOpAndALUOp_35C_8, IP_EXEC_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - FDIVS, - XSDIVSP + FDIVo )>; -def : InstRW<[P9_DP_24C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 33 Cycle DP Instruction. Takes one slice and 2 dispatches. +def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs - XVDIVSP + XSDIVDP )>; -def : InstRW<[P9_DP_33C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C], +// 22 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. +def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], (instrs - XVDIVDP + FDIVS )>; -// FP Instructions in IIC_FPGeneral, IIC_FPFused +// 22 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU. +def : InstRW<[P9_DPOpAndALUOp_24C_5, IP_EXEC_1C, IP_EXEC_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + FDIVSo +)>; -def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 22 Cycle DP Instruction. Takes one slice and 2 dispatches. +def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs - FRSP, - FRIND, - FRINS, - FRIPD, - FRIPS, - FRIZD, - FRIZS, - FRIMD, - FRIMS, - FRE, - FRES, - FRSQRTE, - FRSQRTES, - FMADDS, - FMADD, - FMSUBS, - FMSUB, - FNMADDS, - FNMADD, - FNMSUBS, - FNMSUB, - FSELD, - FSELS, - FADDS, - FMULS, - FMUL, - FSUBS, - FCFID, - FCTID, - FCTIDZ, - FCFIDU, - FCFIDS, - FCFIDUS, - FCTIDUZ, - FCTIWUZ, - FCTIW, - FCTIWZ + XSDIVSP )>; -def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 24 Cycle DP Vector Instruction. Takes one full superslice. +// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given +// superslice. +def : InstRW<[P9_DPE_24C_8, P9_DPO_24C_8, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs - FMR, - FABSD, - FABSS, - FNABSD, - FNABSS, - FNEGD, - FNEGS, - FCPSGND, - FCPSGNS + XVDIVSP )>; -def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +// 33 Cycle DP Vector Instruction. Takes one full superslice. +// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given +// superslice. 
+def : InstRW<[P9_DPE_33C_8, P9_DPO_33C_8, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C, DISP_1C], (instrs - FCMPUS, - FCMPUD + XVDIVDP )>; // Load instructions in IIC_LdStLFDU and IIC_LdStLFDUX. -def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C, +// Instruction cracked into three pieces. One Load and two ALU operations. +// The Load and one of the ALU ops cannot be run at the same time and so the +// latencies are added together for 6 cycles. The remaining ALU is 2 cycles. +// Both the load and the ALU that depends on it are restricted and so they take +// a total of 6 dispatches. The final 2 dispatches come from the second ALU op. +// The two EXEC pipelines are for the 2 ALUs while the AGEN is for the load. +def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], @@ -799,10 +960,32 @@ def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C, LFSUX )>; -def : InstRW<[P9_LS_5C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, +// Cracked instruction made up of a Load and an ALU. The ALU does not depend on +// the load and so it can be run at the same time as the load. The load is also +// restricted. 3 dispatches are from the restricted load while the other two +// are from the ALU. The AGEN pipeline is from the load and the EXEC pipeline +// is required for the ALU. +def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], (instrs LFDU, LFDUX )>; +// Crypto Instructions + +// 6 Cycle CY operation. Only one CY unit per CPU so we use a whole +// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// dispatches. +def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], + (instrs + VPMSUMB, + VPMSUMD, + VPMSUMH, + VPMSUMW, + VCIPHER, + VCIPHERLAST, + VNCIPHER, + VNCIPHERLAST, + VSBOX +)>; diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index ad92ac8ce1207..dfdec246e8686 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -26,8 +26,10 @@ namespace llvm { class PassRegistry; class FunctionPass; class MachineInstr; + class MachineOperand; class AsmPrinter; class MCInst; + class MCOperand; FunctionPass *createPPCCTRLoops(); #ifndef NDEBUG @@ -39,20 +41,28 @@ namespace llvm { FunctionPass *createPPCVSXCopyPass(); FunctionPass *createPPCVSXFMAMutatePass(); FunctionPass *createPPCVSXSwapRemovalPass(); + FunctionPass *createPPCReduceCRLogicalsPass(); FunctionPass *createPPCMIPeepholePass(); FunctionPass *createPPCBranchSelectionPass(); + FunctionPass *createPPCBranchCoalescingPass(); FunctionPass *createPPCQPXLoadSplatPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL); FunctionPass *createPPCTLSDynamicCallPass(); FunctionPass *createPPCBoolRetToIntPass(); FunctionPass *createPPCExpandISELPass(); + FunctionPass *createPPCPreEmitPeepholePass(); void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP, bool isDarwin); + bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, + MCOperand &OutMO, AsmPrinter &AP, + bool isDarwin); void initializePPCVSXFMAMutatePass(PassRegistry&); void initializePPCBoolRetToIntPass(PassRegistry&); void initializePPCExpandISELPass(PassRegistry &); + void initializePPCPreEmitPeepholePass(PassRegistry &); void initializePPCTLSDynamicCallPass(PassRegistry &); + void initializePPCMIPeepholePass(PassRegistry&); extern char &PPCVSXFMAMutateID; namespace PPCII { diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp
b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 841b8c5144641..17451900840a4 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -19,6 +19,7 @@ #include "InstPrinter/PPCInstPrinter.h" #include "MCTargetDesc/PPCMCExpr.h" #include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCPredicates.h" #include "PPC.h" #include "PPCInstrInfo.h" #include "PPCMachineFunctionInfo.h" @@ -506,7 +507,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInst TmpInst; bool isPPC64 = Subtarget->isPPC64(); bool isDarwin = TM.getTargetTriple().isOSDarwin(); - const Module *M = MF->getFunction()->getParent(); + const Module *M = MF->getFunction().getParent(); PICLevel::Level PL = M->getPICLevel(); // Lower multi-instruction pseudo operations. @@ -520,7 +521,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return LowerPATCHPOINT(SM, *MI); case PPC::MoveGOTtoLR: { - // Transform %LR = MoveGOTtoLR + // Transform %lr = MoveGOTtoLR // Into this: bl _GLOBAL_OFFSET_TABLE_@local-4 // _GLOBAL_OFFSET_TABLE_@local-4 (instruction preceding // _GLOBAL_OFFSET_TABLE_) has exactly one instruction: @@ -541,7 +542,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } case PPC::MovePCtoLR: case PPC::MovePCtoLR8: { - // Transform %LR = MovePCtoLR + // Transform %lr = MovePCtoLR // Into this, where the label is the PIC base: // bl L1$pb // L1$pb: @@ -559,9 +560,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::UpdateGBR: { - // Transform %Rd = UpdateGBR(%Rt, %Ri) - // Into: lwz %Rt, .L0$poff - .L0$pb(%Ri) - // add %Rd, %Rt, %Ri + // Transform %rd = UpdateGBR(%rt, %ri) + // Into: lwz %rt, .L0$poff - .L0$pb(%ri) + // add %rd, %rt, %ri // Get the offset from the GOT Base Register to the GOT LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); MCSymbol *PICOffset = @@ -576,7 +577,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { const MCOperand TR = TmpInst.getOperand(1); const MCOperand PICR = TmpInst.getOperand(0); - // Step 1: lwz %Rt, .L$poff - .L$pb(%Ri) + // Step 1: lwz %rt, .L$poff - .L$pb(%ri) TmpInst.getOperand(1) = MCOperand::createExpr(MCBinaryExpr::createSub(Exp, PB, OutContext)); TmpInst.getOperand(0) = TR; @@ -591,7 +592,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::LWZtoc: { - // Transform %R3 = LWZtoc , %R2 + // Transform %r3 = LWZtoc @min1, %r2 LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); // Change the opcode to LWZ, and the global address operand to be a @@ -635,7 +636,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::LDtocCPT: case PPC::LDtocBA: case PPC::LDtoc: { - // Transform %X3 = LDtoc , %X2 + // Transform %x3 = LDtoc @min1, %x2 LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); // Change the opcode to LD, and the global address operand to be a @@ -666,7 +667,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } case PPC::ADDIStocHA: { - // Transform %Xd = ADDIStocHA %X2, + // Transform %xd = ADDIStocHA %x2, @sym LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); // Change the opcode to ADDIS8. If the global address is external, has @@ -713,7 +714,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::LDtocL: { - // Transform %Xd = LDtocL , %Xs + // Transform %xd = LDtocL @sym, %xs LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); // Change the opcode to LD. 
If the global address is external, has @@ -713,7 +714,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::LDtocL: { - // Transform %Xd = LDtocL , %Xs + // Transform %xd = LDtocL @sym, %xs LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); // Change the opcode to LD. If the global address is external, has @@ -756,7 +757,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::ADDItocL: { - // Transform %Xd = ADDItocL %Xs, + // Transform %xd = ADDItocL %xs, @sym LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); // Change the opcode to ADDI8. If the global address is external, then @@ -787,8 +788,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::ADDISgotTprelHA: { - // Transform: %Xd = ADDISgotTprelHA %X2, - // Into: %Xd = ADDIS8 %X2, sym@got@tprel@ha + // Transform: %xd = ADDISgotTprelHA %x2, @sym + // Into: %xd = ADDIS8 %x2, sym@got@tprel@ha assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); @@ -804,7 +805,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } case PPC::LDgotTprelL: case PPC::LDgotTprelL32: { - // Transform %Xd = LDgotTprelL , %Xs + // Transform %xd = LDgotTprelL @sym, %xs LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); // Change the opcode to LD. @@ -865,8 +866,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::ADDIStlsgdHA: { - // Transform: %Xd = ADDIStlsgdHA %X2, - // Into: %Xd = ADDIS8 %X2, sym@got@tlsgd@ha + // Transform: %xd = ADDIStlsgdHA %x2, @sym + // Into: %xd = ADDIS8 %x2, sym@got@tlsgd@ha assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); @@ -881,11 +882,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::ADDItlsgdL: - // Transform: %Xd = ADDItlsgdL %Xs, - // Into: %Xd = ADDI8 %Xs, sym@got@tlsgd@l + // Transform: %xd = ADDItlsgdL %xs, @sym + // Into: %xd = ADDI8 %xs, sym@got@tlsgd@l case PPC::ADDItlsgdL32: { - // Transform: %Rd = ADDItlsgdL32 %Rs, - // Into: %Rd = ADDI %Rs, sym@got@tlsgd + // Transform: %rd = ADDItlsgdL32 %rs, @sym + // Into: %rd = ADDI %rs, sym@got@tlsgd const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); @@ -901,17 +902,17 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::GETtlsADDR: - // Transform: %X3 = GETtlsADDR %X3, + // Transform: %x3 = GETtlsADDR %x3, @sym // Into: BL8_NOP_TLS __tls_get_addr(sym at tlsgd) case PPC::GETtlsADDR32: { - // Transform: %R3 = GETtlsADDR32 %R3, + // Transform: %r3 = GETtlsADDR32 %r3, @sym // Into: BL_TLS __tls_get_addr(sym at tlsgd)@PLT EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSGD); return; } case PPC::ADDIStlsldHA: { - // Transform: %Xd = ADDIStlsldHA %X2, - // Into: %Xd = ADDIS8 %X2, sym@got@tlsld@ha + // Transform: %xd = ADDIStlsldHA %x2, @sym + // Into: %xd = ADDIS8 %x2, sym@got@tlsld@ha assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC"); const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); @@ -926,11 +927,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::ADDItlsldL: - // Transform: %Xd = ADDItlsldL %Xs, - // Into: %Xd = ADDI8 %Xs, sym@got@tlsld@l + // Transform: %xd = ADDItlsldL %xs, @sym + // Into: %xd = ADDI8 %xs, sym@got@tlsld@l case PPC::ADDItlsldL32: { - // Transform: %Rd = ADDItlsldL32 %Rs, - // Into: %Rd = ADDI %Rs, sym@got@tlsld + // Transform: %rd = ADDItlsldL32 %rs, @sym + // Into: %rd = ADDI %rs, sym@got@tlsld const MachineOperand &MO = MI->getOperand(2); const GlobalValue
*GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); @@ -946,20 +947,20 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::GETtlsldADDR: - // Transform: %X3 = GETtlsldADDR %X3, + // Transform: %x3 = GETtlsldADDR %x3, @sym // Into: BL8_NOP_TLS __tls_get_addr(sym at tlsld) case PPC::GETtlsldADDR32: { - // Transform: %R3 = GETtlsldADDR32 %R3, + // Transform: %r3 = GETtlsldADDR32 %r3, @sym // Into: BL_TLS __tls_get_addr(sym at tlsld)@PLT EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSLD); return; } case PPC::ADDISdtprelHA: - // Transform: %Xd = ADDISdtprelHA %Xs, - // Into: %Xd = ADDIS8 %Xs, sym@dtprel@ha + // Transform: %xd = ADDISdtprelHA %xs, @sym + // Into: %xd = ADDIS8 %xs, sym@dtprel@ha case PPC::ADDISdtprelHA32: { - // Transform: %Rd = ADDISdtprelHA32 %Rs, - // Into: %Rd = ADDIS %Rs, sym@dtprel@ha + // Transform: %rd = ADDISdtprelHA32 %rs, @sym + // Into: %rd = ADDIS %rs, sym@dtprel@ha const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); @@ -975,11 +976,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::ADDIdtprelL: - // Transform: %Xd = ADDIdtprelL %Xs, - // Into: %Xd = ADDI8 %Xs, sym@dtprel@l + // Transform: %xd = ADDIdtprelL %xs, @sym + // Into: %xd = ADDI8 %xs, sym@dtprel@l case PPC::ADDIdtprelL32: { - // Transform: %Rd = ADDIdtprelL32 %Rs, - // Into: %Rd = ADDI %Rs, sym@dtprel@l + // Transform: %rd = ADDIdtprelL32 %rs, @sym + // Into: %rd = ADDI %rs, sym@dtprel@l const MachineOperand &MO = MI->getOperand(2); const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = getSymbol(GValue); @@ -996,8 +997,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::MFOCRF: case PPC::MFOCRF8: if (!Subtarget->hasMFOCRF()) { - // Transform: %R3 = MFOCRF %CR7 - // Into: %R3 = MFCR ;; cr7 + // Transform: %r3 = MFOCRF %cr7 + // Into: %r3 = MFCR ;; cr7 unsigned NewOpcode = MI->getOpcode() == PPC::MFOCRF ? PPC::MFCR : PPC::MFCR8; OutStreamer->AddComment(PPCInstPrinter:: @@ -1010,8 +1011,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::MTOCRF: case PPC::MTOCRF8: if (!Subtarget->hasMFOCRF()) { - // Transform: %CR7 = MTOCRF %R3 - // Into: MTCRF mask, %R3 ;; cr7 + // Transform: %cr7 = MTOCRF %r3 + // Into: MTCRF mask, %r3 ;; cr7 unsigned NewOpcode = MI->getOpcode() == PPC::MTOCRF ? 
PPC::MTCRF : PPC::MTCRF8; unsigned Mask = 0x80 >> OutContext.getRegisterInfo() @@ -1089,7 +1090,61 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) { recordSled(BeginOfSled, *MI, SledKind::FUNCTION_ENTER); break; } - case TargetOpcode::PATCHABLE_FUNCTION_EXIT: { + case TargetOpcode::PATCHABLE_RET: { + unsigned RetOpcode = MI->getOperand(0).getImm(); + MCInst RetInst; + RetInst.setOpcode(RetOpcode); + for (const auto &MO : + make_range(std::next(MI->operands_begin()), MI->operands_end())) { + MCOperand MCOp; + if (LowerPPCMachineOperandToMCOperand(MO, MCOp, *this, false)) + RetInst.addOperand(MCOp); + } + + bool IsConditional; + if (RetOpcode == PPC::BCCLR) { + IsConditional = true; + } else if (RetOpcode == PPC::TCRETURNdi8 || RetOpcode == PPC::TCRETURNri8 || + RetOpcode == PPC::TCRETURNai8) { + break; + } else if (RetOpcode == PPC::BLR8 || RetOpcode == PPC::TAILB8) { + IsConditional = false; + } else { + EmitToStreamer(*OutStreamer, RetInst); + break; + } + + MCSymbol *FallthroughLabel; + if (IsConditional) { + // Before: + // bgtlr cr0 + // + // After: + // ble cr0, .end + // .p2align 3 + // .begin: + // blr # lis 0, FuncId[16..32] + // nop # li 0, FuncId[0..15] + // std 0, -8(1) + // mflr 0 + // bl __xray_FunctionExit + // mtlr 0 + // blr + // .end: + // + // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number + // of instructions change. + FallthroughLabel = OutContext.createTempSymbol(); + EmitToStreamer( + *OutStreamer, + MCInstBuilder(PPC::BCC) + .addImm(PPC::InvertPredicate( + static_cast<PPC::Predicate>(MI->getOperand(1).getImm()))) + .addReg(MI->getOperand(2).getReg()) + .addExpr(MCSymbolRefExpr::create(FallthroughLabel, OutContext))); + RetInst = MCInst(); + RetInst.setOpcode(PPC::BLR8); + } // .p2align 3 // .begin: // b(lr)? # lis 0, FuncId[16..32] @@ -1098,24 +1153,14 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) { // mflr 0 // bl __xray_FunctionExit // mtlr 0 - // .end: // b(lr)? // // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number // of instructions change. - const MachineInstr *Next = [&] { - MachineBasicBlock::const_iterator It(MI); - assert(It != MI->getParent()->end()); - ++It; - assert(It->isReturn()); - return &*It; - }(); OutStreamer->EmitCodeAlignment(8); MCSymbol *BeginOfSled = OutContext.createTempSymbol(); OutStreamer->EmitLabel(BeginOfSled); - MCInst TmpInst; - LowerPPCMachineInstrToMCInst(Next, TmpInst, *this, false); - EmitToStreamer(*OutStreamer, TmpInst); + EmitToStreamer(*OutStreamer, RetInst); EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP)); EmitToStreamer( *OutStreamer, @@ -1127,15 +1172,18 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) { OutContext.getOrCreateSymbol("__xray_FunctionExit"), OutContext))); EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0)); + EmitToStreamer(*OutStreamer, RetInst); + if (IsConditional) + OutStreamer->EmitLabel(FallthroughLabel); recordSled(BeginOfSled, *MI, SledKind::FUNCTION_EXIT); break; } + case TargetOpcode::PATCHABLE_FUNCTION_EXIT: + llvm_unreachable("PATCHABLE_FUNCTION_EXIT should never be emitted"); case TargetOpcode::PATCHABLE_TAIL_CALL: - case TargetOpcode::PATCHABLE_RET: - // PPC's tail call instruction, e.g. PPC::TCRETURNdi8, doesn't really - // lower to a PPC::B instruction. The PPC::B instruction is generated - // before it, and handled by the normal case. - llvm_unreachable("Tail call is handled in the normal case. 
See comments" + // TODO: Define a trampoline `__xray_FunctionTailExit` and differentiate a + // normal function exit from a tail exit. + llvm_unreachable("Tail call is handled in the normal case. See comments " "around this assert."); } } @@ -1180,7 +1228,7 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { // linux/ppc32 - Normal entry label. if (!Subtarget->isPPC64() && (!isPositionIndependent() || - MF->getFunction()->getParent()->getPICLevel() == PICLevel::SmallPIC)) + MF->getFunction().getParent()->getPICLevel() == PICLevel::SmallPIC)) return AsmPrinter::EmitFunctionEntryLabel(); if (!Subtarget->isPPC64()) { diff --git a/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/lib/Target/PowerPC/PPCBranchCoalescing.cpp new file mode 100644 index 0000000000000..32d801b13ded9 --- /dev/null +++ b/lib/Target/PowerPC/PPCBranchCoalescing.cpp @@ -0,0 +1,784 @@ +//===-- CoalesceBranches.cpp - Coalesce blocks with the same condition ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Coalesce basic blocks guarded by the same branch condition into a single +/// basic block. +/// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-branch-coalescing" + +STATISTIC(NumBlocksCoalesced, "Number of blocks coalesced"); +STATISTIC(NumPHINotMoved, "Number of PHI Nodes that cannot be merged"); +STATISTIC(NumBlocksNotCoalesced, "Number of blocks not coalesced"); + +namespace llvm { + void initializePPCBranchCoalescingPass(PassRegistry&); +} + +//===----------------------------------------------------------------------===// +// PPCBranchCoalescing +//===----------------------------------------------------------------------===// +/// +/// Improve scheduling by coalescing branches that depend on the same condition. +/// This pass looks for blocks that are guarded by the same branch condition +/// and attempts to merge the blocks together. Such opportunities arise from +/// the expansion of select statements in the IR. +/// +/// This pass does not handle implicit operands on branch statements. In order +/// to run on targets that use implicit operands, changes need to be made in the +/// canCoalesceBranch and canMerge methods. 
+/// +/// Example: the following LLVM IR +/// +/// %test = icmp eq i32 %x, 0 +/// %tmp1 = select i1 %test, double %a, double 2.000000e-03 +/// %tmp2 = select i1 %test, double %b, double 5.000000e-03 +/// +/// expands to the following machine code: +/// +/// %bb.0: derived from LLVM BB %entry +/// Live Ins: %f1 %f3 %x6 +/// +/// %0 = COPY %f1; F8RC:%0 +/// %5 = CMPLWI killed %4, 0; CRRC:%5 GPRC:%4 +/// %8 = LXSDX %zero8, killed %7, implicit %rm; +/// mem:LD8[ConstantPool] F8RC:%8 G8RC:%7 +/// BCC 76, %5, <%bb.2>; CRRC:%5 +/// Successors according to CFG: %bb.1(?%) %bb.2(?%) +/// +/// %bb.1: derived from LLVM BB %entry +/// Predecessors according to CFG: %bb.0 +/// Successors according to CFG: %bb.2(?%) +/// +/// %bb.2: derived from LLVM BB %entry +/// Predecessors according to CFG: %bb.0 %bb.1 +/// %9 = PHI %8, <%bb.1>, %0, <%bb.0>; +/// F8RC:%9,%8,%0 +/// +/// BCC 76, %5, <%bb.4>; CRRC:%5 +/// Successors according to CFG: %bb.3(?%) %bb.4(?%) +/// +/// %bb.3: derived from LLVM BB %entry +/// Predecessors according to CFG: %bb.2 +/// Successors according to CFG: %bb.4(?%) +/// +/// %bb.4: derived from LLVM BB %entry +/// Predecessors according to CFG: %bb.2 %bb.3 +/// %13 = PHI %12, <%bb.3>, %2, <%bb.2>; +/// F8RC:%13,%12,%2 +/// +/// BLR8 implicit %lr8, implicit %rm, implicit %f1 +/// +/// When this pattern is detected, branch coalescing will try to collapse +/// it by moving code in %bb.2 to %bb.0 and/or %bb.4 and removing %bb.3. +/// +/// If all conditions are met, the IR should collapse to: +/// +/// %bb.0: derived from LLVM BB %entry +/// Live Ins: %f1 %f3 %x6 +/// +/// %0 = COPY %f1; F8RC:%0 +/// %5 = CMPLWI killed %4, 0; CRRC:%5 GPRC:%4 +/// %8 = LXSDX %zero8, killed %7, implicit %rm; +/// mem:LD8[ConstantPool] F8RC:%8 G8RC:%7 +/// +/// BCC 76, %5, <%bb.4>; CRRC:%5 +/// Successors according to CFG: %bb.1(0x2aaaaaaa / 0x80000000 = 33.33%) +/// %bb.4(0x55555554 / 0x80000000 = 66.67%) +/// +/// %bb.1: derived from LLVM BB %entry +/// Predecessors according to CFG: %bb.0 +/// Successors according to CFG: %bb.4(0x40000000 / 0x80000000 = 50.00%) +/// +/// %bb.4: derived from LLVM BB %entry +/// Predecessors according to CFG: %bb.0 %bb.1 +/// %9 = PHI %8, <%bb.1>, %0, <%bb.0>; +/// F8RC:%9,%8,%0 +/// %13 = PHI %12, <%bb.1>, %2, <%bb.0>; +/// F8RC:%13,%12,%2 +/// +/// BLR8 implicit %lr8, implicit %rm, implicit %f1 +/// +/// Branch Coalescing does not split blocks; it moves everything in the same +/// direction, ensuring it does not break use/definition semantics. +/// +/// PHI nodes and their corresponding use instructions are moved to the +/// successor block if there are no uses of them within the successor block's +/// PHI nodes. PHI node ordering cannot be assumed. +/// +/// Non-PHI instructions can be moved up to the predecessor basic block or +/// down to the successor basic block following any PHI instructions. Whether +/// they move up or down depends on whether the register(s) defined in the +/// instructions are used in the current block or in any PHI instructions at +/// the beginning of the successor block.
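+///
+/// For orientation, here is a hedged, hypothetical C++ source fragment
+/// (invented for illustration; it is not taken from the LLVM test suite)
+/// of the kind a front end lowers to the two-select IR pattern above:
+///
+///    double blend(int x, double a, double b) {
+///      double t1 = (x == 0) ? a : 2.000000e-03; // first select
+///      double t2 = (x == 0) ? b : 5.000000e-03; // second select
+///      return t1 + t2;
+///    }
+///
+/// Both conditional expressions test the same condition (x == 0), so after
+/// instruction selection each one becomes a BCC on the same CR register,
+/// producing the back-to-back coalescable triangles shown above.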
+ +namespace { + +class PPCBranchCoalescing : public MachineFunctionPass { + struct CoalescingCandidateInfo { + MachineBasicBlock *BranchBlock; // Block containing the branch + MachineBasicBlock *BranchTargetBlock; // Block branched to + MachineBasicBlock *FallThroughBlock; // Fall-through if branch not taken + SmallVector<MachineOperand, 4> Cond; + bool MustMoveDown; + bool MustMoveUp; + + CoalescingCandidateInfo(); + void clear(); + }; + + MachineDominatorTree *MDT; + MachinePostDominatorTree *MPDT; + const TargetInstrInfo *TII; + MachineRegisterInfo *MRI; + + void initialize(MachineFunction &F); + bool canCoalesceBranch(CoalescingCandidateInfo &Cand); + bool identicalOperands(ArrayRef<MachineOperand> OperandList1, + ArrayRef<MachineOperand> OperandList2) const; + bool validateCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const; + +public: + static char ID; + + PPCBranchCoalescing() : MachineFunctionPass(ID) { + initializePPCBranchCoalescingPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return "Branch Coalescing"; } + + bool mergeCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion); + bool canMoveToBeginning(const MachineInstr &MI, + const MachineBasicBlock &MBB) const; + bool canMoveToEnd(const MachineInstr &MI, + const MachineBasicBlock &MBB) const; + bool canMerge(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const; + void moveAndUpdatePHIs(MachineBasicBlock *SourceRegionMBB, + MachineBasicBlock *TargetRegionMBB); + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // End anonymous namespace. + +char PPCBranchCoalescing::ID = 0; +/// createPPCBranchCoalescingPass - returns an instance of the Branch Coalescing +/// Pass +FunctionPass *llvm::createPPCBranchCoalescingPass() { + return new PPCBranchCoalescing(); +} + +INITIALIZE_PASS_BEGIN(PPCBranchCoalescing, DEBUG_TYPE, + "Branch Coalescing", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_END(PPCBranchCoalescing, DEBUG_TYPE, "Branch Coalescing", + false, false) + +PPCBranchCoalescing::CoalescingCandidateInfo::CoalescingCandidateInfo() + : BranchBlock(nullptr), BranchTargetBlock(nullptr), + FallThroughBlock(nullptr), MustMoveDown(false), MustMoveUp(false) {} + +void PPCBranchCoalescing::CoalescingCandidateInfo::clear() { + BranchBlock = nullptr; + BranchTargetBlock = nullptr; + FallThroughBlock = nullptr; + Cond.clear(); + MustMoveDown = false; + MustMoveUp = false; +} + +void PPCBranchCoalescing::initialize(MachineFunction &MF) { + MDT = &getAnalysis<MachineDominatorTree>(); + MPDT = &getAnalysis<MachinePostDominatorTree>(); + TII = MF.getSubtarget().getInstrInfo(); + MRI = &MF.getRegInfo(); +} + +/// +/// Analyze the branch statement for the given candidate to determine +/// if it can be coalesced. If the branch can be coalesced, then the +/// BranchTargetBlock and the FallThroughBlock are recorded in the specified +/// Candidate.
+/// +///\param[in,out] Cand The coalescing candidate to analyze +///\return true if and only if the branch can be coalesced, false otherwise +/// +bool PPCBranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) { + DEBUG(dbgs() << "Determine if branch block " << Cand.BranchBlock->getNumber() + << " can be coalesced:"); + MachineBasicBlock *FalseMBB = nullptr; + + if (TII->analyzeBranch(*Cand.BranchBlock, Cand.BranchTargetBlock, FalseMBB, + Cand.Cond)) { + DEBUG(dbgs() << "TII unable to Analyze Branch - skip\n"); + return false; + } + + for (auto &I : Cand.BranchBlock->terminators()) { + DEBUG(dbgs() << "Looking at terminator : " << I << "\n"); + if (!I.isBranch()) + continue; + + // The analyzeBranch method does not include any implicit operands. + // This is not an issue on PPC but must be handled on other targets. + // For this pass to be made target-independent, the analyzeBranch API + // needs to be updated to support implicit operands and there would + // need to be a way to verify that any implicit operands would not be + // clobbered by merging blocks. This would include identifying the + // implicit operands as well as the basic block they are defined in. + // This could be done by changing the analyzeBranch API to have it also + // record and return the implicit operands and the blocks where they are + // defined. Alternatively, the BranchCoalescing code would need to be + // extended to identify the implicit operands. The analysis in canMerge + // must then be extended to prove that none of the implicit operands are + // changed in the blocks that are combined during coalescing. + if (I.getNumOperands() != I.getNumExplicitOperands()) { + DEBUG(dbgs() << "Terminator contains implicit operands - skip : " << I + << "\n"); + return false; + } + } + + if (Cand.BranchBlock->isEHPad() || Cand.BranchBlock->hasEHPadSuccessor()) { + DEBUG(dbgs() << "EH Pad - skip\n"); + return false; + } + + // For now only consider triangles (i.e., BranchTargetBlock is set, + // FalseMBB is null, and BranchTargetBlock is a successor to BranchBlock) + if (!Cand.BranchTargetBlock || FalseMBB || + !Cand.BranchBlock->isSuccessor(Cand.BranchTargetBlock)) { + DEBUG(dbgs() << "Does not form a triangle - skip\n"); + return false; + } + + // Ensure there are only two successors + if (Cand.BranchBlock->succ_size() != 2) { + DEBUG(dbgs() << "Does not have 2 successors - skip\n"); + return false; + } + + // Sanity check - the block must be able to fall through + assert(Cand.BranchBlock->canFallThrough() && + "Expecting the block to fall through!"); + + // We have already ensured there are exactly two successors to + // BranchBlock and that BranchTargetBlock is a successor to BranchBlock. + // Ensure the single fall-through block is empty. + MachineBasicBlock *Succ = + (*Cand.BranchBlock->succ_begin() == Cand.BranchTargetBlock) + ?
*Cand.BranchBlock->succ_rbegin() + : *Cand.BranchBlock->succ_begin(); + + assert(Succ && "Expecting a valid fall-through block\n"); + + if (!Succ->empty()) { + DEBUG(dbgs() << "Fall-through block contains code -- skip\n"); + return false; + } + + if (!Succ->isSuccessor(Cand.BranchTargetBlock)) { + DEBUG(dbgs() + << "Successor of fall through block is not branch taken block\n"); + return false; + } + + Cand.FallThroughBlock = Succ; + DEBUG(dbgs() << "Valid Candidate\n"); + return true; +} + +/// +/// Determine if the two operand lists are identical +/// +/// \param[in] OpList1 operand list +/// \param[in] OpList2 operand list +/// \return true if and only if the operand lists are identical +/// +bool PPCBranchCoalescing::identicalOperands( + ArrayRef<MachineOperand> OpList1, ArrayRef<MachineOperand> OpList2) const { + + if (OpList1.size() != OpList2.size()) { + DEBUG(dbgs() << "Operand list is different size\n"); + return false; + } + + for (unsigned i = 0; i < OpList1.size(); ++i) { + const MachineOperand &Op1 = OpList1[i]; + const MachineOperand &Op2 = OpList2[i]; + + DEBUG(dbgs() << "Op1: " << Op1 << "\n" + << "Op2: " << Op2 << "\n"); + + if (Op1.isIdenticalTo(Op2)) { + // filter out instructions with physical-register uses + if (Op1.isReg() && TargetRegisterInfo::isPhysicalRegister(Op1.getReg()) + // If the physical register is constant then we can assume the value + // has not changed between uses. + && !(Op1.isUse() && MRI->isConstantPhysReg(Op1.getReg()))) { + DEBUG(dbgs() << "The operands are not provably identical.\n"); + return false; + } + DEBUG(dbgs() << "Op1 and Op2 are identical!\n"); + continue; + } + + // If the operands are not identical, but are registers, check to see if the + // definition of the register produces the same value. If they produce the + // same value, consider them to be identical. + if (Op1.isReg() && Op2.isReg() && + TargetRegisterInfo::isVirtualRegister(Op1.getReg()) && + TargetRegisterInfo::isVirtualRegister(Op2.getReg())) { + MachineInstr *Op1Def = MRI->getVRegDef(Op1.getReg()); + MachineInstr *Op2Def = MRI->getVRegDef(Op2.getReg()); + if (TII->produceSameValue(*Op1Def, *Op2Def, MRI)) { + DEBUG(dbgs() << "Op1Def: " << *Op1Def << " and " << *Op2Def + << " produce the same value!\n"); + } else { + DEBUG(dbgs() << "Operands produce different values\n"); + return false; + } + } else { + DEBUG(dbgs() << "The operands are not provably identical.\n"); + return false; + } + } + + return true; +} + +/// +/// Moves ALL PHI instructions in SourceMBB to the beginning of TargetMBB +/// and updates them to refer to the new block. PHI node ordering +/// cannot be assumed, so it does not matter where the PHI instructions +/// are moved to in TargetMBB.
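+///
+/// Sketch of the update (virtual-register and block numbers hypothetical):
+/// PHI operands are laid out as value/block pairs, so the block operands sit
+/// at indices 2, 4, .... A PHI such as
+///    %9 = PHI %8, <%bb.1>, %0, <SourceMBB>
+/// has any block operand equal to SourceMBB retargeted to TargetMBB before
+/// the whole range [begin, getFirstNonPHI) is spliced into TargetMBB.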
+/// +/// \param[in] SourceMBB block to move PHI instructions from +/// \param[in] TargetMBB block to move PHI instructions to +/// +void PPCBranchCoalescing::moveAndUpdatePHIs(MachineBasicBlock *SourceMBB, + MachineBasicBlock *TargetMBB) { + + MachineBasicBlock::iterator MI = SourceMBB->begin(); + MachineBasicBlock::iterator ME = SourceMBB->getFirstNonPHI(); + + if (MI == ME) { + DEBUG(dbgs() << "SourceMBB contains no PHI instructions.\n"); + return; + } + + // Update all PHI instructions in SourceMBB and move to top of TargetMBB + for (MachineBasicBlock::iterator Iter = MI; Iter != ME; Iter++) { + MachineInstr &PHIInst = *Iter; + for (unsigned i = 2, e = PHIInst.getNumOperands() + 1; i != e; i += 2) { + MachineOperand &MO = PHIInst.getOperand(i); + if (MO.getMBB() == SourceMBB) + MO.setMBB(TargetMBB); + } + } + TargetMBB->splice(TargetMBB->begin(), SourceMBB, MI, ME); +} + +/// +/// This function checks if MI can be moved to the beginning of the TargetMBB +/// following PHI instructions. An MI instruction can be moved to the beginning +/// of the TargetMBB if there are no uses of it within the TargetMBB PHI nodes. +/// +/// \param[in] MI the machine instruction to move. +/// \param[in] TargetMBB the machine basic block to move to +/// \return true if it is safe to move MI to beginning of TargetMBB, +/// false otherwise. +/// +bool PPCBranchCoalescing::canMoveToBeginning(const MachineInstr &MI, + const MachineBasicBlock &TargetMBB + ) const { + + DEBUG(dbgs() << "Checking if " << MI << " can move to beginning of " + << TargetMBB.getNumber() << "\n"); + + for (auto &Def : MI.defs()) { // Looking at Def + for (auto &Use : MRI->use_instructions(Def.getReg())) { + if (Use.isPHI() && Use.getParent() == &TargetMBB) { + DEBUG(dbgs() << " *** used in a PHI -- cannot move ***\n"); + return false; + } + } + } + + DEBUG(dbgs() << " Safe to move to the beginning.\n"); + return true; +} + +/// +/// This function checks if MI can be moved to the end of the TargetMBB, +/// immediately before the first terminator. An MI instruction can be moved +/// to the end of the TargetMBB if no PHI node defines what MI uses within +/// its own MBB. +/// +/// \param[in] MI the machine instruction to move. +/// \param[in] TargetMBB the machine basic block to move to +/// \return true if it is safe to move MI to end of TargetMBB, +/// false otherwise. +/// +bool PPCBranchCoalescing::canMoveToEnd(const MachineInstr &MI, + const MachineBasicBlock &TargetMBB + ) const { + + DEBUG(dbgs() << "Checking if " << MI << " can move to end of " + << TargetMBB.getNumber() << "\n"); + + for (auto &Use : MI.uses()) { + if (Use.isReg() && TargetRegisterInfo::isVirtualRegister(Use.getReg())) { + MachineInstr *DefInst = MRI->getVRegDef(Use.getReg()); + if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) { + DEBUG(dbgs() << " *** Cannot move this instruction ***\n"); + return false; + } else { + DEBUG(dbgs() << " *** def is in another block -- safe to move!\n"); + } + } + } + + DEBUG(dbgs() << " Safe to move to the end.\n"); + return true; +} + +/// +/// This method checks to ensure the two coalescing candidates follow the +/// expected pattern required for coalescing. +/// +/// \param[in] SourceRegion The candidate to move statements from +/// \param[in] TargetRegion The candidate to move statements to +/// \return true if all instructions in SourceRegion.BranchBlock can be merged +/// into a block in TargetRegion; false otherwise.
+/// +bool PPCBranchCoalescing::validateCandidates( + CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const { + + if (TargetRegion.BranchTargetBlock != SourceRegion.BranchBlock) + llvm_unreachable("Expecting SourceRegion to immediately follow TargetRegion"); + else if (!MDT->dominates(TargetRegion.BranchBlock, SourceRegion.BranchBlock)) + llvm_unreachable("Expecting TargetRegion to dominate SourceRegion"); + else if (!MPDT->dominates(SourceRegion.BranchBlock, TargetRegion.BranchBlock)) + llvm_unreachable("Expecting SourceRegion to post-dominate TargetRegion"); + else if (!TargetRegion.FallThroughBlock->empty() || + !SourceRegion.FallThroughBlock->empty()) + llvm_unreachable("Expecting fall-through blocks to be empty"); + + return true; +} + +/// +/// This method determines whether the two coalescing candidates can be merged. +/// In order to be merged, all instructions must be able to +/// 1. Move to the beginning of the SourceRegion.BranchTargetBlock; +/// 2. Move to the end of the TargetRegion.BranchBlock. +/// Merging involves moving the instructions in the +/// TargetRegion.BranchTargetBlock (also SourceRegion.BranchBlock). +/// +/// This function first tries to move instructions from the +/// TargetRegion.BranchTargetBlock down, to the beginning of the +/// SourceRegion.BranchTargetBlock. This is not possible if any register defined +/// in TargetRegion.BranchTargetBlock is used in a PHI node in the +/// SourceRegion.BranchTargetBlock. In this case, check whether the statement +/// can be moved up, to the end of the TargetRegion.BranchBlock (immediately +/// before the branch statement). If it cannot move, then these blocks cannot +/// be merged. +/// +/// Note that there is no analysis for moving instructions past the fall-through +/// blocks because they are confirmed to be empty. An assert fires if they +/// are not. +/// +/// \param[in] SourceRegion The candidate to move statements from +/// \param[in] TargetRegion The candidate to move statements to +/// \return true if all instructions in SourceRegion.BranchBlock can be merged +/// into a block in TargetRegion, false otherwise. +/// +bool PPCBranchCoalescing::canMerge(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) const { + if (!validateCandidates(SourceRegion, TargetRegion)) + return false; + + // Walk through PHI nodes first and see if they force the merge into the + // SourceRegion.BranchTargetBlock.
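+  // Concretely (register numbers hypothetical): if a PHI here defines %9
+  // and a PHI in SourceRegion.BranchTargetBlock reads %9, merging is
+  // abandoned; if %9 is instead read by a non-PHI instruction in this same
+  // block, every movable instruction is forced down.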
+ for (MachineBasicBlock::iterator + I = SourceRegion.BranchBlock->instr_begin(), + E = SourceRegion.BranchBlock->getFirstNonPHI(); + I != E; ++I) { + for (auto &Def : I->defs()) + for (auto &Use : MRI->use_instructions(Def.getReg())) { + if (Use.isPHI() && Use.getParent() == SourceRegion.BranchTargetBlock) { + DEBUG(dbgs() << "PHI " << *I << " defines register used in another " + "PHI within branch target block -- can't merge\n"); + NumPHINotMoved++; + return false; + } + if (Use.getParent() == SourceRegion.BranchBlock) { + DEBUG(dbgs() << "PHI " << *I + << " defines register used in this " + "block -- all must move down\n"); + SourceRegion.MustMoveDown = true; + } + } + } + + // Walk through the remaining instructions to see if they should be merged + // into TargetRegion.BranchBlock (up) or SourceRegion.BranchTargetBlock (down) + for (MachineBasicBlock::iterator + I = SourceRegion.BranchBlock->getFirstNonPHI(), + E = SourceRegion.BranchBlock->end(); + I != E; ++I) { + if (!canMoveToBeginning(*I, *SourceRegion.BranchTargetBlock)) { + DEBUG(dbgs() << "Instruction " << *I + << " cannot move down - must move up!\n"); + SourceRegion.MustMoveUp = true; + } + if (!canMoveToEnd(*I, *TargetRegion.BranchBlock)) { + DEBUG(dbgs() << "Instruction " << *I + << " cannot move up - must move down!\n"); + SourceRegion.MustMoveDown = true; + } + } + + return (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) ? false : true; +} + +/// Merge the instructions from SourceRegion.BranchBlock, +/// SourceRegion.BranchTargetBlock, and SourceRegion.FallThroughBlock into +/// TargetRegion.BranchBlock, TargetRegion.BranchTargetBlock and +/// TargetRegion.FallThroughBlock respectively. +/// +/// The successors for blocks in TargetRegion will be updated to use the +/// successors from blocks in SourceRegion. Finally, the blocks in SourceRegion +/// will be removed from the function. +/// +/// A region consists of a BranchBlock, a FallThroughBlock, and a +/// BranchTargetBlock. Branch coalescing works on patterns where the +/// TargetRegion's BranchTargetBlock must also be the SourceRegion's +/// BranchBlock.
+/// +/// Before mergeCandidates: +/// +/// +---------------------------+ +/// | TargetRegion.BranchBlock | +/// +---------------------------+ +/// / | +/// / +--------------------------------+ +/// | | TargetRegion.FallThroughBlock | +/// \ +--------------------------------+ +/// \ | +/// +----------------------------------+ +/// | TargetRegion.BranchTargetBlock | +/// | SourceRegion.BranchBlock | +/// +----------------------------------+ +/// / | +/// / +--------------------------------+ +/// | | SourceRegion.FallThroughBlock | +/// \ +--------------------------------+ +/// \ | +/// +----------------------------------+ +/// | SourceRegion.BranchTargetBlock | +/// +----------------------------------+ +/// +/// After mergeCandidates: +/// +/// +-----------------------------+ +/// | TargetRegion.BranchBlock | +/// | SourceRegion.BranchBlock | +/// +-----------------------------+ +/// / | +/// / +---------------------------------+ +/// | | TargetRegion.FallThroughBlock | +/// | | SourceRegion.FallThroughBlock | +/// \ +---------------------------------+ +/// \ | +/// +----------------------------------+ +/// | SourceRegion.BranchTargetBlock | +/// +----------------------------------+ +/// +/// \param[in] SourceRegion The candidate to move blocks from +/// \param[in] TargetRegion The candidate to move blocks to +/// +bool PPCBranchCoalescing::mergeCandidates(CoalescingCandidateInfo &SourceRegion, + CoalescingCandidateInfo &TargetRegion) { + + if (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) { + llvm_unreachable("Cannot have both MustMoveDown and MustMoveUp set!"); + return false; + } + + if (!validateCandidates(SourceRegion, TargetRegion)) + return false; + + // Start the merging process by first handling the BranchBlock. + // Move any PHIs in SourceRegion.BranchBlock down to the branch-taken block + moveAndUpdatePHIs(SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock); + + // Move remaining instructions in SourceRegion.BranchBlock into + // TargetRegion.BranchBlock + MachineBasicBlock::iterator firstInstr = + SourceRegion.BranchBlock->getFirstNonPHI(); + MachineBasicBlock::iterator lastInstr = + SourceRegion.BranchBlock->getFirstTerminator(); + + MachineBasicBlock *Source = SourceRegion.MustMoveDown + ? SourceRegion.BranchTargetBlock + : TargetRegion.BranchBlock; + + MachineBasicBlock::iterator Target = + SourceRegion.MustMoveDown + ? SourceRegion.BranchTargetBlock->getFirstNonPHI() + : TargetRegion.BranchBlock->getFirstTerminator(); + + Source->splice(Target, SourceRegion.BranchBlock, firstInstr, lastInstr); + + // Once PHI and instructions have been moved we need to clean up the + // control flow. + + // Remove SourceRegion.FallThroughBlock before transferring successors of + // SourceRegion.BranchBlock to TargetRegion.BranchBlock. + SourceRegion.BranchBlock->removeSuccessor(SourceRegion.FallThroughBlock); + TargetRegion.BranchBlock->transferSuccessorsAndUpdatePHIs( + SourceRegion.BranchBlock); + // Update branch in TargetRegion.BranchBlock to jump to + // SourceRegion.BranchTargetBlock + // In this case, TargetRegion.BranchTargetBlock == SourceRegion.BranchBlock. 
TargetRegion.BranchBlock->ReplaceUsesOfBlockWith( + SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock); + // Remove the branch statement(s) in SourceRegion.BranchBlock + MachineBasicBlock::iterator I = + SourceRegion.BranchBlock->terminators().begin(); + while (I != SourceRegion.BranchBlock->terminators().end()) { + MachineInstr &CurrInst = *I; + ++I; + if (CurrInst.isBranch()) + CurrInst.eraseFromParent(); + } + + // Fall-through block should be empty since this is part of the condition + // to coalesce the branches. + assert(TargetRegion.FallThroughBlock->empty() && + "FallThroughBlocks should be empty!"); + + // Transfer successor information and move PHIs down to the + // branch-taken block. + TargetRegion.FallThroughBlock->transferSuccessorsAndUpdatePHIs( + SourceRegion.FallThroughBlock); + TargetRegion.FallThroughBlock->removeSuccessor(SourceRegion.BranchBlock); + + // Remove the blocks from the function. + assert(SourceRegion.BranchBlock->empty() && + "Expecting branch block to be empty!"); + SourceRegion.BranchBlock->eraseFromParent(); + + assert(SourceRegion.FallThroughBlock->empty() && + "Expecting fall-through block to be empty!\n"); + SourceRegion.FallThroughBlock->eraseFromParent(); + + NumBlocksCoalesced++; + return true; +} + +bool PPCBranchCoalescing::runOnMachineFunction(MachineFunction &MF) { + + if (skipFunction(MF.getFunction()) || MF.empty()) + return false; + + bool didSomething = false; + + DEBUG(dbgs() << "******** Branch Coalescing ********\n"); + initialize(MF); + + DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n"); + + CoalescingCandidateInfo Cand1, Cand2; + // Walk over blocks and find candidates to merge + // Continue trying to merge with the first candidate found, as long as merging + // is successful.
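+  // Illustration (block numbers hypothetical, mirroring the file header
+  // example): with %bb.0 -> {%bb.1, %bb.2} and %bb.2 -> {%bb.3, %bb.4},
+  // Cand1 is built around %bb.0 (branch target %bb.2) and Cand2 around
+  // %bb.2 (branch target %bb.4); if both branches test the same condition,
+  // mergeCandidates(Cand2, Cand1) folds %bb.2 away and the do/while loop
+  // retries from the same MBB.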
+ for (MachineBasicBlock &MBB : MF) { + bool MergedCandidates = false; + do { + MergedCandidates = false; + Cand1.clear(); + Cand2.clear(); + + Cand1.BranchBlock = &MBB; + + // If unable to coalesce the branch, then continue to next block + if (!canCoalesceBranch(Cand1)) + break; + + Cand2.BranchBlock = Cand1.BranchTargetBlock; + if (!canCoalesceBranch(Cand2)) + break; + + // Sanity check + // The branch-taken block of the second candidate should post-dominate the + // first candidate + assert(MPDT->dominates(Cand2.BranchTargetBlock, Cand1.BranchBlock) && + "Branch-taken block should post-dominate first candidate"); + + if (!identicalOperands(Cand1.Cond, Cand2.Cond)) { + DEBUG(dbgs() << "Blocks " << Cand1.BranchBlock->getNumber() << " and " + << Cand2.BranchBlock->getNumber() + << " have different branches\n"); + break; + } + if (!canMerge(Cand2, Cand1)) { + DEBUG(dbgs() << "Cannot merge blocks " << Cand1.BranchBlock->getNumber() + << " and " << Cand2.BranchBlock->getNumber() << "\n"); + NumBlocksNotCoalesced++; + continue; + } + DEBUG(dbgs() << "Merging blocks " << Cand1.BranchBlock->getNumber() + << " and " << Cand1.BranchTargetBlock->getNumber() << "\n"); + MergedCandidates = mergeCandidates(Cand2, Cand1); + if (MergedCandidates) + didSomething = true; + + DEBUG(dbgs() << "Function after merging: "; MF.dump(); dbgs() << "\n"); + } while (MergedCandidates); + } + +#ifndef NDEBUG + // Verify MF is still valid after branch coalescing + if (didSomething) + MF.verify(nullptr, "Error in code produced by branch coalescing"); +#endif // NDEBUG + + DEBUG(dbgs() << "Finished Branch Coalescing\n"); + return didSomething; +} diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp index d0b66f9bca09a..64b8f1168beb8 100644 --- a/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -23,9 +23,9 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "ppc-branch-select" diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp index 53f33ac1fc0ed..fc638829378ab 100644 --- a/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -26,12 +26,17 @@ #include "PPC.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" +#include "PPCTargetTransformInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" @@ -64,6 +69,13 @@ using namespace llvm; static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1)); #endif +// The latency of mtctr is only justified if there are more than 4 +// comparisons that will be removed as a result.
+static cl::opt<unsigned> +SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden, + cl::desc("Loops with a constant trip count smaller than " + "this value will not use the count register.")); + STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops"); namespace llvm { @@ -95,6 +107,8 @@ namespace { AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } private: @@ -107,10 +121,12 @@ namespace { const PPCTargetLowering *TLI; const DataLayout *DL; const TargetLibraryInfo *LibInfo; + const TargetTransformInfo *TTI; LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; bool PreserveLCSSA; + TargetSchedModel SchedModel; }; char PPCCTRLoops::ID = 0; @@ -179,6 +195,7 @@ bool PPCCTRLoops::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); DL = &F.getParent()->getDataLayout(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); LibInfo = TLIP ? &TLIP->getTLI() : nullptr; @@ -243,8 +260,8 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) { if (CallInst *CI = dyn_cast<CallInst>(J)) { // Inline ASM is okay, unless it clobbers the ctr register. if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) { - if (asmClobbersCTR(IA)) - return true; + if (asmClobbersCTR(IA)) + return true; continue; } @@ -462,10 +479,24 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) { return false; } - bool PPCCTRLoops::convertToCTRLoop(Loop *L) { bool MadeChange = false; + // Do not convert small short loops to a CTR loop. + unsigned ConstTripCount = SE->getSmallConstantTripCount(L); + if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) { + SmallPtrSet<const Value *, 4> EphValues; + auto AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache( + *L->getHeader()->getParent()); + CodeMetrics::collectEphemeralValues(L, &AC, EphValues); + CodeMetrics Metrics; + for (BasicBlock *BB : L->blocks()) + Metrics.analyzeBasicBlock(BB, *TTI, EphValues); + // 6 is an approximate latency for the mtctr instruction; for example, + // with an issue width of 4, loops of up to 24 instructions keep their + // normal branches. + if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth())) + return false; + } + // Process nested loops first.
for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) { MadeChange |= convertToCTRLoop(*I); @@ -659,12 +690,11 @@ check_block: } if (I != BI && clobbersCTR(*I)) { - DEBUG(dbgs() << "BB#" << MBB->getNumber() << " (" << - MBB->getFullName() << ") instruction " << *I << - " clobbers CTR, invalidating " << "BB#" << - BI->getParent()->getNumber() << " (" << - BI->getParent()->getFullName() << ") instruction " << - *BI << "\n"); + DEBUG(dbgs() << printMBBReference(*MBB) << " (" << MBB->getFullName() + << ") instruction " << *I << " clobbers CTR, invalidating " + << printMBBReference(*BI->getParent()) << " (" + << BI->getParent()->getFullName() << ") instruction " << *BI + << "\n"); return false; } @@ -678,10 +708,10 @@ check_block: if (CheckPreds) { queue_preds: if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) { - DEBUG(dbgs() << "Unable to find a MTCTR instruction for BB#" << - BI->getParent()->getNumber() << " (" << - BI->getParent()->getFullName() << ") instruction " << - *BI << "\n"); + DEBUG(dbgs() << "Unable to find a MTCTR instruction for " + << printMBBReference(*BI->getParent()) << " (" + << BI->getParent()->getFullName() << ") instruction " << *BI + << "\n"); return false; } diff --git a/lib/Target/PowerPC/PPCEarlyReturn.cpp b/lib/Target/PowerPC/PPCEarlyReturn.cpp index 811e4dd9dfe16..1699463c0a4bc 100644 --- a/lib/Target/PowerPC/PPCEarlyReturn.cpp +++ b/lib/Target/PowerPC/PPCEarlyReturn.cpp @@ -173,7 +173,7 @@ protected: public: bool runOnMachineFunction(MachineFunction &MF) override { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; TII = MF.getSubtarget().getInstrInfo(); diff --git a/lib/Target/PowerPC/PPCExpandISEL.cpp b/lib/Target/PowerPC/PPCExpandISEL.cpp index 41e3190c3eec7..b00e98b63e346 100644 --- a/lib/Target/PowerPC/PPCExpandISEL.cpp +++ b/lib/Target/PowerPC/PPCExpandISEL.cpp @@ -59,6 +59,8 @@ class PPCExpandISEL : public MachineFunctionPass { typedef SmallDenseMap<int, BlockISELList> ISELInstructionList; // A map of MBB numbers to their lists of contained ISEL instructions. + // Please note when we traverse this list and expand ISEL, we only remove + // the ISEL from the MBB, not from this list. ISELInstructionList ISELInstructions; /// Initialize the object. @@ -124,9 +126,6 @@ public: #endif bool runOnMachineFunction(MachineFunction &MF) override { - if (!isExpandISELEnabled(MF)) - return false; - DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n"); initialize(MF); @@ -171,7 +170,7 @@ bool PPCExpandISEL::collectISELInstructions() { #ifndef NDEBUG void PPCExpandISEL::DumpISELInstructions() const { for (const auto &I : ISELInstructions) { - DEBUG(dbgs() << "BB#" << I.first << ":\n"); + DEBUG(dbgs() << printMBBReference(*MF->getBlockNumbered(I.first)) << ":\n"); for (const auto &VI : I.second) DEBUG(dbgs() << " "; VI->print(dbgs())); } @@ -190,26 +189,71 @@ bool PPCExpandISEL::canMerge(MachineInstr *PrevPushedMI, MachineInstr *MI) { } void PPCExpandISEL::expandAndMergeISELs() { + bool ExpandISELEnabled = isExpandISELEnabled(*MF); + for (auto &BlockList : ISELInstructions) { - DEBUG(dbgs() << "Expanding ISEL instructions in BB#" << BlockList.first + DEBUG(dbgs() << "Expanding ISEL instructions in " + << printMBBReference(*MF->getBlockNumbered(BlockList.first)) + << "\n"); - BlockISELList &CurrentISELList = BlockList.second; auto I = CurrentISELList.begin(); auto E = CurrentISELList.end(); while (I != E) { - BlockISELList SubISELList; - - SubISELList.push_back(*I++); - - // Collect the ISELs that can be merged together.
- while (I != E && canMerge(SubISELList.back(), *I)) + assert(isISEL(**I) && "Expecting an ISEL instruction"); + MachineOperand &Dest = (*I)->getOperand(0); + MachineOperand &TrueValue = (*I)->getOperand(1); + MachineOperand &FalseValue = (*I)->getOperand(2); + + // Special case 1, all registers used by ISEL are the same one. + // The non-redundant isel 0, 0, 0, N would not satisfy these conditions + // as it would be ISEL %R0, %ZERO, %R0, %CRN. + if (useSameRegister(Dest, TrueValue) && + useSameRegister(Dest, FalseValue)) { + DEBUG(dbgs() << "Remove redundant ISEL instruction: " << **I << "\n"); + // FIXME: if the CR field used has no other uses, we could eliminate the + // instruction that defines it. This would have to be done manually + // since this pass runs too late to run DCE after it. + NumRemoved++; + (*I)->eraseFromParent(); + I++; + } else if (useSameRegister(TrueValue, FalseValue)) { + // Special case 2, the two input registers used by ISEL are the same. + // Note: the non-foldable isel RX, 0, 0, N would not satisfy this + // condition as it would be ISEL %RX, %ZERO, %R0, %CRN, which makes it + // safe to fold ISEL to MR(OR) instead of ADDI. + MachineBasicBlock *MBB = (*I)->getParent(); + DEBUG(dbgs() << "Fold the ISEL instruction to an unconditional copy:\n"); + DEBUG(dbgs() << "ISEL: " << **I << "\n"); + NumFolded++; + // Note: we're using both the TrueValue and FalseValue operands so as + // not to lose the kill flag if it is set on either of them. + BuildMI(*MBB, (*I), dl, TII->get(isISEL8(**I) ? PPC::OR8 : PPC::OR)) + .add(Dest) + .add(TrueValue) + .add(FalseValue); + (*I)->eraseFromParent(); + I++; + } else if (ExpandISELEnabled) { // Normal cases expansion enabled + DEBUG(dbgs() << "Expand ISEL instructions:\n"); + DEBUG(dbgs() << "ISEL: " << **I << "\n"); + BlockISELList SubISELList; SubISELList.push_back(*I++); - + // Collect the ISELs that can be merged together. + // This will eat up ISEL instructions without considering whether they + // may be redundant or foldable to a register copy. So we still keep + // the handleSpecialCases() downstream to handle them. + while (I != E && canMerge(SubISELList.back(), *I)) { + DEBUG(dbgs() << "ISEL: " << **I << "\n"); + SubISELList.push_back(*I++); + } + + expandMergeableISELs(SubISELList); + } else { // Normal cases expansion disabled + I++; // leave the ISEL as it is + } + } // end while + } // end for } void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL, @@ -232,13 +276,15 @@ void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL, // Similarly, if at least one of the ISEL instructions satisfies the // following condition, we need the False Block: // The Dest Register and False Value Register are not the same. - bool IsADDIInstRequired = !useSameRegister(Dest, TrueValue); bool IsORIInstRequired = !useSameRegister(Dest, FalseValue); // Special case 1, all registers used by ISEL are the same one. if (!IsADDIInstRequired && !IsORIInstRequired) { DEBUG(dbgs() << "Remove redundant ISEL instruction."); + // FIXME: if the CR field used has no other uses, we could eliminate the + // instruction that defines it. This would have to be done manually + // since this pass runs too late to run DCE after it. NumRemoved++; (*MI)->eraseFromParent(); // Setting MI to the erase result keeps the iterator valid and increased. @@ -253,14 +299,15 @@ void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL, // PPC::ZERO8 will be used for the first operand if the value is meant to // be zero.
In this case, the useSameRegister method will return false, + // thereby preventing this ISEL from being folded. - if (useSameRegister(TrueValue, FalseValue) && (BIL.size() == 1)) { DEBUG(dbgs() << "Fold the ISEL instruction to an unconditional copy."); NumFolded++; - BuildMI(*MBB, (*MI), dl, TII->get(isISEL8(**MI) ? PPC::ADDI8 : PPC::ADDI)) + // Note: we're using both the TrueValue and FalseValue operands so as + // not to lose the kill flag if it is set on either of them. + BuildMI(*MBB, (*MI), dl, TII->get(isISEL8(**MI) ? PPC::OR8 : PPC::OR)) .add(Dest) .add(TrueValue) - .add(MachineOperand::CreateImm(0)); + .add(FalseValue); (*MI)->eraseFromParent(); // Setting MI to the erase result keeps the iterator valid and increased. MI = BIL.erase(MI); diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index bc9957194f6dd..402e29cdff726 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -29,6 +29,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" @@ -36,7 +37,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" //===----------------------------------------------------------------------===// @@ -1930,7 +1930,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { PPCFuncInfo->setUsesTOCBasePtr(); // For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)). - if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) { + if (CModel == CodeModel::Small) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocCPT), TmpReg) .addConstantPoolIndex(Idx).addReg(PPC::X2); @@ -1981,7 +1981,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { PPCFuncInfo->setUsesTOCBasePtr(); // For small code model, generate a simple TOC load. - if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) + if (CModel == CodeModel::Small) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtoc), DestReg) .addGlobalAddress(GV) @@ -1991,9 +1991,9 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { // or externally available linkage, a non-local function address, or a // jump table address (not yet needed), or if we are generating code // for large code model, we generate: - // LDtocL(GV, ADDIStocHA(%X2, GV)) + // LDtocL(GV, ADDIStocHA(%x2, GV)) // Otherwise we generate: - // ADDItocL(ADDIStocHA(%X2, GV), GV) + // ADDItocL(ADDIStocHA(%x2, GV), GV) // Either way, start with the ADDIStocHA: unsigned HighPartReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA), diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index b49c3345a17dd..c870a2256691e 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -312,11 +312,9 @@ static void HandleVRSaveUpdate(MachineInstr &MI, const TargetInstrInfo &TII) { // Live in and live out values already must be in the mask, so don't bother // marking them.
- for (MachineRegisterInfo::livein_iterator - I = MF->getRegInfo().livein_begin(), - E = MF->getRegInfo().livein_end(); I != E; ++I) { - unsigned RegNo = TRI->getEncodingValue(I->first); - if (VRRegNo[RegNo] == I->first) // If this really is a vector reg. + for (std::pair<unsigned, unsigned> LI : MF->getRegInfo().liveins()) { + unsigned RegNo = TRI->getEncodingValue(LI.first); + if (VRRegNo[RegNo] == LI.first) // If this really is a vector reg. UsedRegMask &= ~(1 << (31-RegNo)); // Doesn't need to be marked. } @@ -436,7 +434,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF, const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned LR = RegInfo->getRARegister(); - bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone); + bool DisableRedZone = MF.getFunction().hasFnAttribute(Attribute::NoRedZone); bool CanUseRedZone = !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. !MustSaveLR(MF, LR) && // No need to save LR. @@ -501,7 +499,7 @@ bool PPCFrameLowering::needsFP(const MachineFunction &MF) const { // Naked functions have no stack frame pushed, so we don't have a frame // pointer. - if (MF.getFunction()->hasFnAttribute(Attribute::Naked)) + if (MF.getFunction().hasFnAttribute(Attribute::Naked)) return false; return MF.getTarget().Options.DisableFramePointerElim(MF) || @@ -694,7 +692,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); DebugLoc dl; bool needsCFI = MMI.hasDebugInfo() || - MF.getFunction()->needsUnwindTableEntry(); + MF.getFunction().needsUnwindTableEntry(); // Get processor type. bool isPPC64 = Subtarget.isPPC64(); @@ -1507,7 +1505,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, unsigned RetOpcode = MBBI->getOpcode(); if (MF.getTarget().Options.GuaranteedTailCallOpt && (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) && - MF.getFunction().getCallingConv() == CallingConv::Fast) { + MF.getFunction().getCallingConv() == CallingConv::Fast) { PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); unsigned CallerAllocatedAmt = FI->getMinReservedArea(); @@ -2067,7 +2065,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, bool PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const { // Currently, this function only handles SVR4 32- and 64-bit ABIs.
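One recurring change across these hunks deserves a standalone note: MachineFunction::getFunction() now returns a reference rather than a pointer, so every call site switches from `->` to `.`. A minimal before/after sketch (the attribute query is the one from determineFrameLayout above; surrounding code elided):

    // Before this import, getFunction() returned const Function *:
    //   bool DisableRedZone =
    //       MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
    // After, it returns const Function &, so member access uses '.':
    bool DisableRedZone = MF.getFunction().hasFnAttribute(Attribute::NoRedZone);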
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index 28b0c57f0ffb5..f845d5a9ac64a 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -15,7 +15,7 @@ #include "PPC.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -106,7 +106,7 @@ public: bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const override; /// targetHandlesStackFrameRounding - Returns true if the target is diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 901539b682baa..d3a223fe03e0f 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -36,6 +36,8 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DebugLoc.h" @@ -53,8 +55,6 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetRegisterInfo.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -101,6 +101,29 @@ static cl::opt<bool> EnableBranchHint( cl::desc("Enable static hinting of branches on ppc"), cl::Hidden); +enum ICmpInGPRType { ICGPR_All, ICGPR_None, ICGPR_I32, ICGPR_I64, + ICGPR_NonExtIn, ICGPR_Zext, ICGPR_Sext, ICGPR_ZextI32, + ICGPR_SextI32, ICGPR_ZextI64, ICGPR_SextI64 }; + +static cl::opt<ICmpInGPRType> CmpInGPR( + "ppc-gpr-icmps", cl::Hidden, cl::init(ICGPR_All), + cl::desc("Specify the types of comparisons to emit GPR-only code for."), + cl::values(clEnumValN(ICGPR_None, "none", "Do not modify integer comparisons."), + clEnumValN(ICGPR_All, "all", "All possible int comparisons in GPRs."), + clEnumValN(ICGPR_I32, "i32", "Only i32 comparisons in GPRs."), + clEnumValN(ICGPR_I64, "i64", "Only i64 comparisons in GPRs."), + clEnumValN(ICGPR_NonExtIn, "nonextin", + "Only comparisons where inputs don't need [sz]ext."), + clEnumValN(ICGPR_Zext, "zext", "Only comparisons with zext result."), + clEnumValN(ICGPR_ZextI32, "zexti32", + "Only i32 comparisons with zext result."), + clEnumValN(ICGPR_ZextI64, "zexti64", + "Only i64 comparisons with zext result."), + clEnumValN(ICGPR_Sext, "sext", "Only comparisons with sext result."), + clEnumValN(ICGPR_SextI32, "sexti32", + "Only i32 comparisons with sext result."), + clEnumValN(ICGPR_SextI64, "sexti64", + "Only i64 comparisons with sext result."))); namespace { //===--------------------------------------------------------------------===// @@ -133,6 +156,12 @@ namespace { void PreprocessISelDAG() override; void PostprocessISelDAG() override; + /// getI16Imm - Return a target constant with the specified value, of type + /// i16. + inline SDValue getI16Imm(unsigned Imm, const SDLoc &dl) { + return CurDAG->getTargetConstant(Imm, dl, MVT::i16); + } + /// getI32Imm - Return a target constant with the specified value, of type /// i32.
inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) { @@ -168,6 +197,7 @@ namespace { bool tryBitfieldInsert(SDNode *N); bool tryBitPermutation(SDNode *N); + bool tryIntCompareInGPR(SDNode *N); /// SelectCC - Select a comparison of the specified values with the /// specified condition code, returning the CR# of the expression. @@ -270,34 +300,7 @@ namespace { #include "PPCGenDAGISel.inc" private: - // Conversion type for interpreting results of a 32-bit instruction as - // a 64-bit value or vice versa. - enum ExtOrTruncConversion { Ext, Trunc }; - - // Modifiers to guide how an ISD::SETCC node's result is to be computed - // in a GPR. - // ZExtOrig - use the original condition code, zero-extend value - // ZExtInvert - invert the condition code, zero-extend value - // SExtOrig - use the original condition code, sign-extend value - // SExtInvert - invert the condition code, sign-extend value - enum SetccInGPROpts { ZExtOrig, ZExtInvert, SExtOrig, SExtInvert }; - bool trySETCC(SDNode *N); - bool tryEXTEND(SDNode *N); - bool tryLogicOpOfCompares(SDNode *N); - SDValue computeLogicOpInGPR(SDValue LogicOp); - SDValue signExtendInputIfNeeded(SDValue Input); - SDValue zeroExtendInputIfNeeded(SDValue Input); - SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv); - SDValue get32BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, - int64_t RHSValue, SDLoc dl); - SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, - int64_t RHSValue, SDLoc dl); - SDValue get64BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, - int64_t RHSValue, SDLoc dl); - SDValue get64BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, - int64_t RHSValue, SDLoc dl); - SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts); void PeepholePPC64(); void PeepholePPC64ZExt(); @@ -388,7 +391,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { // Insert the set of GlobalBaseReg into the first MBB of the function MachineBasicBlock &FirstMBB = MF->front(); MachineBasicBlock::iterator MBBI = FirstMBB.begin(); - const Module *M = MF->getFunction()->getParent(); + const Module *M = MF->getFunction().getParent(); DebugLoc dl; if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) == MVT::i32) { @@ -450,6 +453,12 @@ static bool isInt32Immediate(SDValue N, unsigned &Imm) { return isInt32Immediate(N.getNode(), Imm); } +/// isInt64Immediate - This method tests to see if the value is a 64-bit +/// constant operand. If so, Imm will receive the 64-bit value. +static bool isInt64Immediate(SDValue N, uint64_t &Imm) { + return isInt64Immediate(N.getNode(), Imm); +} + static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo, const SDValue &DestMBB) { assert(isa<BasicBlockSDNode>(DestMBB)); @@ -607,8 +616,6 @@ bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) { unsigned MB, ME; if (isRunOfOnes(InsertMask, MB, ME)) { - SDValue Tmp1, Tmp2; - if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) && isInt32Immediate(Op1.getOperand(1), Value)) { Op1 = Op1.getOperand(0); @@ -643,8 +650,8 @@ bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) { } // Predict the number of instructions that would be generated by calling -// getInt64(N). -static unsigned getInt64CountDirect(int64_t Imm) { +// selectI64Imm(N). +static unsigned selectI64ImmInstrCountDirect(int64_t Imm) { // Assume no remaining bits. unsigned Remainder = 0; // Assume no shift required.
@@ -712,8 +719,8 @@ static uint64_t Rot64(uint64_t Imm, unsigned R) { return (Imm << R) | (Imm >> (64 - R)); } -static unsigned getInt64Count(int64_t Imm) { - unsigned Count = getInt64CountDirect(Imm); +static unsigned selectI64ImmInstrCount(int64_t Imm) { + unsigned Count = selectI64ImmInstrCountDirect(Imm); // If the instruction count is 1 or 2, we do not need further analysis // since rotate + load constant requires at least 2 instructions. @@ -722,10 +729,10 @@ static unsigned getInt64Count(int64_t Imm) { for (unsigned r = 1; r < 63; ++r) { uint64_t RImm = Rot64(Imm, r); - unsigned RCount = getInt64CountDirect(RImm) + 1; + unsigned RCount = selectI64ImmInstrCountDirect(RImm) + 1; Count = std::min(Count, RCount); - // See comments in getInt64 for an explanation of the logic below. + // See comments in selectI64Imm for an explanation of the logic below. unsigned LS = findLastSet(RImm); if (LS != r-1) continue; @@ -733,17 +740,17 @@ static unsigned getInt64Count(int64_t Imm) { uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1)); uint64_t RImmWithOnes = RImm | OnesMask; - RCount = getInt64CountDirect(RImmWithOnes) + 1; + RCount = selectI64ImmInstrCountDirect(RImmWithOnes) + 1; Count = std::min(Count, RCount); } return Count; } -// Select a 64-bit constant. For cost-modeling purposes, getInt64Count +// Select a 64-bit constant. For cost-modeling purposes, selectI64ImmInstrCount // (above) needs to be kept in sync with this function. -static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl, - int64_t Imm) { +static SDNode *selectI64ImmDirect(SelectionDAG *CurDAG, const SDLoc &dl, + int64_t Imm) { // Assume no remaining bits. unsigned Remainder = 0; // Assume no shift required. @@ -779,8 +786,10 @@ static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl, // Simple value. if (isInt<16>(Imm)) { + uint64_t SextImm = SignExtend64(Lo, 16); + SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64); // Just the Lo bits. - Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo)); + Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, SDImm); } else if (Lo) { // Handle the Hi bits. unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8; @@ -825,13 +834,14 @@ static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl, return Result; } -static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) { - unsigned Count = getInt64CountDirect(Imm); +static SDNode *selectI64Imm(SelectionDAG *CurDAG, const SDLoc &dl, + int64_t Imm) { + unsigned Count = selectI64ImmInstrCountDirect(Imm); // If the instruction count is 1 or 2, we do not need further analysis // since rotate + load constant requires at least 2 instructions. 
if (Count <= 2) - return getInt64Direct(CurDAG, dl, Imm); + return selectI64ImmDirect(CurDAG, dl, Imm); unsigned RMin = 0; @@ -840,7 +850,7 @@ static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) { for (unsigned r = 1; r < 63; ++r) { uint64_t RImm = Rot64(Imm, r); - unsigned RCount = getInt64CountDirect(RImm) + 1; + unsigned RCount = selectI64ImmInstrCountDirect(RImm) + 1; if (RCount < Count) { Count = RCount; RMin = r; @@ -863,7 +873,7 @@ uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1)); uint64_t RImmWithOnes = RImm | OnesMask; - RCount = getInt64CountDirect(RImmWithOnes) + 1; + RCount = selectI64ImmInstrCountDirect(RImmWithOnes) + 1; if (RCount < Count) { Count = RCount; RMin = r; @@ -873,24 +883,86 @@ } if (!RMin) - return getInt64Direct(CurDAG, dl, Imm); + return selectI64ImmDirect(CurDAG, dl, Imm); auto getI32Imm = [CurDAG, dl](unsigned Imm) { return CurDAG->getTargetConstant(Imm, dl, MVT::i32); }; - SDValue Val = SDValue(getInt64Direct(CurDAG, dl, MatImm), 0); + SDValue Val = SDValue(selectI64ImmDirect(CurDAG, dl, MatImm), 0); return CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Val, getI32Imm(64 - RMin), getI32Imm(MaskEnd)); } +static unsigned allUsesTruncate(SelectionDAG *CurDAG, SDNode *N) { + unsigned MaxTruncation = 0; + // Cannot use range-based for loop here as we need the actual use (i.e., we + // need the operand number corresponding to the use). A range-based for + // will unbox the use and provide an SDNode*. + for (SDNode::use_iterator Use = N->use_begin(), UseEnd = N->use_end(); + Use != UseEnd; ++Use) { + unsigned Opc = + Use->isMachineOpcode() ? Use->getMachineOpcode() : Use->getOpcode(); + switch (Opc) { + default: return 0; + case ISD::TRUNCATE: + if (Use->isMachineOpcode()) + return 0; + MaxTruncation = + std::max(MaxTruncation, Use->getValueType(0).getSizeInBits()); + continue; + case ISD::STORE: { + if (Use->isMachineOpcode()) + return 0; + StoreSDNode *STN = cast<StoreSDNode>(*Use); + unsigned MemVTSize = STN->getMemoryVT().getSizeInBits(); + if (MemVTSize == 64 || Use.getOperandNo() != 0) + return 0; + MaxTruncation = std::max(MaxTruncation, MemVTSize); + continue; + } + case PPC::STW8: + case PPC::STWX8: + case PPC::STWU8: + case PPC::STWUX8: + if (Use.getOperandNo() != 0) + return 0; + MaxTruncation = std::max(MaxTruncation, 32u); + continue; + case PPC::STH8: + case PPC::STHX8: + case PPC::STHU8: + case PPC::STHUX8: + if (Use.getOperandNo() != 0) + return 0; + MaxTruncation = std::max(MaxTruncation, 16u); + continue; + case PPC::STB8: + case PPC::STBX8: + case PPC::STBU8: + case PPC::STBUX8: + if (Use.getOperandNo() != 0) + return 0; + MaxTruncation = std::max(MaxTruncation, 8u); + continue; + } + } + return MaxTruncation; +} + // Select a 64-bit constant. -static SDNode *getInt64(SelectionDAG *CurDAG, SDNode *N) { +static SDNode *selectI64Imm(SelectionDAG *CurDAG, SDNode *N) { SDLoc dl(N); // Get 64 bit value.
int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue(); - return getInt64(CurDAG, dl, Imm); + if (unsigned MinSize = allUsesTruncate(CurDAG, N)) { + uint64_t SextImm = SignExtend64(Imm, MinSize); + SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64); + if (isInt<16>(SextImm)) + return CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, SDImm); + } + return selectI64Imm(CurDAG, dl, Imm); } namespace { @@ -1090,6 +1162,25 @@ class BitPermutationSelector { return std::make_pair(Interesting = true, &Bits); } + case ISD::ZERO_EXTEND: { + // We support only the case with zero extension from i32 to i64 so far. + if (V.getValueType() != MVT::i64 || + V.getOperand(0).getValueType() != MVT::i32) + break; + + const SmallVector<ValueBit, 64> *LHSBits; + const unsigned NumOperandBits = 32; + std::tie(Interesting, LHSBits) = getValueBits(V.getOperand(0), + NumOperandBits); + + for (unsigned i = 0; i < NumOperandBits; ++i) + Bits[i] = (*LHSBits)[i]; + + for (unsigned i = NumOperandBits; i < NumBits; ++i) + Bits[i] = ValueBit(ValueBit::ConstZero); + + return std::make_pair(Interesting, &Bits); + } } for (unsigned i = 0; i < NumBits; ++i) @@ -1351,6 +1442,24 @@ class BitPermutationSelector { return ~Mask; } + // This method extends an input value to 64 bit if input is 32-bit integer. + // While selecting instructions in BitPermutationSelector in 64-bit mode, + // an input value can be a 32-bit integer if a ZERO_EXTEND node is included. + // In such case, we extend it to 64 bit to be consistent with other values. + SDValue ExtendToInt64(SDValue V, const SDLoc &dl) { + if (V.getValueSizeInBits() == 64) + return V; + + assert(V.getValueSizeInBits() == 32); + SDValue SubRegIdx = CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32); + SDValue ImDef = SDValue(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, + MVT::i64), 0); + SDValue ExtVal = SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, + MVT::i64, ImDef, V, + SubRegIdx), 0); + return ExtVal; + } + // Depending on the number of groups for a particular value, it might be // better to rotate, mask explicitly (using andi/andis), and then or the // result. Select this part of the result first.
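For the new ISD::ZERO_EXTEND case above, the bit bookkeeping is simple: the low 32 result bits are forwarded operand bits and the high 32 are ValueBit::ConstZero. A toy model of that provenance table (std::pair stands in for ValueBit; purely illustrative):

#include <array>
#include <cassert>
#include <utility>

int main() {
  // {IsConstZero, SourceBit}: a stand-in for one ValueBit entry.
  std::array<std::pair<bool, unsigned>, 64> Bits;
  for (unsigned i = 0; i < 32; ++i)
    Bits[i] = {false, i};   // forwarded from bit i of the i32 operand
  for (unsigned i = 32; i < 64; ++i)
    Bits[i] = {true, 0};    // ValueBit::ConstZero
  assert(!Bits[5].first && Bits[5].second == 5);
  assert(Bits[40].first); // rotates/masks of these stay analyzable
}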
@@ -1567,27 +1676,30 @@ class BitPermutationSelector { assert(InstMaskStart >= 32 && "Mask cannot start out of range"); assert(InstMaskEnd >= 32 && "Mask cannot end out of range"); SDValue Ops[] = - { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart - 32, dl), - getI32Imm(InstMaskEnd - 32, dl) }; + { ExtendToInt64(V, dl), getI32Imm(RLAmt, dl), + getI32Imm(InstMaskStart - 32, dl), getI32Imm(InstMaskEnd - 32, dl) }; return SDValue(CurDAG->getMachineNode(PPC::RLWINM8, dl, MVT::i64, Ops), 0); } if (InstMaskEnd == 63) { SDValue Ops[] = - { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) }; + { ExtendToInt64(V, dl), getI32Imm(RLAmt, dl), + getI32Imm(InstMaskStart, dl) }; return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Ops), 0); } if (InstMaskStart == 0) { SDValue Ops[] = - { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskEnd, dl) }; + { ExtendToInt64(V, dl), getI32Imm(RLAmt, dl), + getI32Imm(InstMaskEnd, dl) }; return SDValue(CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Ops), 0); } if (InstMaskEnd == 63 - RLAmt) { SDValue Ops[] = - { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) }; + { ExtendToInt64(V, dl), getI32Imm(RLAmt, dl), + getI32Imm(InstMaskStart, dl) }; return SDValue(CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, Ops), 0); } @@ -1628,15 +1740,16 @@ class BitPermutationSelector { assert(InstMaskStart >= 32 && "Mask cannot start out of range"); assert(InstMaskEnd >= 32 && "Mask cannot end out of range"); SDValue Ops[] = - { Base, V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart - 32, dl), - getI32Imm(InstMaskEnd - 32, dl) }; + { ExtendToInt64(Base, dl), ExtendToInt64(V, dl), getI32Imm(RLAmt, dl), + getI32Imm(InstMaskStart - 32, dl), getI32Imm(InstMaskEnd - 32, dl) }; return SDValue(CurDAG->getMachineNode(PPC::RLWIMI8, dl, MVT::i64, Ops), 0); } if (InstMaskEnd == 63 - RLAmt) { SDValue Ops[] = - { Base, V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) }; + { ExtendToInt64(Base, dl), ExtendToInt64(V, dl), getI32Imm(RLAmt, dl), + getI32Imm(InstMaskStart, dl) }; return SDValue(CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops), 0); } @@ -1730,7 +1843,7 @@ class BitPermutationSelector { NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) + (unsigned) (ANDIMask != 0 && ANDISMask != 0); else - NumAndInsts += getInt64Count(Mask) + /* and */ 1; + NumAndInsts += selectI64ImmInstrCount(Mask) + /* and */ 1; unsigned NumRLInsts = 0; bool FirstBG = true; @@ -1786,10 +1899,14 @@ class BitPermutationSelector { SDValue ANDIVal, ANDISVal; if (ANDIMask != 0) ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64, - VRot, getI32Imm(ANDIMask, dl)), 0); + ExtendToInt64(VRot, dl), + getI32Imm(ANDIMask, dl)), + 0); if (ANDISMask != 0) ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64, - VRot, getI32Imm(ANDISMask, dl)), 0); + ExtendToInt64(VRot, dl), + getI32Imm(ANDISMask, dl)), + 0); if (!ANDIVal) TotalVal = ANDISVal; @@ -1797,19 +1914,21 @@ class BitPermutationSelector { TotalVal = ANDIVal; else TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, - ANDIVal, ANDISVal), 0); + ExtendToInt64(ANDIVal, dl), ANDISVal), 0); } else { - TotalVal = SDValue(getInt64(CurDAG, dl, Mask), 0); + TotalVal = SDValue(selectI64Imm(CurDAG, dl, Mask), 0); TotalVal = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, - VRot, TotalVal), 0); + ExtendToInt64(VRot, dl), TotalVal), + 0); } if (!Res) Res = TotalVal; else Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, - Res, TotalVal), 0); + ExtendToInt64(Res, dl), 
TotalVal), + 0); // Now, remove all groups with this underlying value and rotation // factor. @@ -1929,10 +2048,10 @@ class BitPermutationSelector { SDValue ANDIVal, ANDISVal; if (ANDIMask != 0) ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64, - Res, getI32Imm(ANDIMask, dl)), 0); + ExtendToInt64(Res, dl), getI32Imm(ANDIMask, dl)), 0); if (ANDISMask != 0) ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64, - Res, getI32Imm(ANDISMask, dl)), 0); + ExtendToInt64(Res, dl), getI32Imm(ANDISMask, dl)), 0); if (!ANDIVal) Res = ANDISVal; @@ -1940,14 +2059,14 @@ class BitPermutationSelector { Res = ANDIVal; else Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, - ANDIVal, ANDISVal), 0); + ExtendToInt64(ANDIVal, dl), ANDISVal), 0); } else { - if (InstCnt) *InstCnt += getInt64Count(Mask) + /* and */ 1; + if (InstCnt) *InstCnt += selectI64ImmInstrCount(Mask) + /* and */ 1; - SDValue MaskVal = SDValue(getInt64(CurDAG, dl, Mask), 0); + SDValue MaskVal = SDValue(selectI64Imm(CurDAG, dl, Mask), 0); Res = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, - Res, MaskVal), 0); + ExtendToInt64(Res, dl), MaskVal), 0); } } @@ -2046,962 +2165,1658 @@ public: } }; -} // end anonymous namespace - -bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) { - if (N->getValueType(0) != MVT::i32 && - N->getValueType(0) != MVT::i64) - return false; - - if (!UseBitPermRewriter) - return false; +class IntegerCompareEliminator { + SelectionDAG *CurDAG; + PPCDAGToDAGISel *S; + // Conversion type for interpreting results of a 32-bit instruction as + // a 64-bit value or vice versa. + enum ExtOrTruncConversion { Ext, Trunc }; + + // Modifiers to guide how an ISD::SETCC node's result is to be computed + // in a GPR. + // ZExtOrig - use the original condition code, zero-extend value + // ZExtInvert - invert the condition code, zero-extend value + // SExtOrig - use the original condition code, sign-extend value + // SExtInvert - invert the condition code, sign-extend value + enum SetccInGPROpts { ZExtOrig, ZExtInvert, SExtOrig, SExtInvert }; + + // Comparisons against zero to emit GPR code sequences for. Each of these + // sequences may need to be emitted for two or more equivalent patterns. + // For example (a >= 0) == (a > -1). The direction of the comparison () + // matters as well as the extension type: sext (-1/0), zext (1/0). 
+ // GEZExt - (zext (LHS >= 0)) + // GESExt - (sext (LHS >= 0)) + // LEZExt - (zext (LHS <= 0)) + // LESExt - (sext (LHS <= 0)) + enum ZeroCompare { GEZExt, GESExt, LEZExt, LESExt }; + + SDNode *tryEXTEND(SDNode *N); + SDNode *tryLogicOpOfCompares(SDNode *N); + SDValue computeLogicOpInGPR(SDValue LogicOp); + SDValue signExtendInputIfNeeded(SDValue Input); + SDValue zeroExtendInputIfNeeded(SDValue Input); + SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv); + SDValue getCompoundZeroComparisonInGPR(SDValue LHS, SDLoc dl, + ZeroCompare CmpTy); + SDValue get32BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue get64BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue get64BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts); - switch (N->getOpcode()) { - default: break; - case ISD::ROTL: - case ISD::SHL: - case ISD::SRL: - case ISD::AND: - case ISD::OR: { - BitPermutationSelector BPS(CurDAG); - if (SDNode *New = BPS.Select(N)) { - ReplaceNode(N, New); - return true; - } - return false; +public: + IntegerCompareEliminator(SelectionDAG *DAG, + PPCDAGToDAGISel *Sel) : CurDAG(DAG), S(Sel) { + assert(CurDAG->getTargetLoweringInfo() + .getPointerTy(CurDAG->getDataLayout()).getSizeInBits() == 64 && + "Only expecting to use this on 64 bit targets."); } + SDNode *Select(SDNode *N) { + if (CmpInGPR == ICGPR_None) + return nullptr; + switch (N->getOpcode()) { + default: break; + case ISD::ZERO_EXTEND: + if (CmpInGPR == ICGPR_Sext || CmpInGPR == ICGPR_SextI32 || + CmpInGPR == ICGPR_SextI64) + return nullptr; + LLVM_FALLTHROUGH; + case ISD::SIGN_EXTEND: + if (CmpInGPR == ICGPR_Zext || CmpInGPR == ICGPR_ZextI32 || + CmpInGPR == ICGPR_ZextI64) + return nullptr; + return tryEXTEND(N); + case ISD::AND: + case ISD::OR: + case ISD::XOR: + return tryLogicOpOfCompares(N); + } + return nullptr; } +}; - return false; +static bool isLogicOp(unsigned Opc) { + return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR; } +// The obvious case for wanting to keep the value in a GPR. Namely, the +// result of the comparison is actually needed in a GPR. +SDNode *IntegerCompareEliminator::tryEXTEND(SDNode *N) { + assert((N->getOpcode() == ISD::ZERO_EXTEND || + N->getOpcode() == ISD::SIGN_EXTEND) && + "Expecting a zero/sign extend node!"); + SDValue WideRes; + // If we are zero-extending the result of a logical operation on i1 + // values, we can keep the values in GPRs. + if (isLogicOp(N->getOperand(0).getOpcode()) && + N->getOperand(0).getValueType() == MVT::i1 && + N->getOpcode() == ISD::ZERO_EXTEND) + WideRes = computeLogicOpInGPR(N->getOperand(0)); + else if (N->getOperand(0).getOpcode() != ISD::SETCC) + return nullptr; + else + WideRes = + getSETCCInGPR(N->getOperand(0), + N->getOpcode() == ISD::SIGN_EXTEND ? + SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig); -/// SelectCC - Select a comparison of the specified values with the specified -/// condition code, returning the CR# of the expression. -SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, - const SDLoc &dl) { - // Always select the LHS. 
- unsigned Opc; + if (!WideRes) + return nullptr; - if (LHS.getValueType() == MVT::i32) { - unsigned Imm; - if (CC == ISD::SETEQ || CC == ISD::SETNE) { - if (isInt32Immediate(RHS, Imm)) { - // SETEQ/SETNE comparison with 16-bit immediate, fold it. - if (isUInt<16>(Imm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, - getI32Imm(Imm & 0xFFFF, dl)), - 0); - // If this is a 16-bit signed immediate, fold it. - if (isInt<16>((int)Imm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, - getI32Imm(Imm & 0xFFFF, dl)), - 0); + SDLoc dl(N); + bool Input32Bit = WideRes.getValueType() == MVT::i32; + bool Output32Bit = N->getValueType(0) == MVT::i32; - // For non-equality comparisons, the default code would materialize the - // constant, then compare against it, like this: - // lis r2, 4660 - // ori r2, r2, 22136 - // cmpw cr0, r3, r2 - // Since we are just comparing for equality, we can emit this instead: - // xoris r0,r3,0x1234 - // cmplwi cr0,r0,0x5678 - // beq cr0,L6 - SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS, - getI32Imm(Imm >> 16, dl)), 0); - return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor, - getI32Imm(Imm & 0xFFFF, dl)), 0); - } - Opc = PPC::CMPLW; - } else if (ISD::isUnsignedIntSetCC(CC)) { - if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, - getI32Imm(Imm & 0xFFFF, dl)), 0); - Opc = PPC::CMPLW; - } else { - int16_t SImm; - if (isIntS16Immediate(RHS, SImm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, - getI32Imm((int)SImm & 0xFFFF, - dl)), - 0); - Opc = PPC::CMPW; - } - } else if (LHS.getValueType() == MVT::i64) { - uint64_t Imm; - if (CC == ISD::SETEQ || CC == ISD::SETNE) { - if (isInt64Immediate(RHS.getNode(), Imm)) { - // SETEQ/SETNE comparison with 16-bit immediate, fold it. - if (isUInt<16>(Imm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, - getI32Imm(Imm & 0xFFFF, dl)), - 0); - // If this is a 16-bit signed immediate, fold it. - if (isInt<16>(Imm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, - getI32Imm(Imm & 0xFFFF, dl)), - 0); + NumSextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 1 : 0; + NumZextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 0 : 1; - // For non-equality comparisons, the default code would materialize the - // constant, then compare against it, like this: - // lis r2, 4660 - // ori r2, r2, 22136 - // cmpd cr0, r3, r2 - // Since we are just comparing for equality, we can emit this instead: - // xoris r0,r3,0x1234 - // cmpldi cr0,r0,0x5678 - // beq cr0,L6 - if (isUInt<32>(Imm)) { - SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS, - getI64Imm(Imm >> 16, dl)), 0); - return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor, - getI64Imm(Imm & 0xFFFF, dl)), - 0); - } - } - Opc = PPC::CMPLD; - } else if (ISD::isUnsignedIntSetCC(CC)) { - if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, - getI64Imm(Imm & 0xFFFF, dl)), 0); - Opc = PPC::CMPLD; - } else { - int16_t SImm; - if (isIntS16Immediate(RHS, SImm)) - return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, - getI64Imm(SImm & 0xFFFF, dl)), - 0); - Opc = PPC::CMPD; - } - } else if (LHS.getValueType() == MVT::f32) { - Opc = PPC::FCMPUS; + SDValue ConvOp = WideRes; + if (Input32Bit != Output32Bit) + ConvOp = addExtOrTrunc(WideRes, Input32Bit ? 
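The xoris/cmplwi folding in the (relocated) SelectCC body rests on an identity worth spelling out: for a 32-bit constant C split as C = (Hi << 16) | Lo, a == C holds exactly when (a ^ (Hi << 16)) == Lo, so one XORIS plus one CMPLWI replaces materializing C with LIS+ORI. A standalone spot check:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 0x12345678u; // would otherwise need lis+ori+cmpw
  const uint32_t Hi = C >> 16, Lo = C & 0xffffu;
  assert(((C ^ (Hi << 16)) == Lo)); // the equal case folds to equal
  for (uint64_t a = 0; a <= 0xffffffffull; a += 0x10001ull) { // samples
    bool Eq = ((uint32_t)a == C);
    bool Folded = (((uint32_t)a ^ (Hi << 16)) == Lo); // xoris ; cmplwi
    assert(Eq == Folded);
  }
}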
ExtOrTruncConversion::Ext : + ExtOrTruncConversion::Trunc); + return ConvOp.getNode(); +} + +// Attempt to perform logical operations on the results of comparisons while +// keeping the values in GPRs. Without doing so, these would end up being +// lowered to CR-logical operations which suffer from significant latency and +// low ILP. +SDNode *IntegerCompareEliminator::tryLogicOpOfCompares(SDNode *N) { + if (N->getValueType(0) != MVT::i1) + return nullptr; + assert(isLogicOp(N->getOpcode()) && + "Expected a logic operation on setcc results."); + SDValue LoweredLogical = computeLogicOpInGPR(SDValue(N, 0)); + if (!LoweredLogical) + return nullptr; + + SDLoc dl(N); + bool IsBitwiseNegate = LoweredLogical.getMachineOpcode() == PPC::XORI8; + unsigned SubRegToExtract = IsBitwiseNegate ? PPC::sub_eq : PPC::sub_gt; + SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32); + SDValue LHS = LoweredLogical.getOperand(0); + SDValue RHS = LoweredLogical.getOperand(1); + SDValue WideOp; + SDValue OpToConvToRecForm; + + // Look through any 32-bit to 64-bit implicit extend nodes to find the + // opcode that is input to the XORI. + if (IsBitwiseNegate && + LoweredLogical.getOperand(0).getMachineOpcode() == PPC::INSERT_SUBREG) + OpToConvToRecForm = LoweredLogical.getOperand(0).getOperand(1); + else if (IsBitwiseNegate) + // If the input to the XORI isn't an extension, that's what we're after. + OpToConvToRecForm = LoweredLogical.getOperand(0); + else + // If this is not an XORI, it is a reg-reg logical op and we can convert + // it to record-form. + OpToConvToRecForm = LoweredLogical; + + // Get the record-form version of the node we're looking to use to get the + // CR result from. + uint16_t NonRecOpc = OpToConvToRecForm.getMachineOpcode(); + int NewOpc = PPCInstrInfo::getRecordFormOpcode(NonRecOpc); + + // Convert the right node to record-form. This is either the logical we're + // looking at or it is the input node to the negation (if we're looking at + // a bitwise negation). + if (NewOpc != -1 && IsBitwiseNegate) { + // The input to the XORI has a record-form. Use it. + assert(LoweredLogical.getConstantOperandVal(1) == 1 && + "Expected a PPC::XORI8 only for bitwise negation."); + // Emit the record-form instruction. + std::vector Ops; + for (int i = 0, e = OpToConvToRecForm.getNumOperands(); i < e; i++) + Ops.push_back(OpToConvToRecForm.getOperand(i)); + + WideOp = + SDValue(CurDAG->getMachineNode(NewOpc, dl, + OpToConvToRecForm.getValueType(), + MVT::Glue, Ops), 0); } else { - assert(LHS.getValueType() == MVT::f64 && "Unknown vt!"); - Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD; + assert((NewOpc != -1 || !IsBitwiseNegate) && + "No record form available for AND8/OR8/XOR8?"); + WideOp = + SDValue(CurDAG->getMachineNode(NewOpc == -1 ? PPC::ANDIo8 : NewOpc, dl, + MVT::i64, MVT::Glue, LHS, RHS), 0); } - return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0); + + // Select this node to a single bit from CR0 set by the record-form node + // just created. For bitwise negation, use the EQ bit which is the equivalent + // of negating the result (i.e. it is a bit set when the result of the + // operation is zero). 
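The choice of sub_gt versus sub_eq above can be sanity-checked with a small model of record-form semantics: a dot-form instruction compares its result against zero and sets CR0, so for an i1 value in a GPR the GT bit is the value itself, while the EQ bit of the negation's input is already the negated value. Sketch (the CR0 struct is illustrative, not an LLVM type):

#include <cassert>
#include <cstdint>

struct CR0 { bool LT, GT, EQ; };        // illustrative only
static CR0 recordForm(int64_t Result) { // what a dot-form op sets
  return {Result < 0, Result > 0, Result == 0};
}

int main() {
  for (int64_t x : {0, 1}) {            // an i1 value held in a GPR
    assert(!recordForm(x).LT);
    assert(recordForm(x).GT == (x != 0)); // sub_gt: the value itself
    // Bitwise negation (xori x, 1): rather than execute the xori, take
    // the EQ bit of the xori's *input*, which is already !x.
    assert(recordForm(x).EQ == (x == 0));
    assert((x ^ 1) == (x == 0 ? 1 : 0));
  }
}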
+ SDValue SRIdxVal = + CurDAG->getTargetConstant(SubRegToExtract, dl, MVT::i32); + SDValue CRBit = + SDValue(CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, + MVT::i1, CR0Reg, SRIdxVal, + WideOp.getValue(1)), 0); + return CRBit.getNode(); } -static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) { - switch (CC) { - case ISD::SETUEQ: - case ISD::SETONE: - case ISD::SETOLE: - case ISD::SETOGE: - llvm_unreachable("Should be lowered by legalize!"); - default: llvm_unreachable("Unknown condition!"); - case ISD::SETOEQ: - case ISD::SETEQ: return PPC::PRED_EQ; - case ISD::SETUNE: - case ISD::SETNE: return PPC::PRED_NE; - case ISD::SETOLT: - case ISD::SETLT: return PPC::PRED_LT; - case ISD::SETULE: - case ISD::SETLE: return PPC::PRED_LE; - case ISD::SETOGT: - case ISD::SETGT: return PPC::PRED_GT; - case ISD::SETUGE: - case ISD::SETGE: return PPC::PRED_GE; - case ISD::SETO: return PPC::PRED_NU; - case ISD::SETUO: return PPC::PRED_UN; - // These two are invalid for floating point. Assume we have int. - case ISD::SETULT: return PPC::PRED_LT; - case ISD::SETUGT: return PPC::PRED_GT; +// Lower a logical operation on i1 values into a GPR sequence if possible. +// The result can be kept in a GPR if requested. +// Three types of inputs can be handled: +// - SETCC +// - TRUNCATE +// - Logical operation (AND/OR/XOR) +// There is also a special case that is handled (namely a complement operation +// achieved with xor %a, -1). +SDValue IntegerCompareEliminator::computeLogicOpInGPR(SDValue LogicOp) { + assert(isLogicOp(LogicOp.getOpcode()) && + "Can only handle logic operations here."); + assert(LogicOp.getValueType() == MVT::i1 && + "Can only handle logic operations on i1 values here."); + SDLoc dl(LogicOp); + SDValue LHS, RHS; + + // Special case: xor %a, -1 + bool IsBitwiseNegation = isBitwiseNot(LogicOp); + + // Produces a GPR sequence for each operand of the binary logic operation. + // For SETCC, it produces the respective comparison, for TRUNCATE it truncates + // the value in a GPR and for logic operations, it will recursively produce + // a GPR sequence for the operation. + auto getLogicOperand = [&] (SDValue Operand) -> SDValue { + unsigned OperandOpcode = Operand.getOpcode(); + if (OperandOpcode == ISD::SETCC) + return getSETCCInGPR(Operand, SetccInGPROpts::ZExtOrig); + else if (OperandOpcode == ISD::TRUNCATE) { + SDValue InputOp = Operand.getOperand(0); + EVT InVT = InputOp.getValueType(); + return SDValue(CurDAG->getMachineNode(InVT == MVT::i32 ? PPC::RLDICL_32 : + PPC::RLDICL, dl, InVT, InputOp, + S->getI64Imm(0, dl), + S->getI64Imm(63, dl)), 0); + } else if (isLogicOp(OperandOpcode)) + return computeLogicOpInGPR(Operand); + return SDValue(); + }; + LHS = getLogicOperand(LogicOp.getOperand(0)); + RHS = getLogicOperand(LogicOp.getOperand(1)); + + // If a GPR sequence can't be produced for the LHS we can't proceed. + // Not producing a GPR sequence for the RHS is only a problem if this isn't + // a bitwise negation operation. + if (!LHS || (!RHS && !IsBitwiseNegation)) + return SDValue(); + + NumLogicOpsOnComparison++; + + // We will use the inputs as 64-bit values. 
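The TRUNCATE arm of getLogicOperand relies on RLDICL(x, 0, 63) being exactly x & 1. A quick standalone check of that reading of rldicl (rotate-left-doubleword-immediate-then-clear-left, written out locally):

#include <cassert>
#include <cstdint>

// rldicl x, SH, MB: rotate left by SH, keep bits MB..63 (PPC numbers
// bits from the MSB), i.e. AND with a mask of the low 64-MB bits.
static uint64_t rldicl(uint64_t X, unsigned SH, unsigned MB) {
  uint64_t Rot = SH ? (X << SH) | (X >> (64 - SH)) : X;
  uint64_t Mask = MB ? (~0ULL >> MB) : ~0ULL;
  return Rot & Mask;
}

int main() {
  for (uint64_t X : {0x0ULL, 0x1ULL, 0xfffffffffffffffeULL, 0x12345ULL})
    assert(rldicl(X, 0, 63) == (X & 1)); // the i1 truncation above
}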
+ if (LHS.getValueType() == MVT::i32) + LHS = addExtOrTrunc(LHS, ExtOrTruncConversion::Ext); + if (!IsBitwiseNegation && RHS.getValueType() == MVT::i32) + RHS = addExtOrTrunc(RHS, ExtOrTruncConversion::Ext); + + unsigned NewOpc; + switch (LogicOp.getOpcode()) { + default: llvm_unreachable("Unknown logic operation."); + case ISD::AND: NewOpc = PPC::AND8; break; + case ISD::OR: NewOpc = PPC::OR8; break; + case ISD::XOR: NewOpc = PPC::XOR8; break; } -} -/// getCRIdxForSetCC - Return the index of the condition register field -/// associated with the SetCC condition, and whether or not the field is -/// treated as inverted. That is, lt = 0; ge = 0 inverted. -static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) { - Invert = false; - switch (CC) { - default: llvm_unreachable("Unknown condition!"); - case ISD::SETOLT: - case ISD::SETLT: return 0; // Bit #0 = SETOLT - case ISD::SETOGT: - case ISD::SETGT: return 1; // Bit #1 = SETOGT - case ISD::SETOEQ: - case ISD::SETEQ: return 2; // Bit #2 = SETOEQ - case ISD::SETUO: return 3; // Bit #3 = SETUO - case ISD::SETUGE: - case ISD::SETGE: Invert = true; return 0; // !Bit #0 = SETUGE - case ISD::SETULE: - case ISD::SETLE: Invert = true; return 1; // !Bit #1 = SETULE - case ISD::SETUNE: - case ISD::SETNE: Invert = true; return 2; // !Bit #2 = SETUNE - case ISD::SETO: Invert = true; return 3; // !Bit #3 = SETO - case ISD::SETUEQ: - case ISD::SETOGE: - case ISD::SETOLE: - case ISD::SETONE: - llvm_unreachable("Invalid branch code: should be expanded by legalize"); - // These are invalid for floating point. Assume integer. - case ISD::SETULT: return 0; - case ISD::SETUGT: return 1; + if (IsBitwiseNegation) { + RHS = S->getI64Imm(1, dl); + NewOpc = PPC::XORI8; } + + return SDValue(CurDAG->getMachineNode(NewOpc, dl, MVT::i64, LHS, RHS), 0); + } -// getVCmpInst: return the vector compare instruction for the specified -// vector type and condition code. Since this is for altivec specific code, -// only support the altivec types (v16i8, v8i16, v4i32, v2i64, and v4f32). -static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC, - bool HasVSX, bool &Swap, bool &Negate) { - Swap = false; - Negate = false; +/// If the value isn't guaranteed to be sign-extended to 64-bits, extend it. +/// Otherwise just reinterpret it as a 64-bit value. +/// Useful when emitting comparison code for 32-bit values without using +/// the compare instruction (which only considers the lower 32-bits). +SDValue IntegerCompareEliminator::signExtendInputIfNeeded(SDValue Input) { + assert(Input.getValueType() == MVT::i32 && + "Can only sign-extend 32-bit values here."); + unsigned Opc = Input.getOpcode(); - if (VecVT.isFloatingPoint()) { - /* Handle some cases by swapping input operands. */ - switch (CC) { - case ISD::SETLE: CC = ISD::SETGE; Swap = true; break; - case ISD::SETLT: CC = ISD::SETGT; Swap = true; break; - case ISD::SETOLE: CC = ISD::SETOGE; Swap = true; break; - case ISD::SETOLT: CC = ISD::SETOGT; Swap = true; break; - case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break; - case ISD::SETUGT: CC = ISD::SETULT; Swap = true; break; - default: break; - } - /* Handle some cases by negating the result. */ - switch (CC) { - case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break; - case ISD::SETUNE: CC = ISD::SETOEQ; Negate = true; break; - case ISD::SETULE: CC = ISD::SETOGT; Negate = true; break; - case ISD::SETULT: CC = ISD::SETOGE; Negate = true; break; - default: break; - } - /* We have instructions implementing the remaining cases. 
*/ - switch (CC) { - case ISD::SETEQ: - case ISD::SETOEQ: - if (VecVT == MVT::v4f32) - return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP; - else if (VecVT == MVT::v2f64) - return PPC::XVCMPEQDP; - break; - case ISD::SETGT: - case ISD::SETOGT: - if (VecVT == MVT::v4f32) - return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP; - else if (VecVT == MVT::v2f64) - return PPC::XVCMPGTDP; - break; - case ISD::SETGE: - case ISD::SETOGE: - if (VecVT == MVT::v4f32) - return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP; - else if (VecVT == MVT::v2f64) - return PPC::XVCMPGEDP; - break; - default: - break; - } - llvm_unreachable("Invalid floating-point vector compare condition"); - } else { - /* Handle some cases by swapping input operands. */ - switch (CC) { - case ISD::SETGE: CC = ISD::SETLE; Swap = true; break; - case ISD::SETLT: CC = ISD::SETGT; Swap = true; break; - case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break; - case ISD::SETULT: CC = ISD::SETUGT; Swap = true; break; - default: break; - } - /* Handle some cases by negating the result. */ - switch (CC) { - case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break; - case ISD::SETUNE: CC = ISD::SETUEQ; Negate = true; break; - case ISD::SETLE: CC = ISD::SETGT; Negate = true; break; - case ISD::SETULE: CC = ISD::SETUGT; Negate = true; break; - default: break; - } - /* We have instructions implementing the remaining cases. */ - switch (CC) { - case ISD::SETEQ: - case ISD::SETUEQ: - if (VecVT == MVT::v16i8) - return PPC::VCMPEQUB; - else if (VecVT == MVT::v8i16) - return PPC::VCMPEQUH; - else if (VecVT == MVT::v4i32) - return PPC::VCMPEQUW; - else if (VecVT == MVT::v2i64) - return PPC::VCMPEQUD; - break; - case ISD::SETGT: - if (VecVT == MVT::v16i8) - return PPC::VCMPGTSB; - else if (VecVT == MVT::v8i16) - return PPC::VCMPGTSH; - else if (VecVT == MVT::v4i32) - return PPC::VCMPGTSW; - else if (VecVT == MVT::v2i64) - return PPC::VCMPGTSD; - break; - case ISD::SETUGT: - if (VecVT == MVT::v16i8) - return PPC::VCMPGTUB; - else if (VecVT == MVT::v8i16) - return PPC::VCMPGTUH; - else if (VecVT == MVT::v4i32) - return PPC::VCMPGTUW; - else if (VecVT == MVT::v2i64) - return PPC::VCMPGTUD; - break; - default: - break; - } - llvm_unreachable("Invalid integer vector compare condition"); - } + // The value was sign extended and then truncated to 32-bits. No need to + // sign extend it again. + if (Opc == ISD::TRUNCATE && + (Input.getOperand(0).getOpcode() == ISD::AssertSext || + Input.getOperand(0).getOpcode() == ISD::SIGN_EXTEND)) + return addExtOrTrunc(Input, ExtOrTruncConversion::Ext); + + LoadSDNode *InputLoad = dyn_cast(Input); + // The input is a sign-extending load. All ppc sign-extending loads + // sign-extend to the full 64-bits. + if (InputLoad && InputLoad->getExtensionType() == ISD::SEXTLOAD) + return addExtOrTrunc(Input, ExtOrTruncConversion::Ext); + + ConstantSDNode *InputConst = dyn_cast(Input); + // We don't sign-extend constants. + if (InputConst) + return addExtOrTrunc(Input, ExtOrTruncConversion::Ext); + + SDLoc dl(Input); + SignExtensionsAdded++; + return SDValue(CurDAG->getMachineNode(PPC::EXTSW_32_64, dl, + MVT::i64, Input), 0); } -bool PPCDAGToDAGISel::trySETCC(SDNode *N) { - SDLoc dl(N); - unsigned Imm; - ISD::CondCode CC = cast(N->getOperand(2))->get(); - EVT PtrVT = - CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout()); - bool isPPC64 = (PtrVT == MVT::i64); +/// If the value isn't guaranteed to be zero-extended to 64-bits, extend it. +/// Otherwise just reinterpret it as a 64-bit value. 
+/// Useful when emitting comparison code for 32-bit values without using +/// the compare instruction (which only considers the lower 32-bits). +SDValue IntegerCompareEliminator::zeroExtendInputIfNeeded(SDValue Input) { + assert(Input.getValueType() == MVT::i32 && + "Can only zero-extend 32-bit values here."); + unsigned Opc = Input.getOpcode(); - if (!PPCSubTarget->useCRBits() && - isInt32Immediate(N->getOperand(1), Imm)) { - // We can codegen setcc op, imm very efficiently compared to a brcond. - // Check for those cases here. - // setcc op, 0 - if (Imm == 0) { - SDValue Op = N->getOperand(0); - switch (CC) { - default: break; - case ISD::SETEQ: { - Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0); - SDValue Ops[] = { Op, getI32Imm(27, dl), getI32Imm(5, dl), - getI32Imm(31, dl) }; - CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); - return true; - } - case ISD::SETNE: { - if (isPPC64) break; - SDValue AD = - SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, - Op, getI32Imm(~0U, dl)), 0); - CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op, AD.getValue(1)); - return true; - } - case ISD::SETLT: { - SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl), - getI32Imm(31, dl) }; - CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); - return true; - } - case ISD::SETGT: { - SDValue T = - SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0); - T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0); - SDValue Ops[] = { T, getI32Imm(1, dl), getI32Imm(31, dl), - getI32Imm(31, dl) }; - CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); - return true; - } - } - } else if (Imm == ~0U) { // setcc op, -1 - SDValue Op = N->getOperand(0); - switch (CC) { - default: break; - case ISD::SETEQ: - if (isPPC64) break; - Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, - Op, getI32Imm(1, dl)), 0); - CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, - SDValue(CurDAG->getMachineNode(PPC::LI, dl, - MVT::i32, - getI32Imm(0, dl)), - 0), Op.getValue(1)); - return true; - case ISD::SETNE: { - if (isPPC64) break; - Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0); - SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, - Op, getI32Imm(~0U, dl)); - CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0), Op, - SDValue(AD, 1)); - return true; - } - case ISD::SETLT: { - SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op, - getI32Imm(1, dl)), 0); - SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD, - Op), 0); - SDValue Ops[] = { AN, getI32Imm(1, dl), getI32Imm(31, dl), - getI32Imm(31, dl) }; - CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); - return true; - } - case ISD::SETGT: { - SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl), - getI32Imm(31, dl) }; - Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); - CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op, getI32Imm(1, dl)); - return true; - } - } - } + // The only condition under which we can omit the actual extend instruction: + // - The value is a positive constant + // - The value comes from a load that isn't a sign-extending load + // An ISD::TRUNCATE needs to be zero-extended unless it is fed by a zext. 
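Condensed, the decision this function makes looks like the following sketch (the Producer enum is illustrative; the real code inspects SDNode opcodes): only producers that already guarantee a clean upper word avoid the explicit RLDICL_32_64.

#include <cassert>

// Illustrative condensation; names here are local to the sketch.
enum class Producer { TruncOfZext, NonNegConstant, NonSextLoad, Other };

static bool needsExplicitZext(Producer P) {
  switch (P) {
  case Producer::TruncOfZext:    // trunc fed by zext/AssertZext
  case Producer::NonNegConstant: // constant with the sign bit clear
  case Producer::NonSextLoad:    // any load other than a SEXTLOAD
    return false;                // reinterpret in place (INSERT_SUBREG)
  case Producer::Other:
    return true;                 // pay for one rldicl ..., 0, 32
  }
  return true;
}

int main() {
  assert(!needsExplicitZext(Producer::NonSextLoad));
  assert(needsExplicitZext(Producer::Other));
}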
+ bool IsTruncateOfZExt = Opc == ISD::TRUNCATE && + (Input.getOperand(0).getOpcode() == ISD::AssertZext || + Input.getOperand(0).getOpcode() == ISD::ZERO_EXTEND); + if (IsTruncateOfZExt) + return addExtOrTrunc(Input, ExtOrTruncConversion::Ext); + + ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input); + if (InputConst && InputConst->getSExtValue() >= 0) + return addExtOrTrunc(Input, ExtOrTruncConversion::Ext); + + LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input); + // The input is a load that doesn't sign-extend (it will be zero-extended). + if (InputLoad && InputLoad->getExtensionType() != ISD::SEXTLOAD) + return addExtOrTrunc(Input, ExtOrTruncConversion::Ext); + + // None of the above, need to zero-extend. + SDLoc dl(Input); + ZeroExtensionsAdded++; + return SDValue(CurDAG->getMachineNode(PPC::RLDICL_32_64, dl, MVT::i64, Input, + S->getI64Imm(0, dl), + S->getI64Imm(32, dl)), 0); +} + +// Handle a 32-bit value in a 64-bit register and vice-versa. These are of +// course not actual zero/sign extensions that will generate machine code, +// they're just a way to reinterpret a 32 bit value in a register as a +// 64 bit value and vice-versa. +SDValue IntegerCompareEliminator::addExtOrTrunc(SDValue NatWidthRes, + ExtOrTruncConversion Conv) { + SDLoc dl(NatWidthRes); + + // For reinterpreting 32-bit values as 64 bit values, we generate + // INSERT_SUBREG IMPLICIT_DEF:i64, <input>, TargetConstant:i32<1> + if (Conv == ExtOrTruncConversion::Ext) { + SDValue ImDef(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, MVT::i64), 0); + SDValue SubRegIdx = + CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32); + return SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, MVT::i64, + ImDef, NatWidthRes, SubRegIdx), 0); } - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); + assert(Conv == ExtOrTruncConversion::Trunc && + "Unknown conversion between 32 and 64 bit values."); + // For reinterpreting 64-bit values as 32-bit values, we just need to + // EXTRACT_SUBREG (i.e. extract the low word). + SDValue SubRegIdx = + CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32); + return SDValue(CurDAG->getMachineNode(PPC::EXTRACT_SUBREG, dl, MVT::i32, + NatWidthRes, SubRegIdx), 0); +} - // Altivec Vector compare instructions do not set any CR register by default and - // vector compare operations return the same type as the operands. - if (LHS.getValueType().isVector()) { - if (PPCSubTarget->hasQPX()) - return false; +// Produce a GPR sequence for compound comparisons (<=, >=) against zero. +// Handle both zero-extensions and sign-extensions. +SDValue +IntegerCompareEliminator::getCompoundZeroComparisonInGPR(SDValue LHS, SDLoc dl, + ZeroCompare CmpTy) { + EVT InVT = LHS.getValueType(); + bool Is32Bit = InVT == MVT::i32; + SDValue ToExtend; + + // Produce the value that needs to be either zero or sign extended. + switch (CmpTy) { + case ZeroCompare::GEZExt: + case ZeroCompare::GESExt: + ToExtend = SDValue(CurDAG->getMachineNode(Is32Bit ? PPC::NOR : PPC::NOR8, + dl, InVT, LHS, LHS), 0); + break; + case ZeroCompare::LEZExt: + case ZeroCompare::LESExt: { + if (Is32Bit) { + // Upper 32 bits cannot be undefined for this sequence.
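The zero-compare sequences reduce to sign-bit arithmetic, which can be verified standalone (assuming arithmetic right shift for signed types, as on the target): zext(a >= 0) is the inverted sign bit, sext(a >= 0) smears it, and zext(a <= 0) flips the sign bit of the 64-bit negation, which is why the 32-bit LE case sign-extends its input first.

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t a : {0, 1, -1, 42, INT32_MIN, INT32_MAX}) {
    uint32_t NotA = ~(uint32_t)a;                       // nor %a, %a
    assert((NotA >> 31) == (a >= 0 ? 1u : 0u));         // GEZExt
    assert(((int32_t)NotA >> 31) == (a >= 0 ? -1 : 0)); // GESExt
    // LE: sign-extend first so the upper word is defined, then negate
    // and flip the sign bit of the negation.
    int64_t A64 = a;                                    // EXTSW_32_64
    uint64_t SignOfNeg = (uint64_t)(-A64) >> 63;
    assert((SignOfNeg ^ 1) == (a <= 0 ? 1u : 0u));      // LEZExt
  }
}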
+ LHS = signExtendInputIfNeeded(LHS); + SDValue Neg = + SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, LHS), 0); + ToExtend = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + Neg, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + } else { + SDValue Addi = + SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, LHS, + S->getI64Imm(~0ULL, dl)), 0); + ToExtend = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, + Addi, LHS), 0); + } + break; + } + } - EVT VecVT = LHS.getValueType(); - bool Swap, Negate; - unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC, - PPCSubTarget->hasVSX(), Swap, Negate); - if (Swap) - std::swap(LHS, RHS); + // For 64-bit sequences, the extensions are the same for the GE/LE cases. + if (!Is32Bit && + (CmpTy == ZeroCompare::GEZExt || CmpTy == ZeroCompare::LEZExt)) + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + ToExtend, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + if (!Is32Bit && + (CmpTy == ZeroCompare::GESExt || CmpTy == ZeroCompare::LESExt)) + return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, ToExtend, + S->getI64Imm(63, dl)), 0); + + assert(Is32Bit && "Should have handled the 32-bit sequences above."); + // For 32-bit sequences, the extensions differ between GE/LE cases. + switch (CmpTy) { + case ZeroCompare::GEZExt: { + SDValue ShiftOps[] = { ToExtend, S->getI32Imm(1, dl), S->getI32Imm(31, dl), + S->getI32Imm(31, dl) }; + return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, + ShiftOps), 0); + } + case ZeroCompare::GESExt: + return SDValue(CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, ToExtend, + S->getI32Imm(31, dl)), 0); + case ZeroCompare::LEZExt: + return SDValue(CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64, ToExtend, + S->getI32Imm(1, dl)), 0); + case ZeroCompare::LESExt: + return SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, ToExtend, + S->getI32Imm(-1, dl)), 0); + } - EVT ResVT = VecVT.changeVectorElementTypeToInteger(); - if (Negate) { - SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0); - CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR, - ResVT, VCmp, VCmp); - return true; + // The above case covers all the enumerators so it can't have a default clause + // to avoid compiler warnings. + llvm_unreachable("Unknown zero-comparison type."); +} + +/// Produces a zero-extended result of comparing two 32-bit values according to +/// the passed condition code. +SDValue +IntegerCompareEliminator::get32BitZExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + if (CmpInGPR == ICGPR_I64 || CmpInGPR == ICGPR_SextI64 || + CmpInGPR == ICGPR_ZextI64 || CmpInGPR == ICGPR_Sext) + return SDValue(); + bool IsRHSZero = RHSValue == 0; + bool IsRHSOne = RHSValue == 1; + bool IsRHSNegOne = RHSValue == -1LL; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // (zext (setcc %a, %b, seteq)) -> (lshr (cntlzw (xor %a, %b)), 5) + // (zext (setcc %a, 0, seteq)) -> (lshr (cntlzw %a), 5) + SDValue Xor = IsRHSZero ? 
LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); + SDValue ShiftOps[] = { Clz, S->getI32Imm(27, dl), S->getI32Imm(5, dl), + S->getI32Imm(31, dl) }; + return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, + ShiftOps), 0); + } + case ISD::SETNE: { + // (zext (setcc %a, %b, setne)) -> (xor (lshr (cntlzw (xor %a, %b)), 5), 1) + // (zext (setcc %a, 0, setne)) -> (xor (lshr (cntlzw %a), 5), 1) + SDValue Xor = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); + SDValue ShiftOps[] = { Clz, S->getI32Imm(27, dl), S->getI32Imm(5, dl), + S->getI32Imm(31, dl) }; + SDValue Shift = + SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0); + return SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift, + S->getI32Imm(1, dl)), 0); + } + case ISD::SETGE: { + // (zext (setcc %a, %b, setge)) -> (xor (lshr (sub %a, %b), 63), 1) + // (zext (setcc %a, 0, setge)) -> (lshr (~ %a), 31) + if(IsRHSZero) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GEZExt); + + // Not a special case (i.e. RHS == 0). Handle (%a >= %b) as (%b <= %a) + // by swapping inputs and falling through. + std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + LLVM_FALLTHROUGH; + } + case ISD::SETLE: { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // (zext (setcc %a, %b, setle)) -> (xor (lshr (sub %b, %a), 63), 1) + // (zext (setcc %a, 0, setle)) -> (xor (lshr (- %a), 63), 1) + if(IsRHSZero) { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LEZExt); } - CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS); - return true; + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = signExtendInputIfNeeded(LHS); + RHS = signExtendInputIfNeeded(RHS); + SDValue Sub = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, LHS, RHS), 0); + SDValue Shift = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Sub, + S->getI64Imm(1, dl), S->getI64Imm(63, dl)), + 0); + return + SDValue(CurDAG->getMachineNode(PPC::XORI8, dl, + MVT::i64, Shift, S->getI32Imm(1, dl)), 0); } + case ISD::SETGT: { + // (zext (setcc %a, %b, setgt)) -> (lshr (sub %b, %a), 63) + // (zext (setcc %a, -1, setgt)) -> (lshr (~ %a), 31) + // (zext (setcc %a, 0, setgt)) -> (lshr (- %a), 63) + // Handle SETLT -1 (which is equivalent to SETGE 0). + if (IsRHSNegOne) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GEZExt); + + if (IsRHSZero) { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = signExtendInputIfNeeded(LHS); + RHS = signExtendInputIfNeeded(RHS); + SDValue Neg = + SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + Neg, S->getI32Imm(1, dl), S->getI32Imm(63, dl)), 0); + } + // Not a special case (i.e. RHS == 0 or RHS == -1). Handle (%a > %b) as + // (%b < %a) by swapping inputs and falling through. 
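The SETEQ/SETNE sequences above lean on a classic count-leading-zeros trick: for 32-bit x, cntlzw(x) is 32 exactly when x == 0, and 32 is the only possible result with bit 5 set. Standalone check (cntlzw written out locally):

#include <cassert>
#include <cstdint>

static unsigned cntlzw(uint32_t X) { // matches the PPC instruction
  unsigned N = 0;
  while (N < 32 && !(X & (0x80000000u >> N)))
    ++N;
  return N; // 32 iff X == 0
}

int main() {
  uint32_t Samples[] = {0u, 1u, 0x8000u, 0xdeadbeefu, 0xffffffffu};
  for (uint32_t a : Samples)
    for (uint32_t b : Samples)
      // rlwinm Clz, 27, 5, 31 acts as a right shift by 5 here.
      assert(((cntlzw(a ^ b) >> 5) & 1u) == (a == b ? 1u : 0u));
}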
+ std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1; + LLVM_FALLTHROUGH; + } + case ISD::SETLT: { + // (zext (setcc %a, %b, setlt)) -> (lshr (sub %a, %b), 63) + // (zext (setcc %a, 1, setlt)) -> (xor (lshr (- %a), 63), 1) + // (zext (setcc %a, 0, setlt)) -> (lshr %a, 31) + // Handle SETLT 1 (which is equivalent to SETLE 0). + if (IsRHSOne) { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LEZExt); + } - if (PPCSubTarget->useCRBits()) - return false; - - bool Inv; - unsigned Idx = getCRIdxForSetCC(CC, Inv); - SDValue CCReg = SelectCC(LHS, RHS, CC, dl); - SDValue IntCR; - - // Force the ccreg into CR7. - SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32); + if (IsRHSZero) { + SDValue ShiftOps[] = { LHS, S->getI32Imm(1, dl), S->getI32Imm(31, dl), + S->getI32Imm(31, dl) }; + return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, + ShiftOps), 0); + } - SDValue InFlag(nullptr, 0); // Null incoming flag value. - CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg, - InFlag).getValue(1); + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = signExtendInputIfNeeded(LHS); + RHS = signExtendInputIfNeeded(RHS); + SDValue SUBFNode = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, RHS, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + SUBFNode, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + } + case ISD::SETUGE: + // (zext (setcc %a, %b, setuge)) -> (xor (lshr (sub %b, %a), 63), 1) + // (zext (setcc %a, %b, setule)) -> (xor (lshr (sub %a, %b), 63), 1) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULE: { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = zeroExtendInputIfNeeded(LHS); + RHS = zeroExtendInputIfNeeded(RHS); + SDValue Subtract = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, LHS, RHS), 0); + SDValue SrdiNode = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + Subtract, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64, SrdiNode, + S->getI32Imm(1, dl)), 0); + } + case ISD::SETUGT: + // (zext (setcc %a, %b, setugt)) -> (lshr (sub %b, %a), 63) + // (zext (setcc %a, %b, setult)) -> (lshr (sub %a, %b), 63) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULT: { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = zeroExtendInputIfNeeded(LHS); + RHS = zeroExtendInputIfNeeded(RHS); + SDValue Subtract = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, RHS, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + Subtract, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + } + } +} - IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg, - CCReg), 0); +/// Produces a sign-extended result of comparing two 32-bit values according to +/// the passed condition code. 
+SDValue +IntegerCompareEliminator::get32BitSExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + if (CmpInGPR == ICGPR_I64 || CmpInGPR == ICGPR_SextI64 || + CmpInGPR == ICGPR_ZextI64 || CmpInGPR == ICGPR_Zext) + return SDValue(); + bool IsRHSZero = RHSValue == 0; + bool IsRHSOne = RHSValue == 1; + bool IsRHSNegOne = RHSValue == -1LL; - SDValue Ops[] = { IntCR, getI32Imm((32 - (3 - Idx)) & 31, dl), - getI32Imm(31, dl), getI32Imm(31, dl) }; - if (!Inv) { - CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); - return true; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // (sext (setcc %a, %b, seteq)) -> + // (ashr (shl (ctlz (xor %a, %b)), 58), 63) + // (sext (setcc %a, 0, seteq)) -> + // (ashr (shl (ctlz %a), 58), 63) + SDValue CountInput = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); + SDValue Cntlzw = + SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, CountInput), 0); + SDValue SHLOps[] = { Cntlzw, S->getI32Imm(27, dl), + S->getI32Imm(5, dl), S->getI32Imm(31, dl) }; + SDValue Slwi = + SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, SHLOps), 0); + return SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Slwi), 0); + } + case ISD::SETNE: { + // Bitwise xor the operands, count leading zeros, shift right by 5 bits and + // flip the bit, finally take 2's complement. + // (sext (setcc %a, %b, setne)) -> + // (neg (xor (lshr (ctlz (xor %a, %b)), 5), 1)) + // Same as above, but the first xor is not needed. + // (sext (setcc %a, 0, setne)) -> + // (neg (xor (lshr (ctlz %a), 5), 1)) + SDValue Xor = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); + SDValue ShiftOps[] = + { Clz, S->getI32Imm(27, dl), S->getI32Imm(5, dl), S->getI32Imm(31, dl) }; + SDValue Shift = + SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0); + SDValue Xori = + SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift, + S->getI32Imm(1, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Xori), 0); + } + case ISD::SETGE: { + // (sext (setcc %a, %b, setge)) -> (add (lshr (sub %a, %b), 63), -1) + // (sext (setcc %a, 0, setge)) -> (ashr (~ %a), 31) + if (IsRHSZero) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GESExt); + + // Not a special case (i.e. RHS == 0). Handle (%a >= %b) as (%b <= %a) + // by swapping inputs and falling through. + std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + LLVM_FALLTHROUGH; + } + case ISD::SETLE: { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // (sext (setcc %a, %b, setge)) -> (add (lshr (sub %b, %a), 63), -1) + // (sext (setcc %a, 0, setle)) -> (add (lshr (- %a), 63), -1) + if (IsRHSZero) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LESExt); + + // The upper 32-bits of the register can't be undefined for this sequence. 
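The sign-extended SETEQ/SETNE forms just negate the zero-extended bit, since -x maps 0/1 to the 0/-1 mask. Spot check (cntlzw again written out locally):

#include <cassert>
#include <cstdint>

static unsigned cntlzw(uint32_t X) {
  unsigned N = 0;
  while (N < 32 && !(X & (0x80000000u >> N)))
    ++N;
  return N;
}

int main() {
  uint32_t Samples[] = {0u, 7u, 0xdeadbeefu};
  for (uint32_t a : Samples)
    for (uint32_t b : Samples) {
      int32_t ZextEq = (int32_t)((cntlzw(a ^ b) >> 5) & 1u);
      assert(-ZextEq == (a == b ? -1 : 0));       // sext(seteq): neg
      assert(-(ZextEq ^ 1) == (a != b ? -1 : 0)); // sext(setne): xori, neg
    }
}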
+ LHS = signExtendInputIfNeeded(LHS); + RHS = signExtendInputIfNeeded(RHS); + SDValue SUBFNode = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, MVT::Glue, + LHS, RHS), 0); + SDValue Srdi = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + SUBFNode, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, Srdi, + S->getI32Imm(-1, dl)), 0); + } + case ISD::SETGT: { + // (sext (setcc %a, %b, setgt)) -> (ashr (sub %b, %a), 63) + // (sext (setcc %a, -1, setgt)) -> (ashr (~ %a), 31) + // (sext (setcc %a, 0, setgt)) -> (ashr (- %a), 63) + if (IsRHSNegOne) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GESExt); + if (IsRHSZero) { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = signExtendInputIfNeeded(LHS); + RHS = signExtendInputIfNeeded(RHS); + SDValue Neg = + SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, Neg, + S->getI64Imm(63, dl)), 0); + } + // Not a special case (i.e. RHS == 0 or RHS == -1). Handle (%a > %b) as + // (%b < %a) by swapping inputs and falling through. + std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1; + LLVM_FALLTHROUGH; } + case ISD::SETLT: { + // (sext (setcc %a, %b, setgt)) -> (ashr (sub %a, %b), 63) + // (sext (setcc %a, 1, setgt)) -> (add (lshr (- %a), 63), -1) + // (sext (setcc %a, 0, setgt)) -> (ashr %a, 31) + if (IsRHSOne) { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LESExt); + } + if (IsRHSZero) + return SDValue(CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, LHS, + S->getI32Imm(31, dl)), 0); - // Get the specified bit. - SDValue Tmp = - SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); - CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1, dl)); - return true; + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = signExtendInputIfNeeded(LHS); + RHS = signExtendInputIfNeeded(RHS); + SDValue SUBFNode = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, RHS, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, + SUBFNode, S->getI64Imm(63, dl)), 0); + } + case ISD::SETUGE: + // (sext (setcc %a, %b, setuge)) -> (add (lshr (sub %a, %b), 63), -1) + // (sext (setcc %a, %b, setule)) -> (add (lshr (sub %b, %a), 63), -1) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULE: { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. 
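The unsigned 32-bit orderings piggyback on 64-bit subtraction: once both inputs are zero-extended, a - b cannot wrap, so bit 63 of the difference is precisely a <u b. Standalone check:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Samples[] = {0u, 1u, 0x7fffffffu, 0x80000000u, 0xffffffffu};
  for (uint32_t a : Samples)
    for (uint32_t b : Samples) {
      // Both sides zero-extended: the difference fits 64 bits exactly.
      int64_t Diff = (int64_t)(uint64_t)a - (int64_t)(uint64_t)b;
      assert(((uint64_t)Diff >> 63) == (a < b ? 1u : 0u)); // zext(setult)
      assert((Diff >> 63) == (a < b ? -1 : 0));            // sext(setult)
    }
}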
+ LHS = zeroExtendInputIfNeeded(LHS); + RHS = zeroExtendInputIfNeeded(RHS); + SDValue Subtract = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, LHS, RHS), 0); + SDValue Shift = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Subtract, + S->getI32Imm(1, dl), S->getI32Imm(63,dl)), + 0); + return SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, Shift, + S->getI32Imm(-1, dl)), 0); + } + case ISD::SETUGT: + // (sext (setcc %a, %b, setugt)) -> (ashr (sub %b, %a), 63) + // (sext (setcc %a, %b, setugt)) -> (ashr (sub %a, %b), 63) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULT: { + if (CmpInGPR == ICGPR_NonExtIn) + return SDValue(); + // The upper 32-bits of the register can't be undefined for this sequence. + LHS = zeroExtendInputIfNeeded(LHS); + RHS = zeroExtendInputIfNeeded(RHS); + SDValue Subtract = + SDValue(CurDAG->getMachineNode(PPC::SUBF8, dl, MVT::i64, RHS, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, + Subtract, S->getI64Imm(63, dl)), 0); + } + } } -// Is this opcode a bitwise logical operation? -static bool isLogicOp(unsigned Opc) { - return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR; +/// Produces a zero-extended result of comparing two 64-bit values according to +/// the passed condition code. +SDValue +IntegerCompareEliminator::get64BitZExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + if (CmpInGPR == ICGPR_I32 || CmpInGPR == ICGPR_SextI32 || + CmpInGPR == ICGPR_ZextI32 || CmpInGPR == ICGPR_Sext) + return SDValue(); + bool IsRHSZero = RHSValue == 0; + bool IsRHSOne = RHSValue == 1; + bool IsRHSNegOne = RHSValue == -1LL; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // (zext (setcc %a, %b, seteq)) -> (lshr (ctlz (xor %a, %b)), 6) + // (zext (setcc %a, 0, seteq)) -> (lshr (ctlz %a), 6) + SDValue Xor = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZD, dl, MVT::i64, Xor), 0); + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Clz, + S->getI64Imm(58, dl), + S->getI64Imm(63, dl)), 0); + } + case ISD::SETNE: { + // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1) + // (zext (setcc %a, %b, setne)) -> (sube addc.reg, addc.reg, addc.CA) + // {addcz.reg, addcz.CA} = (addcarry %a, -1) + // (zext (setcc %a, 0, setne)) -> (sube addcz.reg, addcz.reg, addcz.CA) + SDValue Xor = IsRHSZero ? 
LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue AC = + SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue, + Xor, S->getI32Imm(~0U, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, AC, + Xor, AC.getValue(1)), 0); + } + case ISD::SETGE: { + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (zext (setcc %a, %b, setge)) -> + // (adde (lshr %b, 63), (ashr %a, 63), subc.CA) + // (zext (setcc %a, 0, setge)) -> (lshr (~ %a), 63) + if (IsRHSZero) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GEZExt); + std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + LLVM_FALLTHROUGH; + } + case ISD::SETLE: { + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (zext (setcc %a, %b, setge)) -> + // (adde (lshr %a, 63), (ashr %b, 63), subc.CA) + // (zext (setcc %a, 0, setge)) -> (lshr (or %a, (add %a, -1)), 63) + if (IsRHSZero) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LEZExt); + SDValue ShiftL = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, LHS, + S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + SDValue ShiftR = + SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, RHS, + S->getI64Imm(63, dl)), 0); + SDValue SubtractCarry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + LHS, RHS), 1); + return SDValue(CurDAG->getMachineNode(PPC::ADDE8, dl, MVT::i64, MVT::Glue, + ShiftR, ShiftL, SubtractCarry), 0); + } + case ISD::SETGT: { + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (zext (setcc %a, %b, setgt)) -> + // (xor (adde (lshr %a, 63), (ashr %b, 63), subc.CA), 1) + // (zext (setcc %a, 0, setgt)) -> (lshr (nor (add %a, -1), %a), 63) + if (IsRHSNegOne) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GEZExt); + if (IsRHSZero) { + SDValue Addi = + SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, LHS, + S->getI64Imm(~0ULL, dl)), 0); + SDValue Nor = + SDValue(CurDAG->getMachineNode(PPC::NOR8, dl, MVT::i64, Addi, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Nor, + S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + } + std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1; + LLVM_FALLTHROUGH; + } + case ISD::SETLT: { + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (zext (setcc %a, %b, setlt)) -> + // (xor (adde (lshr %b, 63), (ashr %a, 63), subc.CA), 1) + // (zext (setcc %a, 0, setlt)) -> (lshr %a, 63) + if (IsRHSOne) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LEZExt); + if (IsRHSZero) + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, LHS, + S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + SDValue SRADINode = + SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, + LHS, S->getI64Imm(63, dl)), 0); + SDValue SRDINode = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + RHS, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + SDValue SUBFC8Carry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + RHS, LHS), 1); + SDValue ADDE8Node = + SDValue(CurDAG->getMachineNode(PPC::ADDE8, dl, MVT::i64, MVT::Glue, + SRDINode, SRADINode, SUBFC8Carry), 0); + return SDValue(CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64, + ADDE8Node, S->getI64Imm(1, dl)), 0); + } + case ISD::SETUGE: + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (zext (setcc %a, %b, setuge)) -> 
(add (sube %b, %b, subc.CA), 1) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULE: { + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (zext (setcc %a, %b, setule)) -> (add (sube %a, %a, subc.CA), 1) + SDValue SUBFC8Carry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + LHS, RHS), 1); + SDValue SUBFE8Node = + SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, MVT::Glue, + LHS, LHS, SUBFC8Carry), 0); + return SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, + SUBFE8Node, S->getI64Imm(1, dl)), 0); + } + case ISD::SETUGT: + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (zext (setcc %a, %b, setugt)) -> -(sube %b, %b, subc.CA) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULT: { + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (zext (setcc %a, %b, setult)) -> -(sube %a, %a, subc.CA) + SDValue SubtractCarry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + RHS, LHS), 1); + SDValue ExtSub = + SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, + LHS, LHS, SubtractCarry), 0); + return SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, + ExtSub), 0); + } + } } -/// If this node is a sign/zero extension of an integer comparison, -/// it can usually be computed in GPR's rather than using comparison -/// instructions and ISEL. We only do this on 64-bit targets for now -/// as the code is specialized for 64-bit (it uses 64-bit instructions -/// and assumes 64-bit registers). -bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) { - if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64()) - return false; - assert((N->getOpcode() == ISD::ZERO_EXTEND || - N->getOpcode() == ISD::SIGN_EXTEND) && - "Expecting a zero/sign extend node!"); - - SDValue WideRes; - // If we are zero-extending the result of a logical operation on i1 - // values, we can keep the values in GPRs. - if (isLogicOp(N->getOperand(0).getOpcode()) && - N->getOperand(0).getValueType() == MVT::i1 && - N->getOpcode() == ISD::ZERO_EXTEND) - WideRes = computeLogicOpInGPR(N->getOperand(0)); - else if (N->getOperand(0).getOpcode() != ISD::SETCC) - return false; - else - WideRes = - getSETCCInGPR(N->getOperand(0), - N->getOpcode() == ISD::SIGN_EXTEND ? - SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig); - - if (!WideRes) - return false; - - SDLoc dl(N); - bool Inputs32Bit = N->getOperand(0).getOperand(0).getValueType() == MVT::i32; - bool Output32Bit = N->getValueType(0) == MVT::i32; - - NumSextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 1 : 0; - NumZextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 0 : 1; +/// Produces a sign-extended result of comparing two 64-bit values according to +/// the passed condition code. +SDValue +IntegerCompareEliminator::get64BitSExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + if (CmpInGPR == ICGPR_I32 || CmpInGPR == ICGPR_SextI32 || + CmpInGPR == ICGPR_ZextI32 || CmpInGPR == ICGPR_Zext) + return SDValue(); + bool IsRHSZero = RHSValue == 0; + bool IsRHSOne = RHSValue == 1; + bool IsRHSNegOne = RHSValue == -1LL; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1) + // (sext (setcc %a, %b, seteq)) -> (sube addc.reg, addc.reg, addc.CA) + // {addcz.reg, addcz.CA} = (addcarry %a, -1) + // (sext (setcc %a, 0, seteq)) -> (sube addcz.reg, addcz.reg, addcz.CA) + SDValue AddInput = IsRHSZero ? 
LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue Addic = + SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue, + AddInput, S->getI32Imm(~0U, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, Addic, + Addic, Addic.getValue(1)), 0); + } + case ISD::SETNE: { + // {subfc.reg, subfc.CA} = (subcarry 0, (xor %a, %b)) + // (sext (setcc %a, %b, setne)) -> (sube subfc.reg, subfc.reg, subfc.CA) + // {subfcz.reg, subfcz.CA} = (subcarry 0, %a) + // (sext (setcc %a, 0, setne)) -> (sube subfcz.reg, subfcz.reg, subfcz.CA) + SDValue Xor = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue SC = + SDValue(CurDAG->getMachineNode(PPC::SUBFIC8, dl, MVT::i64, MVT::Glue, + Xor, S->getI32Imm(0, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, SC, + SC, SC.getValue(1)), 0); + } + case ISD::SETGE: { + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (sext (setcc %a, %b, setge)) -> + // (- (adde (lshr %b, 63), (ashr %a, 63), subc.CA)) + // (sext (setcc %a, 0, setge)) -> (~ (ashr %a, 63)) + if (IsRHSZero) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GESExt); + std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + LLVM_FALLTHROUGH; + } + case ISD::SETLE: { + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (sext (setcc %a, %b, setle)) -> + // (- (adde (lshr %a, 63), (ashr %b, 63), subc.CA)) + // (sext (setcc %a, 0, setle)) -> (ashr (or %a, (add %a, -1)), 63) + if (IsRHSZero) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LESExt); + SDValue ShiftR = + SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, RHS, + S->getI64Imm(63, dl)), 0); + SDValue ShiftL = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, LHS, + S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + SDValue SubtractCarry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + LHS, RHS), 1); + SDValue Adde = + SDValue(CurDAG->getMachineNode(PPC::ADDE8, dl, MVT::i64, MVT::Glue, + ShiftR, ShiftL, SubtractCarry), 0); + return SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, Adde), 0); + } + case ISD::SETGT: { + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (sext (setcc %a, %b, setgt)) -> + // -(xor (adde (lshr %a, 63), (ashr %b, 63), subc.CA), 1) + // (sext (setcc %a, 0, setgt)) -> (ashr (nor (add %a, -1), %a), 63) + if (IsRHSNegOne) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::GESExt); + if (IsRHSZero) { + SDValue Add = + SDValue(CurDAG->getMachineNode(PPC::ADDI8, dl, MVT::i64, LHS, + S->getI64Imm(-1, dl)), 0); + SDValue Nor = + SDValue(CurDAG->getMachineNode(PPC::NOR8, dl, MVT::i64, Add, LHS), 0); + return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, Nor, + S->getI64Imm(63, dl)), 0); + } + std::swap(LHS, RHS); + ConstantSDNode *RHSConst = dyn_cast(RHS); + IsRHSZero = RHSConst && RHSConst->isNullValue(); + IsRHSOne = RHSConst && RHSConst->getSExtValue() == 1; + LLVM_FALLTHROUGH; + } + case ISD::SETLT: { + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (sext (setcc %a, %b, setlt)) -> + // -(xor (adde (lshr %b, 63), (ashr %a, 63), subc.CA), 1) + // (sext (setcc %a, 0, setlt)) -> (ashr %a, 63) + if (IsRHSOne) + return getCompoundZeroComparisonInGPR(LHS, dl, ZeroCompare::LESExt); + if (IsRHSZero) { + return SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, LHS, + S->getI64Imm(63, dl)), 0); + } + SDValue SRADINode = + 
SDValue(CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, + LHS, S->getI64Imm(63, dl)), 0); + SDValue SRDINode = + SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, + RHS, S->getI64Imm(1, dl), + S->getI64Imm(63, dl)), 0); + SDValue SUBFC8Carry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + RHS, LHS), 1); + SDValue ADDE8Node = + SDValue(CurDAG->getMachineNode(PPC::ADDE8, dl, MVT::i64, + SRDINode, SRADINode, SUBFC8Carry), 0); + SDValue XORI8Node = + SDValue(CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64, + ADDE8Node, S->getI64Imm(1, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::NEG8, dl, MVT::i64, + XORI8Node), 0); + } + case ISD::SETUGE: + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (sext (setcc %a, %b, setuge)) -> ~(sube %b, %b, subc.CA) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULE: { + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (sext (setcc %a, %b, setule)) -> ~(sube %a, %a, subc.CA) + SDValue SubtractCarry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + LHS, RHS), 1); + SDValue ExtSub = + SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, MVT::Glue, LHS, + LHS, SubtractCarry), 0); + return SDValue(CurDAG->getMachineNode(PPC::NOR8, dl, MVT::i64, + ExtSub, ExtSub), 0); + } + case ISD::SETUGT: + // {subc.reg, subc.CA} = (subcarry %b, %a) + // (sext (setcc %a, %b, setugt)) -> (sube %b, %b, subc.CA) + std::swap(LHS, RHS); + LLVM_FALLTHROUGH; + case ISD::SETULT: { + // {subc.reg, subc.CA} = (subcarry %a, %b) + // (sext (setcc %a, %b, setult)) -> (sube %a, %a, subc.CA) + SDValue SubCarry = + SDValue(CurDAG->getMachineNode(PPC::SUBFC8, dl, MVT::i64, MVT::Glue, + RHS, LHS), 1); + return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, + LHS, LHS, SubCarry), 0); + } + } +} - SDValue ConvOp = WideRes; - if (Inputs32Bit != Output32Bit) - ConvOp = addExtOrTrunc(WideRes, Inputs32Bit ? ExtOrTruncConversion::Ext : - ExtOrTruncConversion::Trunc); - ReplaceNode(N, ConvOp.getNode()); +/// Do all uses of this SDValue need the result in a GPR? +/// This is meant to be used on values that have type i1 since +/// it is somewhat meaningless to ask if values of other types +/// should be kept in GPR's. +static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) { + assert(Compare.getOpcode() == ISD::SETCC && + "An ISD::SETCC node required here."); + // For values that have a single use, the caller should obviously already have + // checked if that use is an extending use. We check the other uses here. + if (Compare.hasOneUse()) + return true; + // We want the value in a GPR if it is being extended, used for a select, or + // used in logical operations. + for (auto CompareUse : Compare.getNode()->uses()) + if (CompareUse->getOpcode() != ISD::SIGN_EXTEND && + CompareUse->getOpcode() != ISD::ZERO_EXTEND && + CompareUse->getOpcode() != ISD::SELECT && + !isLogicOp(CompareUse->getOpcode())) { + OmittedForNonExtendUses++; + return false; + } return true; } -// Lower a logical operation on i1 values into a GPR sequence if possible. -// The result can be kept in a GPR if requested. -// Three types of inputs can be handled: -// - SETCC -// - TRUNCATE -// - Logical operation (AND/OR/XOR) -// There is also a special case that is handled (namely a complement operation -// achieved with xor %a, -1). 
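A note on the carry tricks above: subfe RT,RA,RB computes ~RA + RB + CA, so with RA == RB it collapses to -1 + CA, and each sign-extended compare reduces to materializing the right carry bit. The identities can be sanity-checked with a small standalone C++20 program; the value table and helper-free formulation below are illustrative assumptions, not taken from the patch:

  #include <cassert>
  #include <cstdint>

  int main() {
    const int64_t vals[] = {-5, -1, 0, 1, 3, INT64_MIN, INT64_MAX};
    for (int64_t a : vals)
      for (int64_t b : vals) {
        uint64_t x = (uint64_t)a ^ (uint64_t)b;
        // addic x, -1 sets CA iff x + 0xFF..F carries, i.e. iff x != 0;
        // subfe(r, r, CA) == -1 + CA then models (sext (seteq %a, %b)).
        assert((-1 + (int64_t)(x != 0)) == (a == b ? -1 : 0));
        // subfic x, 0 computes 0 - x; CA (no borrow) holds iff x == 0,
        // which models (sext (setne %a, %b)).
        assert((-1 + (int64_t)(x == 0)) == (a != b ? -1 : 0));
        // subfc computing a - b sets CA iff a >= b unsigned, which
        // models (sext (setult %a, %b)).
        assert((-1 + (int64_t)((uint64_t)a >= (uint64_t)b)) ==
               ((uint64_t)a < (uint64_t)b ? -1 : 0));
      }
  }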
-SDValue PPCDAGToDAGISel::computeLogicOpInGPR(SDValue LogicOp) { - assert(isLogicOp(LogicOp.getOpcode()) && - "Can only handle logic operations here."); - assert(LogicOp.getValueType() == MVT::i1 && - "Can only handle logic operations on i1 values here."); - SDLoc dl(LogicOp); - SDValue LHS, RHS; - - // Special case: xor %a, -1 - bool IsBitwiseNegation = isBitwiseNot(LogicOp); +/// Returns an equivalent of a SETCC node but with the result the same width as +/// the inputs. This can also be used for SELECT_CC if either the true or false +/// value is a power of two while the other is zero. +SDValue IntegerCompareEliminator::getSETCCInGPR(SDValue Compare, + SetccInGPROpts ConvOpts) { + assert((Compare.getOpcode() == ISD::SETCC || + Compare.getOpcode() == ISD::SELECT_CC) && + "An ISD::SETCC node required here."); - // Produces a GPR sequence for each operand of the binary logic operation. - // For SETCC, it produces the respective comparison, for TRUNCATE it truncates - // the value in a GPR and for logic operations, it will recursively produce - // a GPR sequence for the operation. - auto getLogicOperand = [&] (SDValue Operand) -> SDValue { - unsigned OperandOpcode = Operand.getOpcode(); - if (OperandOpcode == ISD::SETCC) - return getSETCCInGPR(Operand, SetccInGPROpts::ZExtOrig); - else if (OperandOpcode == ISD::TRUNCATE) { - SDValue InputOp = Operand.getOperand(0); - EVT InVT = InputOp.getValueType(); - return - SDValue(CurDAG->getMachineNode(InVT == MVT::i32 ? PPC::RLDICL_32 : - PPC::RLDICL, dl, InVT, InputOp, - getI64Imm(0, dl), getI64Imm(63, dl)), 0); - } else if (isLogicOp(OperandOpcode)) - return computeLogicOpInGPR(Operand); + // Don't convert this comparison to a GPR sequence because there are uses + // of the i1 result (i.e. uses that require the result in the CR). + if ((Compare.getOpcode() == ISD::SETCC) && !allUsesExtend(Compare, CurDAG)) return SDValue(); - }; - LHS = getLogicOperand(LogicOp.getOperand(0)); - RHS = getLogicOperand(LogicOp.getOperand(1)); - // If a GPR sequence can't be produced for the LHS we can't proceed. - // Not producing a GPR sequence for the RHS is only a problem if this isn't - // a bitwise negation operation. - if (!LHS || (!RHS && !IsBitwiseNegation)) + SDValue LHS = Compare.getOperand(0); + SDValue RHS = Compare.getOperand(1); + + // The condition code is operand 2 for SETCC and operand 4 for SELECT_CC. + int CCOpNum = Compare.getOpcode() == ISD::SELECT_CC ? 4 : 2; + ISD::CondCode CC = + cast(Compare.getOperand(CCOpNum))->get(); + EVT InputVT = LHS.getValueType(); + if (InputVT != MVT::i32 && InputVT != MVT::i64) return SDValue(); - NumLogicOpsOnComparison++; + if (ConvOpts == SetccInGPROpts::ZExtInvert || + ConvOpts == SetccInGPROpts::SExtInvert) + CC = ISD::getSetCCInverse(CC, true); - // We will use the inputs as 64-bit values. - if (LHS.getValueType() == MVT::i32) - LHS = addExtOrTrunc(LHS, ExtOrTruncConversion::Ext); - if (!IsBitwiseNegation && RHS.getValueType() == MVT::i32) - RHS = addExtOrTrunc(RHS, ExtOrTruncConversion::Ext); + bool Inputs32Bit = InputVT == MVT::i32; - unsigned NewOpc; - switch (LogicOp.getOpcode()) { - default: llvm_unreachable("Unknown logic operation."); - case ISD::AND: NewOpc = PPC::AND8; break; - case ISD::OR: NewOpc = PPC::OR8; break; - case ISD::XOR: NewOpc = PPC::XOR8; break; - } + SDLoc dl(Compare); + ConstantSDNode *RHSConst = dyn_cast(RHS); + int64_t RHSValue = RHSConst ? 
RHSConst->getSExtValue() : INT64_MAX; + bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig || + ConvOpts == SetccInGPROpts::SExtInvert; - if (IsBitwiseNegation) { - RHS = getI64Imm(1, dl); - NewOpc = PPC::XORI8; - } + if (IsSext && Inputs32Bit) + return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl); + else if (Inputs32Bit) + return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl); + else if (IsSext) + return get64BitSExtCompare(LHS, RHS, CC, RHSValue, dl); + return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl); +} - return SDValue(CurDAG->getMachineNode(NewOpc, dl, MVT::i64, LHS, RHS), 0); +} // end anonymous namespace -} +bool PPCDAGToDAGISel::tryIntCompareInGPR(SDNode *N) { + if (N->getValueType(0) != MVT::i32 && + N->getValueType(0) != MVT::i64) + return false; -/// Try performing logical operations on results of comparisons in GPRs. -/// It is typically preferred from a performance perspective over performing -/// the operations on individual bits in the CR. We only do this on 64-bit -/// targets for now as the code is specialized for 64-bit (it uses 64-bit -/// instructions and assumes 64-bit registers). -bool PPCDAGToDAGISel::tryLogicOpOfCompares(SDNode *N) { + // This optimization will emit code that assumes 64-bit registers + // so we don't want to run it in 32-bit mode. Also don't run it + // on functions that are not to be optimized. if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64()) return false; - if (N->getValueType(0) != MVT::i1) - return false; - assert(isLogicOp(N->getOpcode()) && - "Expected a logic operation on setcc results."); - SDValue LoweredLogical = computeLogicOpInGPR(SDValue(N, 0)); - if (!LoweredLogical) - return false; - - SDLoc dl(N); - bool IsBitwiseNegate = LoweredLogical.getMachineOpcode() == PPC::XORI8; - unsigned SubRegToExtract = IsBitwiseNegate ? PPC::sub_eq : PPC::sub_gt; - SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32); - SDValue LHS = LoweredLogical.getOperand(0); - SDValue RHS = LoweredLogical.getOperand(1); - SDValue WideOp; - SDValue OpToConvToRecForm; - - // Look through any 32-bit to 64-bit implicit extend nodes to find the opcode - // that is input to the XORI. - if (IsBitwiseNegate && - LoweredLogical.getOperand(0).getMachineOpcode() == PPC::INSERT_SUBREG) - OpToConvToRecForm = LoweredLogical.getOperand(0).getOperand(1); - else if (IsBitwiseNegate) - // If the input to the XORI isn't an extension, that's what we're after. - OpToConvToRecForm = LoweredLogical.getOperand(0); - else - // If this is not an XORI, it is a reg-reg logical op and we can convert it - // to record-form. - OpToConvToRecForm = LoweredLogical; - - // Get the record-form version of the node we're looking to use to get the - // CR result from. - uint16_t NonRecOpc = OpToConvToRecForm.getMachineOpcode(); - int NewOpc = PPCInstrInfo::getRecordFormOpcode(NonRecOpc); - - // Convert the right node to record-form. This is either the logical we're - // looking at or it is the input node to the negation (if we're looking at - // a bitwise negation). - if (NewOpc != -1 && IsBitwiseNegate) { - // The input to the XORI has a record-form. Use it. - assert(LoweredLogical.getConstantOperandVal(1) == 1 && - "Expected a PPC::XORI8 only for bitwise negation."); - // Emit the record-form instruction. 
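The zero-extended forms dispatched just above rest on equally mechanical identities: cntlzd yields 64 only for a zero input (so a logical shift right by 6 produces the seteq bit), and a signed compare against zero is just the sign bit. A minimal C++20 sketch, with std::countl_zero standing in for cntlzd and an arbitrary value table as an assumption:

  #include <bit>
  #include <cassert>
  #include <cstdint>

  int main() {
    const int64_t vals[] = {-7, -1, 0, 1, 42, INT64_MIN, INT64_MAX};
    for (int64_t a : vals)
      for (int64_t b : vals) {
        uint64_t x = (uint64_t)a ^ (uint64_t)b;
        // (zext (setcc %a, %b, seteq)) -> (lshr (ctlz (xor %a, %b)), 6):
        // only x == 0 gives a count of 64, and 64 >> 6 == 1.
        assert((std::countl_zero(x) >> 6) == (a == b ? 1 : 0));
        // (zext (setcc %a, 0, setlt)) -> (lshr %a, 63): the sign bit.
        assert(((uint64_t)a >> 63) == (uint64_t)(a < 0));
      }
  }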
- std::vector Ops; - for (int i = 0, e = OpToConvToRecForm.getNumOperands(); i < e; i++) - Ops.push_back(OpToConvToRecForm.getOperand(i)); - WideOp = - SDValue(CurDAG->getMachineNode(NewOpc, dl, - OpToConvToRecForm.getValueType(), - MVT::Glue, Ops), 0); - } else { - assert((NewOpc != -1 || !IsBitwiseNegate) && - "No record form available for AND8/OR8/XOR8?"); - WideOp = - SDValue(CurDAG->getMachineNode(NewOpc == -1 ? PPC::ANDIo8 : NewOpc, dl, - MVT::i64, MVT::Glue, LHS, RHS), 0); + switch (N->getOpcode()) { + default: break; + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + IntegerCompareEliminator ICmpElim(CurDAG, this); + if (SDNode *New = ICmpElim.Select(N)) { + ReplaceNode(N, New); + return true; + } } - - // Select this node to a single bit from CR0 set by the record-form node - // just created. For bitwise negation, use the EQ bit which is the equivalent - // of negating the result (i.e. it is a bit set when the result of the - // operation is zero). - SDValue SRIdxVal = - CurDAG->getTargetConstant(SubRegToExtract, dl, MVT::i32); - SDValue CRBit = - SDValue(CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, - MVT::i1, CR0Reg, SRIdxVal, - WideOp.getValue(1)), 0); - ReplaceNode(N, CRBit.getNode()); - return true; + } + return false; } -/// If the value isn't guaranteed to be sign-extended to 64-bits, extend it. -/// Useful when emitting comparison code for 32-bit values without using -/// the compare instruction (which only considers the lower 32-bits). -SDValue PPCDAGToDAGISel::signExtendInputIfNeeded(SDValue Input) { - assert(Input.getValueType() == MVT::i32 && - "Can only sign-extend 32-bit values here."); - unsigned Opc = Input.getOpcode(); - - // The value was sign extended and then truncated to 32-bits. No need to - // sign extend it again. - if (Opc == ISD::TRUNCATE && - (Input.getOperand(0).getOpcode() == ISD::AssertSext || - Input.getOperand(0).getOpcode() == ISD::SIGN_EXTEND)) - return Input; +bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) { + if (N->getValueType(0) != MVT::i32 && + N->getValueType(0) != MVT::i64) + return false; - LoadSDNode *InputLoad = dyn_cast(Input); - // The input is a sign-extending load. No reason to sign-extend. - if (InputLoad && InputLoad->getExtensionType() == ISD::SEXTLOAD) - return Input; + if (!UseBitPermRewriter) + return false; - ConstantSDNode *InputConst = dyn_cast(Input); - // We don't sign-extend constants and already sign-extended values. - if (InputConst || Opc == ISD::AssertSext || Opc == ISD::SIGN_EXTEND_INREG || - Opc == ISD::SIGN_EXTEND) - return Input; + switch (N->getOpcode()) { + default: break; + case ISD::ROTL: + case ISD::SHL: + case ISD::SRL: + case ISD::AND: + case ISD::OR: { + BitPermutationSelector BPS(CurDAG); + if (SDNode *New = BPS.Select(N)) { + ReplaceNode(N, New); + return true; + } + return false; + } + } - SDLoc dl(Input); - SignExtensionsAdded++; - return SDValue(CurDAG->getMachineNode(PPC::EXTSW_32, dl, MVT::i32, Input), 0); + return false; } -/// If the value isn't guaranteed to be zero-extended to 64-bits, extend it. -/// Useful when emitting comparison code for 32-bit values without using -/// the compare instruction (which only considers the lower 32-bits). 
-SDValue PPCDAGToDAGISel::zeroExtendInputIfNeeded(SDValue Input) { - assert(Input.getValueType() == MVT::i32 && - "Can only zero-extend 32-bit values here."); - LoadSDNode *InputLoad = dyn_cast(Input); - unsigned Opc = Input.getOpcode(); - - // No need to zero-extend loaded values (unless they're loaded with - // a sign-extending load). - if (InputLoad && InputLoad->getExtensionType() != ISD::SEXTLOAD) - return Input; - - ConstantSDNode *InputConst = dyn_cast(Input); - bool InputZExtConst = InputConst && InputConst->getSExtValue() >= 0; - // An ISD::TRUNCATE will be lowered to an EXTRACT_SUBREG so we have - // to conservatively actually clear the high bits. We also don't need to - // zero-extend constants or values that are already zero-extended. - if (InputZExtConst || Opc == ISD::AssertZext || Opc == ISD::ZERO_EXTEND) - return Input; +/// SelectCC - Select a comparison of the specified values with the specified +/// condition code, returning the CR# of the expression. +SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, + const SDLoc &dl) { + // Always select the LHS. + unsigned Opc; - SDLoc dl(Input); - ZeroExtensionsAdded++; - return SDValue(CurDAG->getMachineNode(PPC::RLDICL_32, dl, MVT::i32, Input, - getI64Imm(0, dl), getI64Imm(32, dl)), - 0); -} + if (LHS.getValueType() == MVT::i32) { + unsigned Imm; + if (CC == ISD::SETEQ || CC == ISD::SETNE) { + if (isInt32Immediate(RHS, Imm)) { + // SETEQ/SETNE comparison with 16-bit immediate, fold it. + if (isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, + getI32Imm(Imm & 0xFFFF, dl)), + 0); + // If this is a 16-bit signed immediate, fold it. + if (isInt<16>((int)Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, + getI32Imm(Imm & 0xFFFF, dl)), + 0); -// Handle a 32-bit value in a 64-bit register and vice-versa. These are of -// course not actual zero/sign extensions that will generate machine code, -// they're just a way to reinterpret a 32 bit value in a register as a -// 64 bit value and vice-versa. -SDValue PPCDAGToDAGISel::addExtOrTrunc(SDValue NatWidthRes, - ExtOrTruncConversion Conv) { - SDLoc dl(NatWidthRes); + // For non-equality comparisons, the default code would materialize the + // constant, then compare against it, like this: + // lis r2, 4660 + // ori r2, r2, 22136 + // cmpw cr0, r3, r2 + // Since we are just comparing for equality, we can emit this instead: + // xoris r0,r3,0x1234 + // cmplwi cr0,r0,0x5678 + // beq cr0,L6 + SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS, + getI32Imm(Imm >> 16, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor, + getI32Imm(Imm & 0xFFFF, dl)), 0); + } + Opc = PPC::CMPLW; + } else if (ISD::isUnsignedIntSetCC(CC)) { + if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, + getI32Imm(Imm & 0xFFFF, dl)), 0); + Opc = PPC::CMPLW; + } else { + int16_t SImm; + if (isIntS16Immediate(RHS, SImm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, + getI32Imm((int)SImm & 0xFFFF, + dl)), + 0); + Opc = PPC::CMPW; + } + } else if (LHS.getValueType() == MVT::i64) { + uint64_t Imm; + if (CC == ISD::SETEQ || CC == ISD::SETNE) { + if (isInt64Immediate(RHS.getNode(), Imm)) { + // SETEQ/SETNE comparison with 16-bit immediate, fold it. 
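The xoris/cmplwi rewrite in SelectCC works because XOR with the high half is invertible: a 32-bit value equals the constant exactly when flipping the constant's high 16 bits leaves exactly the low 16 bits behind. A quick standalone C++ check; the immediates and test values are arbitrary assumptions:

  #include <cassert>
  #include <cstdint>
  #include <initializer_list>

  int main() {
    for (uint32_t imm : {0x12345678u, 0xFFFF0000u, 0x00001234u}) {
      uint32_t hi = imm >> 16, lo = imm & 0xFFFF;
      for (uint32_t r : {imm, imm ^ 1u, imm ^ 0x10000u, 0u, 0xFFFFFFFFu}) {
        // xoris r0,r3,hi ; cmplwi cr0,r0,lo  versus a full 32-bit compare.
        assert(((r ^ (hi << 16)) == lo) == (r == imm));
      }
    }
  }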
+ if (isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, + getI32Imm(Imm & 0xFFFF, dl)), + 0); + // If this is a 16-bit signed immediate, fold it. + if (isInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, + getI32Imm(Imm & 0xFFFF, dl)), + 0); - // For reinterpreting 32-bit values as 64 bit values, we generate - // INSERT_SUBREG IMPLICIT_DEF:i64, , TargetConstant:i32<1> - if (Conv == ExtOrTruncConversion::Ext) { - SDValue ImDef(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, MVT::i64), 0); - SDValue SubRegIdx = - CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32); - return SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, MVT::i64, - ImDef, NatWidthRes, SubRegIdx), 0); + // For non-equality comparisons, the default code would materialize the + // constant, then compare against it, like this: + // lis r2, 4660 + // ori r2, r2, 22136 + // cmpd cr0, r3, r2 + // Since we are just comparing for equality, we can emit this instead: + // xoris r0,r3,0x1234 + // cmpldi cr0,r0,0x5678 + // beq cr0,L6 + if (isUInt<32>(Imm)) { + SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS, + getI64Imm(Imm >> 16, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor, + getI64Imm(Imm & 0xFFFF, dl)), + 0); + } + } + Opc = PPC::CMPLD; + } else if (ISD::isUnsignedIntSetCC(CC)) { + if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, + getI64Imm(Imm & 0xFFFF, dl)), 0); + Opc = PPC::CMPLD; + } else { + int16_t SImm; + if (isIntS16Immediate(RHS, SImm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, + getI64Imm(SImm & 0xFFFF, dl)), + 0); + Opc = PPC::CMPD; + } + } else if (LHS.getValueType() == MVT::f32) { + Opc = PPC::FCMPUS; + } else { + assert(LHS.getValueType() == MVT::f64 && "Unknown vt!"); + Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD; } - - assert(Conv == ExtOrTruncConversion::Trunc && - "Unknown convertion between 32 and 64 bit values."); - // For reinterpreting 64-bit values as 32-bit values, we just need to - // EXTRACT_SUBREG (i.e. extract the low word). - SDValue SubRegIdx = - CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32); - return SDValue(CurDAG->getMachineNode(PPC::EXTRACT_SUBREG, dl, MVT::i32, - NatWidthRes, SubRegIdx), 0); + return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0); } -/// Produces a zero-extended result of comparing two 32-bit values according to -/// the passed condition code. -SDValue PPCDAGToDAGISel::get32BitZExtCompare(SDValue LHS, SDValue RHS, - ISD::CondCode CC, - int64_t RHSValue, SDLoc dl) { - bool IsRHSZero = RHSValue == 0; +static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) { switch (CC) { - default: return SDValue(); - case ISD::SETEQ: { - // (zext (setcc %a, %b, seteq)) -> (lshr (cntlzw (xor %a, %b)), 5) - // (zext (setcc %a, 0, seteq)) -> (lshr (cntlzw %a), 5) - SDValue Xor = IsRHSZero ? 
LHS : - SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); - SDValue Clz = - SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); - SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl), - getI32Imm(31, dl) }; - return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, - ShiftOps), 0); - } - case ISD::SETNE: { - // (zext (setcc %a, %b, setne)) -> (xor (lshr (cntlzw (xor %a, %b)), 5), 1) - // (zext (setcc %a, 0, setne)) -> (xor (lshr (cntlzw %a), 5), 1) - SDValue Xor = IsRHSZero ? LHS : - SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); - SDValue Clz = - SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); - SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl), - getI32Imm(31, dl) }; - SDValue Shift = - SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0); - return SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift, - getI32Imm(1, dl)), 0); - } + case ISD::SETUEQ: + case ISD::SETONE: + case ISD::SETOLE: + case ISD::SETOGE: + llvm_unreachable("Should be lowered by legalize!"); + default: llvm_unreachable("Unknown condition!"); + case ISD::SETOEQ: + case ISD::SETEQ: return PPC::PRED_EQ; + case ISD::SETUNE: + case ISD::SETNE: return PPC::PRED_NE; + case ISD::SETOLT: + case ISD::SETLT: return PPC::PRED_LT; + case ISD::SETULE: + case ISD::SETLE: return PPC::PRED_LE; + case ISD::SETOGT: + case ISD::SETGT: return PPC::PRED_GT; + case ISD::SETUGE: + case ISD::SETGE: return PPC::PRED_GE; + case ISD::SETO: return PPC::PRED_NU; + case ISD::SETUO: return PPC::PRED_UN; + // These two are invalid for floating point. Assume we have int. + case ISD::SETULT: return PPC::PRED_LT; + case ISD::SETUGT: return PPC::PRED_GT; } } -/// Produces a sign-extended result of comparing two 32-bit values according to -/// the passed condition code. -SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS, - ISD::CondCode CC, - int64_t RHSValue, SDLoc dl) { - bool IsRHSZero = RHSValue == 0; +/// getCRIdxForSetCC - Return the index of the condition register field +/// associated with the SetCC condition, and whether or not the field is +/// treated as inverted. That is, lt = 0; ge = 0 inverted. +static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) { + Invert = false; switch (CC) { - default: return SDValue(); - case ISD::SETEQ: { - // (sext (setcc %a, %b, seteq)) -> - // (ashr (shl (ctlz (xor %a, %b)), 58), 63) - // (sext (setcc %a, 0, seteq)) -> - // (ashr (shl (ctlz %a), 58), 63) - SDValue CountInput = IsRHSZero ? LHS : - SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); - SDValue Cntlzw = - SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, CountInput), 0); - SDValue SHLOps[] = { Cntlzw, getI32Imm(58, dl), getI32Imm(0, dl) }; - SDValue Sldi = - SDValue(CurDAG->getMachineNode(PPC::RLDICR_32, dl, MVT::i32, SHLOps), 0); - return SDValue(CurDAG->getMachineNode(PPC::SRADI_32, dl, MVT::i32, Sldi, - getI32Imm(63, dl)), 0); - } - case ISD::SETNE: { - // Bitwise xor the operands, count leading zeros, shift right by 5 bits and - // flip the bit, finally take 2's complement. - // (sext (setcc %a, %b, setne)) -> - // (neg (xor (lshr (ctlz (xor %a, %b)), 5), 1)) - // Same as above, but the first xor is not needed. - // (sext (setcc %a, 0, setne)) -> - // (neg (xor (lshr (ctlz %a), 5), 1)) - SDValue Xor = IsRHSZero ? 
LHS : - SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0); - SDValue Clz = - SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0); - SDValue ShiftOps[] = - { Clz, getI32Imm(27, dl), getI32Imm(5, dl), getI32Imm(31, dl) }; - SDValue Shift = - SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0); - SDValue Xori = - SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift, - getI32Imm(1, dl)), 0); - return SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Xori), 0); + default: llvm_unreachable("Unknown condition!"); + case ISD::SETOLT: + case ISD::SETLT: return 0; // Bit #0 = SETOLT + case ISD::SETOGT: + case ISD::SETGT: return 1; // Bit #1 = SETOGT + case ISD::SETOEQ: + case ISD::SETEQ: return 2; // Bit #2 = SETOEQ + case ISD::SETUO: return 3; // Bit #3 = SETUO + case ISD::SETUGE: + case ISD::SETGE: Invert = true; return 0; // !Bit #0 = SETUGE + case ISD::SETULE: + case ISD::SETLE: Invert = true; return 1; // !Bit #1 = SETULE + case ISD::SETUNE: + case ISD::SETNE: Invert = true; return 2; // !Bit #2 = SETUNE + case ISD::SETO: Invert = true; return 3; // !Bit #3 = SETO + case ISD::SETUEQ: + case ISD::SETOGE: + case ISD::SETOLE: + case ISD::SETONE: + llvm_unreachable("Invalid branch code: should be expanded by legalize"); + // These are invalid for floating point. Assume integer. + case ISD::SETULT: return 0; + case ISD::SETUGT: return 1; } +} + +// getVCmpInst: return the vector compare instruction for the specified +// vector type and condition code. Since this is for altivec specific code, +// only support the altivec types (v16i8, v8i16, v4i32, v2i64, and v4f32). +static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC, + bool HasVSX, bool &Swap, bool &Negate) { + Swap = false; + Negate = false; + + if (VecVT.isFloatingPoint()) { + /* Handle some cases by swapping input operands. */ + switch (CC) { + case ISD::SETLE: CC = ISD::SETGE; Swap = true; break; + case ISD::SETLT: CC = ISD::SETGT; Swap = true; break; + case ISD::SETOLE: CC = ISD::SETOGE; Swap = true; break; + case ISD::SETOLT: CC = ISD::SETOGT; Swap = true; break; + case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break; + case ISD::SETUGT: CC = ISD::SETULT; Swap = true; break; + default: break; + } + /* Handle some cases by negating the result. */ + switch (CC) { + case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break; + case ISD::SETUNE: CC = ISD::SETOEQ; Negate = true; break; + case ISD::SETULE: CC = ISD::SETOGT; Negate = true; break; + case ISD::SETULT: CC = ISD::SETOGE; Negate = true; break; + default: break; + } + /* We have instructions implementing the remaining cases. */ + switch (CC) { + case ISD::SETEQ: + case ISD::SETOEQ: + if (VecVT == MVT::v4f32) + return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP; + else if (VecVT == MVT::v2f64) + return PPC::XVCMPEQDP; + break; + case ISD::SETGT: + case ISD::SETOGT: + if (VecVT == MVT::v4f32) + return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP; + else if (VecVT == MVT::v2f64) + return PPC::XVCMPGTDP; + break; + case ISD::SETGE: + case ISD::SETOGE: + if (VecVT == MVT::v4f32) + return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP; + else if (VecVT == MVT::v2f64) + return PPC::XVCMPGEDP; + break; + default: + break; + } + llvm_unreachable("Invalid floating-point vector compare condition"); + } else { + /* Handle some cases by swapping input operands. 
*/ + switch (CC) { + case ISD::SETGE: CC = ISD::SETLE; Swap = true; break; + case ISD::SETLT: CC = ISD::SETGT; Swap = true; break; + case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break; + case ISD::SETULT: CC = ISD::SETUGT; Swap = true; break; + default: break; + } + /* Handle some cases by negating the result. */ + switch (CC) { + case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break; + case ISD::SETUNE: CC = ISD::SETUEQ; Negate = true; break; + case ISD::SETLE: CC = ISD::SETGT; Negate = true; break; + case ISD::SETULE: CC = ISD::SETUGT; Negate = true; break; + default: break; + } + /* We have instructions implementing the remaining cases. */ + switch (CC) { + case ISD::SETEQ: + case ISD::SETUEQ: + if (VecVT == MVT::v16i8) + return PPC::VCMPEQUB; + else if (VecVT == MVT::v8i16) + return PPC::VCMPEQUH; + else if (VecVT == MVT::v4i32) + return PPC::VCMPEQUW; + else if (VecVT == MVT::v2i64) + return PPC::VCMPEQUD; + break; + case ISD::SETGT: + if (VecVT == MVT::v16i8) + return PPC::VCMPGTSB; + else if (VecVT == MVT::v8i16) + return PPC::VCMPGTSH; + else if (VecVT == MVT::v4i32) + return PPC::VCMPGTSW; + else if (VecVT == MVT::v2i64) + return PPC::VCMPGTSD; + break; + case ISD::SETUGT: + if (VecVT == MVT::v16i8) + return PPC::VCMPGTUB; + else if (VecVT == MVT::v8i16) + return PPC::VCMPGTUH; + else if (VecVT == MVT::v4i32) + return PPC::VCMPGTUW; + else if (VecVT == MVT::v2i64) + return PPC::VCMPGTUD; + break; + default: + break; + } + llvm_unreachable("Invalid integer vector compare condition"); } } -/// Produces a zero-extended result of comparing two 64-bit values according to -/// the passed condition code. -SDValue PPCDAGToDAGISel::get64BitZExtCompare(SDValue LHS, SDValue RHS, - ISD::CondCode CC, - int64_t RHSValue, SDLoc dl) { - bool IsRHSZero = RHSValue == 0; - switch (CC) { - default: return SDValue(); - case ISD::SETEQ: { - // (zext (setcc %a, %b, seteq)) -> (lshr (ctlz (xor %a, %b)), 6) - // (zext (setcc %a, 0, seteq)) -> (lshr (ctlz %a), 6) - SDValue Xor = IsRHSZero ? LHS : - SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); - SDValue Clz = - SDValue(CurDAG->getMachineNode(PPC::CNTLZD, dl, MVT::i64, Xor), 0); - return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Clz, - getI64Imm(58, dl), getI64Imm(63, dl)), - 0); - } - } -} +bool PPCDAGToDAGISel::trySETCC(SDNode *N) { + SDLoc dl(N); + unsigned Imm; + ISD::CondCode CC = cast(N->getOperand(2))->get(); + EVT PtrVT = + CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout()); + bool isPPC64 = (PtrVT == MVT::i64); -/// Produces a sign-extended result of comparing two 64-bit values according to -/// the passed condition code. -SDValue PPCDAGToDAGISel::get64BitSExtCompare(SDValue LHS, SDValue RHS, - ISD::CondCode CC, - int64_t RHSValue, SDLoc dl) { - bool IsRHSZero = RHSValue == 0; - switch (CC) { - default: return SDValue(); - case ISD::SETEQ: { - // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1) - // (sext (setcc %a, %b, seteq)) -> (sube addc.reg, addc.reg, addc.CA) - // {addcz.reg, addcz.CA} = (addcarry %a, -1) - // (sext (setcc %a, 0, seteq)) -> (sube addcz.reg, addcz.reg, addcz.CA) - SDValue AddInput = IsRHSZero ? 
LHS : - SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); - SDValue Addic = - SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue, - AddInput, getI32Imm(~0U, dl)), 0); - return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, Addic, - Addic, Addic.getValue(1)), 0); - } + if (!PPCSubTarget->useCRBits() && + isInt32Immediate(N->getOperand(1), Imm)) { + // We can codegen setcc op, imm very efficiently compared to a brcond. + // Check for those cases here. + // setcc op, 0 + if (Imm == 0) { + SDValue Op = N->getOperand(0); + switch (CC) { + default: break; + case ISD::SETEQ: { + Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0); + SDValue Ops[] = { Op, getI32Imm(27, dl), getI32Imm(5, dl), + getI32Imm(31, dl) }; + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + return true; + } + case ISD::SETNE: { + if (isPPC64) break; + SDValue AD = + SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, + Op, getI32Imm(~0U, dl)), 0); + CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op, AD.getValue(1)); + return true; + } + case ISD::SETLT: { + SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl), + getI32Imm(31, dl) }; + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + return true; + } + case ISD::SETGT: { + SDValue T = + SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0); + T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0); + SDValue Ops[] = { T, getI32Imm(1, dl), getI32Imm(31, dl), + getI32Imm(31, dl) }; + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + return true; + } + } + } else if (Imm == ~0U) { // setcc op, -1 + SDValue Op = N->getOperand(0); + switch (CC) { + default: break; + case ISD::SETEQ: + if (isPPC64) break; + Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, + Op, getI32Imm(1, dl)), 0); + CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, + SDValue(CurDAG->getMachineNode(PPC::LI, dl, + MVT::i32, + getI32Imm(0, dl)), + 0), Op.getValue(1)); + return true; + case ISD::SETNE: { + if (isPPC64) break; + Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0); + SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, + Op, getI32Imm(~0U, dl)); + CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0), Op, + SDValue(AD, 1)); + return true; + } + case ISD::SETLT: { + SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op, + getI32Imm(1, dl)), 0); + SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD, + Op), 0); + SDValue Ops[] = { AN, getI32Imm(1, dl), getI32Imm(31, dl), + getI32Imm(31, dl) }; + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + return true; + } + case ISD::SETGT: { + SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl), + getI32Imm(31, dl) }; + Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); + CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op, getI32Imm(1, dl)); + return true; + } + } + } } -} -/// Does this SDValue have any uses for which keeping the value in a GPR is -/// appropriate. This is meant to be used on values that have type i1 since -/// it is somewhat meaningless to ask if values of other types can be kept in -/// GPR's. 
-static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) { - assert(Compare.getOpcode() == ISD::SETCC && - "An ISD::SETCC node required here."); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); - // For values that have a single use, the caller should obviously already have - // checked if that use is an extending use. We check the other uses here. - if (Compare.hasOneUse()) - return true; - // We want the value in a GPR if it is being extended, used for a select, or - // used in logical operations. - for (auto CompareUse : Compare.getNode()->uses()) - if (CompareUse->getOpcode() != ISD::SIGN_EXTEND && - CompareUse->getOpcode() != ISD::ZERO_EXTEND && - CompareUse->getOpcode() != ISD::SELECT && - !isLogicOp(CompareUse->getOpcode())) { - OmittedForNonExtendUses++; + // Altivec Vector compare instructions do not set any CR register by default and + // vector compare operations return the same type as the operands. + if (LHS.getValueType().isVector()) { + if (PPCSubTarget->hasQPX()) return false; + + EVT VecVT = LHS.getValueType(); + bool Swap, Negate; + unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC, + PPCSubTarget->hasVSX(), Swap, Negate); + if (Swap) + std::swap(LHS, RHS); + + EVT ResVT = VecVT.changeVectorElementTypeToInteger(); + if (Negate) { + SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0); + CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR, + ResVT, VCmp, VCmp); + return true; } - return true; -} -/// Returns an equivalent of a SETCC node but with the result the same width as -/// the inputs. This can nalso be used for SELECT_CC if either the true or false -/// values is a power of two while the other is zero. -SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare, - SetccInGPROpts ConvOpts) { - assert((Compare.getOpcode() == ISD::SETCC || - Compare.getOpcode() == ISD::SELECT_CC) && - "An ISD::SETCC node required here."); + CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS); + return true; + } - // Don't convert this comparison to a GPR sequence because there are uses - // of the i1 result (i.e. uses that require the result in the CR). - if ((Compare.getOpcode() == ISD::SETCC) && !allUsesExtend(Compare, CurDAG)) - return SDValue(); + if (PPCSubTarget->useCRBits()) + return false; - SDValue LHS = Compare.getOperand(0); - SDValue RHS = Compare.getOperand(1); + bool Inv; + unsigned Idx = getCRIdxForSetCC(CC, Inv); + SDValue CCReg = SelectCC(LHS, RHS, CC, dl); + SDValue IntCR; - // The condition code is operand 2 for SETCC and operand 4 for SELECT_CC. - int CCOpNum = Compare.getOpcode() == ISD::SELECT_CC ? 4 : 2; - ISD::CondCode CC = - cast(Compare.getOperand(CCOpNum))->get(); - EVT InputVT = LHS.getValueType(); - if (InputVT != MVT::i32 && InputVT != MVT::i64) - return SDValue(); + // Force the ccreg into CR7. + SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32); - if (ConvOpts == SetccInGPROpts::ZExtInvert || - ConvOpts == SetccInGPROpts::SExtInvert) - CC = ISD::getSetCCInverse(CC, true); + SDValue InFlag(nullptr, 0); // Null incoming flag value. 
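For the non-CR-bit path that follows, mfocrf with CR7 leaves the four field bits (LT, GT, EQ, SO) in the low nibble of the GPR, and rlwinm with MB = ME = 31 keeps only the least significant bit after rotating left by (32 - (3 - Idx)) & 31. A sketch of that arithmetic; rlwinm_bit is a hypothetical helper modelling just this rotate-and-mask form:

  #include <cassert>
  #include <cstdint>

  // Models rlwinm RT,RS,SH,31,31: rotate left by SH, then keep only the
  // least significant bit (big-endian bit 31).
  static uint32_t rlwinm_bit(uint32_t rs, unsigned sh) {
    uint32_t rot = (rs << sh) | (rs >> ((32 - sh) & 31));
    return rot & 1;
  }

  int main() {
    for (uint32_t field = 0; field < 16; ++field)   // any CR7 contents
      for (unsigned idx = 0; idx < 4; ++idx) {      // lt, gt, eq, so
        unsigned sh = (32 - (3 - idx)) & 31;
        // Rotating left by 32-(3-idx) is rotating right by 3-idx, which
        // brings big-endian CR bit idx down to the least significant bit.
        assert(rlwinm_bit(field, sh) == ((field >> (3 - idx)) & 1));
      }
  }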
+ CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg, + InFlag).getValue(1); - bool Inputs32Bit = InputVT == MVT::i32; - if (ISD::isSignedIntSetCC(CC) && Inputs32Bit) { - LHS = signExtendInputIfNeeded(LHS); - RHS = signExtendInputIfNeeded(RHS); - } else if (ISD::isUnsignedIntSetCC(CC) && Inputs32Bit) { - LHS = zeroExtendInputIfNeeded(LHS); - RHS = zeroExtendInputIfNeeded(RHS); - } + IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg, + CCReg), 0); - SDLoc dl(Compare); - ConstantSDNode *RHSConst = dyn_cast(RHS); - int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX; - bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig || - ConvOpts == SetccInGPROpts::SExtInvert; + SDValue Ops[] = { IntCR, getI32Imm((32 - (3 - Idx)) & 31, dl), + getI32Imm(31, dl), getI32Imm(31, dl) }; + if (!Inv) { + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + return true; + } - if (IsSext && Inputs32Bit) - return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl); - else if (Inputs32Bit) - return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl); - else if (IsSext) - return get64BitSExtCompare(LHS, RHS, CC, RHSValue, dl); - return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl); + // Get the specified bit. + SDValue Tmp = + SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); + CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1, dl)); + return true; } /// Does this node represent a load/store node whose address can be represented @@ -3016,8 +3831,18 @@ bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const { AddrOp = STN->getOperand(2); short Imm = 0; - if (AddrOp.getOpcode() == ISD::ADD) + if (AddrOp.getOpcode() == ISD::ADD) { + // If op0 is a frame index that is under aligned, we can't do it either, + // because it is translated to r31 or r1 + slot + offset. We won't know the + // slot number until the stack frame is finalized. + if (FrameIndexSDNode *FI = dyn_cast(AddrOp.getOperand(0))) { + const MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo(); + unsigned SlotAlign = MFI.getObjectAlignment(FI->getIndex()); + if ((SlotAlign % Val) != 0) + return false; + } return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val); + } // If the address comes from the outside, the offset will be zero. return AddrOp.getOpcode() == ISD::CopyFromReg; @@ -3050,22 +3875,20 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (tryBitPermutation(N)) return; + // Try to emit integer compares as GPR-only sequences (i.e. no use of CR). + if (tryIntCompareInGPR(N)) + return; + switch (N->getOpcode()) { default: break; case ISD::Constant: if (N->getValueType(0) == MVT::i64) { - ReplaceNode(N, getInt64(CurDAG, N)); + ReplaceNode(N, selectI64Imm(CurDAG, N)); return; } break; - case ISD::ZERO_EXTEND: - case ISD::SIGN_EXTEND: - if (tryEXTEND(N)) - return; - break; - case ISD::SETCC: if (trySETCC(N)) return; @@ -3209,9 +4032,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } case ISD::AND: { - if (tryLogicOpOfCompares(N)) - return; - unsigned Imm, Imm2, SH, MB, ME; uint64_t Imm64; @@ -3331,9 +4151,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (tryBitfieldInsert(N)) return; - if (tryLogicOpOfCompares(N)) - return; - int16_t Imm; if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && isIntS16Immediate(N->getOperand(1), Imm)) { @@ -3348,12 +4165,48 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } } + // OR with a 32-bit immediate can be handled by ori + oris + // without creating an immediate in a GPR. 
+ uint64_t Imm64 = 0; + bool IsPPC64 = PPCSubTarget->isPPC64(); + if (IsPPC64 && isInt64Immediate(N->getOperand(1), Imm64) && + (Imm64 & ~0xFFFFFFFFuLL) == 0) { + // If ImmHi (ImmLo) is zero, only one ori (oris) is generated later. + uint64_t ImmHi = Imm64 >> 16; + uint64_t ImmLo = Imm64 & 0xFFFF; + if (ImmHi != 0 && ImmLo != 0) { + SDNode *Lo = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, + N->getOperand(0), + getI16Imm(ImmLo, dl)); + SDValue Ops1[] = { SDValue(Lo, 0), getI16Imm(ImmHi, dl)}; + CurDAG->SelectNodeTo(N, PPC::ORIS8, MVT::i64, Ops1); + return; + } + } + + // Other cases are autogenerated. break; } case ISD::XOR: { - if (tryLogicOpOfCompares(N)) - return; + // XOR with a 32-bit immediate can be handled by xori + xoris + // without creating an immediate in a GPR. + uint64_t Imm64 = 0; + bool IsPPC64 = PPCSubTarget->isPPC64(); + if (IsPPC64 && isInt64Immediate(N->getOperand(1), Imm64) && + (Imm64 & ~0xFFFFFFFFuLL) == 0) { + // If ImmHi (ImmLo) is zero, only one xori (xoris) is generated later. + uint64_t ImmHi = Imm64 >> 16; + uint64_t ImmLo = Imm64 & 0xFFFF; + if (ImmHi != 0 && ImmLo != 0) { + SDNode *Lo = CurDAG->getMachineNode(PPC::XORI8, dl, MVT::i64, + N->getOperand(0), + getI16Imm(ImmLo, dl)); + SDValue Ops1[] = { SDValue(Lo, 0), getI16Imm(ImmHi, dl)}; + CurDAG->SelectNodeTo(N, PPC::XORIS8, MVT::i64, Ops1); + return; + } + } + break; } case ISD::ADD: { @@ -3666,9 +4519,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) { // The first source operand is a TargetGlobalAddress or a TargetJumpTable. // If it must be toc-referenced according to PPCSubTarget, we generate: - // LDtocL(, ADDIStocHA(%X2, )) + // LDtocL(@sym, ADDIStocHA(%x2, @sym)) // Otherwise we generate: - // ADDItocL(ADDIStocHA(%X2, ), ) + // ADDItocL(ADDIStocHA(%x2, @sym), @sym) SDValue GA = N->getOperand(0); SDValue TOCbase = N->getOperand(1); SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64, diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index b3a3c73f6df03..18e567fa589c7 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -51,6 +51,9 @@ #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" @@ -82,11 +85,8 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetRegisterInfo.h" #include #include #include @@ -114,6 +114,8 @@ cl::desc("disable sibling call optimization on ppc"), cl::Hidden); STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); +static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int); + // FIXME: Remove this once the bug has been fixed! 
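The ori+oris (and xori+xoris) split above works because each instruction takes a 16-bit immediate and OR/XOR both distribute over the two disjoint halves of a 32-bit mask, so no scratch register is needed. A standalone C++ check; the register and immediate tables are arbitrary assumptions:

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t regs[] = {0, 1, 0x123456789ABCDEF0ull, ~0ull};
    const uint64_t imms[] = {0x00010001, 0xDEADBEEF, 0xFFFFFFFF};
    for (uint64_t r : regs)
      for (uint64_t imm : imms) {
        assert((imm & ~0xFFFFFFFFull) == 0);   // the guard from the patch
        uint64_t lo = imm & 0xFFFF, hi = imm >> 16;
        // ori rT,rS,lo ; oris rT,rT,hi
        assert(((r | lo) | (hi << 16)) == (r | imm));
        // xori rT,rS,lo ; xoris rT,rT,hi
        assert(((r ^ lo) ^ (hi << 16)) == (r ^ imm));
      }
  }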
extern cl::opt ANDIGlueBug; @@ -226,6 +228,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::UREM, MVT::i64, Expand); } + if (Subtarget.hasP9Vector()) { + setOperationAction(ISD::ABS, MVT::v4i32, Legal); + setOperationAction(ISD::ABS, MVT::v8i16, Legal); + setOperationAction(ISD::ABS, MVT::v16i8, Legal); + } + // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM. setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); @@ -283,14 +291,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FROUND, MVT::f32, Legal); } - // PowerPC does not have BSWAP + // PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd + // to speed up scalar BSWAP64. // CTPOP or CTTZ were introduced in P8/P9 respectivelly setOperationAction(ISD::BSWAP, MVT::i32 , Expand); - setOperationAction(ISD::BSWAP, MVT::i64 , Expand); if (Subtarget.isISA3_0()) { + setOperationAction(ISD::BSWAP, MVT::i64 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Legal); setOperationAction(ISD::CTTZ , MVT::i64 , Legal); } else { + setOperationAction(ISD::BSWAP, MVT::i64 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i64 , Expand); } @@ -773,6 +783,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SRL, MVT::v1i128, Legal); setOperationAction(ISD::SRA, MVT::v1i128, Expand); } + + if (Subtarget.hasP9Altivec()) { + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); + } } if (Subtarget.hasQPX()) { @@ -1131,7 +1146,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP"; case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; - case PPCISD::XXINSERT: return "PPCISD::XXINSERT"; + case PPCISD::VECINSERT: return "PPCISD::VECINSERT"; case PPCISD::XXREVERSE: return "PPCISD::XXREVERSE"; case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; case PPCISD::VECSHL: return "PPCISD::VECSHL"; @@ -2413,8 +2428,8 @@ static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit, SDValue Ops[] = { GA, Reg }; return DAG.getMemIntrinsicNode( PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true, - false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, + MachineMemOperand::MOLoad); } SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, @@ -2470,7 +2485,6 @@ SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table, return TargetLowering::getPICJumpTableRelocBase(Table, DAG); switch (getTargetMachine().getCodeModel()) { - case CodeModel::Default: case CodeModel::Small: case CodeModel::Medium: return TargetLowering::getPICJumpTableRelocBase(Table, DAG); @@ -2488,7 +2502,6 @@ PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF, return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); switch (getTargetMachine().getCodeModel()) { - case CodeModel::Default: case CodeModel::Small: case CodeModel::Medium: return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); @@ -2560,7 +2573,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); bool is64bit = Subtarget.isPPC64(); - 
const Module *M = DAG.getMachineFunction().getFunction()->getParent(); + const Module *M = DAG.getMachineFunction().getFunction().getParent(); PICLevel::Level picLevel = M->getPICLevel(); TLSModel::Model Model = getTargetMachine().getTLSModel(GV); @@ -3529,7 +3542,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; unsigned &QFPR_idx = FPR_idx; SmallVector MemOps; - Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); + Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; @@ -3614,6 +3627,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + FuncInfo->addLiveInAttr(VReg, Flags); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store; @@ -3648,6 +3662,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( break; unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + FuncInfo->addLiveInAttr(VReg, Flags); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Addr = FIN; if (j) { @@ -3684,6 +3699,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( // types to avoid forcing arguments to memory unnecessarily. if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + FuncInfo->addLiveInAttr(VReg, Flags); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) @@ -3729,6 +3745,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( // since otherwise we never run out of FPRs before running out // of GPRs. unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + FuncInfo->addLiveInAttr(VReg, Flags); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::f32) { @@ -3969,7 +3986,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_Darwin( SmallVector MemOps; unsigned nAltivecParamsAtEnd = 0; - Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); + Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; @@ -4251,13 +4268,25 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, static bool isFunctionGlobalAddress(SDValue Callee); static bool -resideInSameSection(const Function *Caller, SDValue Callee, +callsShareTOCBase(const Function *Caller, SDValue Callee, const TargetMachine &TM) { // If !G, Callee can be an external symbol. GlobalAddressSDNode *G = dyn_cast(Callee); if (!G) return false; + // The medium and large code models are expected to provide a sufficiently + // large TOC to satisfy all data addressing needs of a module with a + // single TOC. Since each module will be addressed with a single TOC, we + // only need to check that caller and callee don't cross dso boundaries. + if (CodeModel::Medium == TM.getCodeModel() || + CodeModel::Large == TM.getCodeModel()) + return TM.shouldAssumeDSOLocal(*Caller->getParent(), G->getGlobal()); + + // Otherwise we need to ensure callee and caller are in the same section, + // since the linker may allocate multiple TOCs, and we don't know which + // sections will belong to the same TOC base. 
+ const GlobalValue *GV = G->getGlobal(); if (!GV->isStrongDefinitionForLinker()) return false; @@ -4335,12 +4364,12 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget, } static bool -hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) { - if (CS->arg_size() != CallerFn->arg_size()) +hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) { + if (CS.arg_size() != CallerFn->arg_size()) return false; - ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin(); - ImmutableCallSite::arg_iterator CalleeArgEnd = CS->arg_end(); + ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin(); + ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end(); Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin(); for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) { @@ -4363,11 +4392,25 @@ hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) { return true; } +// Returns true if TCO is possible between the callers and callees +// calling conventions. +static bool +areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC, + CallingConv::ID CalleeCC) { + // Tail or Sibling call optimization (TCO/SCO) needs callee and caller to + // have the same calling convention. + if (CallerCC != CalleeCC) + return false; + + // Tail or Sibling calls can be done with fastcc/ccc. + return (CallerCC == CallingConv::Fast || CallerCC == CallingConv::C); +} + bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( SDValue Callee, CallingConv::ID CalleeCC, - ImmutableCallSite *CS, + ImmutableCallSite CS, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &Ins, @@ -4379,15 +4422,9 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( // Variadic argument functions are not supported. if (isVarArg) return false; - MachineFunction &MF = DAG.getMachineFunction(); - CallingConv::ID CallerCC = MF.getFunction()->getCallingConv(); - - // Tail or Sibling call optimization (TCO/SCO) needs callee and caller has - // the same calling convention - if (CallerCC != CalleeCC) return false; - - // SCO support C calling convention - if (CalleeCC != CallingConv::Fast && CalleeCC != CallingConv::C) + auto &Caller = DAG.getMachineFunction().getFunction(); + // Check that the calling conventions are compatible for tco. + if (!areCallingConvEligibleForTCO_64SVR4(Caller.getCallingConv(), CalleeCC)) return false; // Caller contains any byval parameter is not supported. @@ -4406,11 +4443,10 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( !isa(Callee)) return false; - // Check if Callee resides in the same section, because for now, PPC64 SVR4 - // ABI (ELFv1/ELFv2) doesn't allow tail calls to a symbol resides in another - // section. + // If the caller and callee potentially have different TOC bases then we + // cannot tail call since we need to restore the TOC pointer after the call. // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977 - if (!resideInSameSection(MF.getFunction(), Callee, getTargetMachine())) + if (!callsShareTOCBase(&Caller, Callee, getTargetMachine())) return false; // TCO allows altering callee ABI, so we don't have to check further. @@ -4422,7 +4458,7 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( // If callee use the same argument list that caller is using, then we can // apply SCO on this case. If it is not, then we need to check if callee needs // stack for passing arguments. 
- if (!hasSameArgumentList(MF.getFunction(), CS) && + if (!hasSameArgumentList(&Caller, CS) && needStackSlotPassParameters(Subtarget, Outs)) { return false; } @@ -4447,7 +4483,7 @@ PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; MachineFunction &MF = DAG.getMachineFunction(); - CallingConv::ID CallerCC = MF.getFunction()->getCallingConv(); + CallingConv::ID CallerCC = MF.getFunction().getCallingConv(); if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) { // Functions containing by val parameters are not supported. for (unsigned i = 0; i != Ins.size(); i++) { @@ -4676,7 +4712,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, bool isPatchPoint, bool hasNest, SmallVectorImpl> &RegsToPass, SmallVectorImpl &Ops, std::vector &NodeTys, - ImmutableCallSite *CS, const PPCSubtarget &Subtarget) { + ImmutableCallSite CS, const PPCSubtarget &Subtarget) { bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); bool isELFv2ABI = Subtarget.isELFv2ABI(); @@ -4699,7 +4735,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, // we're building with the leopard linker or later, which automatically // synthesizes these stubs. const TargetMachine &TM = DAG.getTarget(); - const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); + const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); const GlobalValue *GV = nullptr; if (auto *G = dyn_cast(Callee)) GV = G->getGlobal(); @@ -4787,7 +4823,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, MachineMemOperand::MOInvariant) : MachineMemOperand::MONone; - MachinePointerInfo MPI(CS ? CS->getCalledValue() : nullptr); + MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr); SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI, /* Alignment = */ 8, MMOFlags); @@ -4917,7 +4953,7 @@ SDValue PPCTargetLowering::FinishCall( SmallVector, 8> &RegsToPass, SDValue InFlag, SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff, unsigned NumBytes, const SmallVectorImpl &Ins, - SmallVectorImpl &InVals, ImmutableCallSite *CS) const { + SmallVectorImpl &InVals, ImmutableCallSite CS) const { std::vector NodeTys; SmallVector Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl, @@ -4992,7 +5028,7 @@ SDValue PPCTargetLowering::FinishCall( // any other variadic arguments). Ops.insert(std::next(Ops.begin()), AddTOC); } else if (CallOpc == PPCISD::CALL && - !resideInSameSection(MF.getFunction(), Callee, DAG.getTarget())) { + !callsShareTOCBase(&MF.getFunction(), Callee, DAG.getTarget())) { // Otherwise insert NOP for non-local calls. 
CallOpc = PPCISD::CALL_NOP; } @@ -5025,10 +5061,10 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; bool isPatchPoint = CLI.IsPatchPoint; - ImmutableCallSite *CS = CLI.CS; + ImmutableCallSite CS = CLI.CS; if (isTailCall) { - if (Subtarget.useLongCalls() && !(CS && CS->isMustTailCall())) + if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall())) isTailCall = false; else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) isTailCall = @@ -5056,7 +5092,7 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } - if (!isTailCall && CS && CS->isMustTailCall()) + if (!isTailCall && CS && CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); @@ -5090,7 +5126,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, - ImmutableCallSite *CS) const { + ImmutableCallSite CS) const { // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description // of the 32-bit SVR4 ABI stack frame layout. @@ -5324,7 +5360,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, - ImmutableCallSite *CS) const { + ImmutableCallSite CS) const { bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); @@ -5974,7 +6010,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, - ImmutableCallSite *CS) const { + ImmutableCallSite CS) const { unsigned NumOps = Outs.size(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -7448,9 +7484,11 @@ static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT, /// - The node is a "load-and-splat" /// In all other cases, we will choose to keep the BUILD_VECTOR. static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V, - bool HasDirectMove) { + bool HasDirectMove, + bool HasP8Vector) { EVT VecVT = V->getValueType(0); - bool RightType = VecVT == MVT::v2f64 || VecVT == MVT::v4f32 || + bool RightType = VecVT == MVT::v2f64 || + (HasP8Vector && VecVT == MVT::v4f32) || (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32)); if (!RightType) return false; @@ -7612,7 +7650,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // lowered to VSX instructions under certain conditions. // Without VSX, there is no pattern more efficient than expanding the node. if (Subtarget.hasVSX() && - haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove())) + haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(), + Subtarget.hasP8Vector())) return Op; return SDValue(); } @@ -7646,6 +7685,15 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, return DAG.getBitcast(Op.getValueType(), NewBV); return NewBV; } + + // BuildVectorSDNode::isConstantSplat() is actually pretty smart. It'll + // detect that constant splats like v8i16: 0xABAB are really just splats + // of a 1-byte constant. In this case, we need to convert the node to a + // splat of v16i8 and a bitcast. 
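+  // (Concrete example of the case handled below: BUILD_VECTOR v8i16
+  // <0xABAB x 8> has the same byte pattern as BUILD_VECTOR v16i8 <0xAB x 16>,
+  // so with SplatBits == 0xAB the v16i8 splat plus a bitcast back to the
+  // original type is byte-for-byte equivalent.)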
+  if (Op.getValueType() != MVT::v16i8)
+    return DAG.getBitcast(Op.getValueType(),
+                          DAG.getConstant(SplatBits, dl, MVT::v16i8));
+  return Op;
 }
 
@@ -7855,6 +7903,219 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
   return DAG.getNode(ISD::BITCAST, dl, VT, T);
 }
 
+/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
+/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
+/// SDValue.
+SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
+                                           SelectionDAG &DAG) const {
+  const unsigned BytesInVector = 16;
+  bool IsLE = Subtarget.isLittleEndian();
+  SDLoc dl(N);
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+  unsigned ShiftElts = 0, InsertAtByte = 0;
+  bool Swap = false;
+
+  // Shifts required to get the byte we want at element 7.
+  unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,
+                                   0, 15, 14, 13, 12, 11, 10, 9};
+  unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
+                                1, 2, 3, 4, 5, 6, 7, 8};
+
+  ArrayRef<int> Mask = N->getMask();
+  int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+  // For each mask element, find out if we're just inserting something
+  // from V2 into V1 or vice versa.
+  // Possible permutations inserting an element from V2 into V1:
+  //   X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  //   0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  //   ...
+  //   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
+  // Inserting from V1 into V2 will be similar, except mask range will be
+  // [16,31].
+
+  bool FoundCandidate = false;
+  // If both vector operands for the shuffle are the same vector, the mask
+  // will contain only elements from the first one and the second one will be
+  // undef.
+  unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
+  // Go through the mask of bytes to find an element that's being moved
+  // from one vector to the other.
+  for (unsigned i = 0; i < BytesInVector; ++i) {
+    unsigned CurrentElement = Mask[i];
+    // If 2nd operand is undefined, we should only look for element 7 in the
+    // Mask.
+    if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
+      continue;
+
+    bool OtherElementsInOrder = true;
+    // Examine the other elements in the Mask to see if they're in original
+    // order.
+    for (unsigned j = 0; j < BytesInVector; ++j) {
+      if (j == i)
+        continue;
+      // If CurrentElement is from V1 [0,15], then we expect the rest of the
+      // Mask to be from V2 [16,31] and vice versa. Unless the 2nd operand is
+      // undefined, in which case we always assume we're picking from the
+      // 1st operand.
+      int MaskOffset =
+          (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
+      if (Mask[j] != OriginalOrder[j] + MaskOffset) {
+        OtherElementsInOrder = false;
+        break;
+      }
+    }
+    // If other elements are in original order, we record the number of shifts
+    // we need to get the element we want into element 7. Also record which
+    // byte in the vector we should insert into.
+    if (OtherElementsInOrder) {
+      // If 2nd operand is undefined, we assume no shifts and no swapping.
+      if (V2.isUndef()) {
+        ShiftElts = 0;
+        Swap = false;
+      } else {
+        // Only need the last 4 bits for shifts because operands will be
+        // swapped if CurrentElement is >= 2^4.
+        ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
                         : BigEndianShifts[CurrentElement & 0xF];
+        Swap = CurrentElement < BytesInVector;
+      }
+      InsertAtByte = IsLE ?
BytesInVector - (i + 1) : i; + FoundCandidate = true; + break; + } + } + + if (!FoundCandidate) + return SDValue(); + + // Candidate found, construct the proper SDAG sequence with VINSERTB, + // optionally with VECSHL if shift is required. + if (Swap) + std::swap(V1, V2); + if (V2.isUndef()) + V2 = V1; + if (ShiftElts) { + SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2, + DAG.getConstant(ShiftElts, dl, MVT::i32)); + return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); + } + return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); +} + +/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled +/// by the VINSERTH instruction introduced in ISA 3.0, else just return default +/// SDValue. +SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N, + SelectionDAG &DAG) const { + const unsigned NumHalfWords = 8; + const unsigned BytesInVector = NumHalfWords * 2; + // Check that the shuffle is on half-words. + if (!isNByteElemShuffleMask(N, 2, 1)) + return SDValue(); + + bool IsLE = Subtarget.isLittleEndian(); + SDLoc dl(N); + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + unsigned ShiftElts = 0, InsertAtByte = 0; + bool Swap = false; + + // Shifts required to get the half-word we want at element 3. + unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5}; + unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4}; + + uint32_t Mask = 0; + uint32_t OriginalOrderLow = 0x1234567; + uint32_t OriginalOrderHigh = 0x89ABCDEF; + // Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a + // 32-bit space, only need 4-bit nibbles per element. + for (unsigned i = 0; i < NumHalfWords; ++i) { + unsigned MaskShift = (NumHalfWords - 1 - i) * 4; + Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift); + } + + // For each mask element, find out if we're just inserting something + // from V2 into V1 or vice versa. Possible permutations inserting an element + // from V2 into V1: + // X, 1, 2, 3, 4, 5, 6, 7 + // 0, X, 2, 3, 4, 5, 6, 7 + // 0, 1, X, 3, 4, 5, 6, 7 + // 0, 1, 2, X, 4, 5, 6, 7 + // 0, 1, 2, 3, X, 5, 6, 7 + // 0, 1, 2, 3, 4, X, 6, 7 + // 0, 1, 2, 3, 4, 5, X, 7 + // 0, 1, 2, 3, 4, 5, 6, X + // Inserting from V1 into V2 will be similar, except mask range will be [8,15]. + + bool FoundCandidate = false; + // Go through the mask of half-words to find an element that's being moved + // from one vector to the other. + for (unsigned i = 0; i < NumHalfWords; ++i) { + unsigned MaskShift = (NumHalfWords - 1 - i) * 4; + uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF; + uint32_t MaskOtherElts = ~(0xF << MaskShift); + uint32_t TargetOrder = 0x0; + + // If both vector operands for the shuffle are the same vector, the mask + // will contain only elements from the first one and the second one will be + // undef. + if (V2.isUndef()) { + ShiftElts = 0; + unsigned VINSERTHSrcElem = IsLE ? 4 : 3; + TargetOrder = OriginalOrderLow; + Swap = false; + // Skip if not the correct element or mask of other elements don't equal + // to our expected order. + if (MaskOneElt == VINSERTHSrcElem && + (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) { + InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2; + FoundCandidate = true; + break; + } + } else { // If both operands are defined. + // Target order is [8,15] if the current mask is between [0,7]. + TargetOrder = + (MaskOneElt < NumHalfWords) ? 
OriginalOrderHigh : OriginalOrderLow; + // Skip if mask of other elements don't equal our expected order. + if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) { + // We only need the last 3 bits for the number of shifts. + ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7] + : BigEndianShifts[MaskOneElt & 0x7]; + InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2; + Swap = MaskOneElt < NumHalfWords; + FoundCandidate = true; + break; + } + } + } + + if (!FoundCandidate) + return SDValue(); + + // Candidate found, construct the proper SDAG sequence with VINSERTH, + // optionally with VECSHL if shift is required. + if (Swap) + std::swap(V1, V2); + if (V2.isUndef()) + V2 = V1; + SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); + if (ShiftElts) { + // Double ShiftElts because we're left shifting on v16i8 type. + SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2, + DAG.getConstant(2 * ShiftElts, dl, MVT::i32)); + SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl); + SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); + } + SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); + SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); +} + /// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this /// is a shuffle we can handle in a single instruction, return it. Otherwise, /// return the code it can be lowered into. Worst case, it can always be @@ -7869,7 +8130,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, bool isLittleEndian = Subtarget.isLittleEndian(); unsigned ShiftElts, InsertAtByte; - bool Swap; + bool Swap = false; if (Subtarget.hasP9Vector() && PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, isLittleEndian)) { @@ -7880,15 +8141,23 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (ShiftElts) { SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2, DAG.getConstant(ShiftElts, dl, MVT::i32)); - SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Shl, + SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } - SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Conv2, + SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } + if (Subtarget.hasP9Altivec()) { + SDValue NewISDNode; + if ((NewISDNode = lowerToVINSERTH(SVOp, DAG))) + return NewISDNode; + + if ((NewISDNode = lowerToVINSERTB(SVOp, DAG))) + return NewISDNode; + } if (Subtarget.hasVSX() && PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { @@ -8390,6 +8659,8 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); + SDLoc dl(Op); + if (IntrinsicID == Intrinsic::thread_pointer) { // Reads the thread pointer register, used for __builtin_thread_pointer. if (Subtarget.isPPC64()) @@ -8397,9 +8668,37 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getRegister(PPC::R2, MVT::i32); } + // We are looking for absolute values here. 
+ // The idea is to try to fit one of two patterns: + // max (a, (0-a)) OR max ((0-a), a) + if (Subtarget.hasP9Vector() && + (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw || + IntrinsicID == Intrinsic::ppc_altivec_vmaxsh || + IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) { + SDValue V1 = Op.getOperand(1); + SDValue V2 = Op.getOperand(2); + if (V1.getSimpleValueType() == V2.getSimpleValueType() && + (V1.getSimpleValueType() == MVT::v4i32 || + V1.getSimpleValueType() == MVT::v8i16 || + V1.getSimpleValueType() == MVT::v16i8)) { + if ( V1.getOpcode() == ISD::SUB && + ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) && + V1.getOperand(1) == V2 ) { + // Generate the abs instruction with the operands + return DAG.getNode(ISD::ABS, dl, V2.getValueType(),V2); + } + + if ( V2.getOpcode() == ISD::SUB && + ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) && + V2.getOperand(1) == V1 ) { + // Generate the abs instruction with the operands + return DAG.getNode(ISD::ABS, dl, V1.getValueType(),V1); + } + } + } + // If this is a lowered altivec predicate compare, CompareOpc is set to the // opcode number of the comparison. - SDLoc dl(Op); int CompareOpc; bool isDot; if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget)) @@ -8495,6 +8794,23 @@ SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const { return Op; } +// Lower scalar BSWAP64 to xxbrd. +SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + // MTVSRDD + Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0), + Op.getOperand(0)); + // XXBRD + Op = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Op); + // MFVSRD + int VectorIndex = 0; + if (Subtarget.isLittleEndian()) + VectorIndex = 1; + Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op, + DAG.getTargetConstant(VectorIndex, dl, MVT::i32)); + return Op; +} + SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -8539,11 +8855,29 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Should only be called for ISD::INSERT_VECTOR_ELT"); + ConstantSDNode *C = dyn_cast(Op.getOperand(2)); // We have legal lowering for constant indices but not for variable ones. - if (C) - return Op; - return SDValue(); + if (!C) + return SDValue(); + + EVT VT = Op.getValueType(); + SDLoc dl(Op); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types. + if (VT == MVT::v8i16 || VT == MVT::v16i8) { + SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2); + unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8; + unsigned InsertAtElement = C->getZExtValue(); + unsigned InsertAtByte = InsertAtElement * BytesInEachElement; + if (Subtarget.isLittleEndian()) { + InsertAtByte = (16 - BytesInEachElement) - InsertAtByte; + } + return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); + } + return Op; } SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, @@ -8966,6 +9300,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SREM: case ISD::UREM: return LowerREM(Op, DAG); + case ISD::BSWAP: + return LowerBSWAP(Op, DAG); } } @@ -9461,7 +9797,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, // Naked functions never have a base pointer, and so we use r1. 
For all
   // other functions, this decision must be delayed until during PEI.
   unsigned BaseReg;
-  if (MF->getFunction()->hasFnAttribute(Attribute::Naked))
+  if (MF->getFunction().hasFnAttribute(Attribute::Naked))
     BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
   else
     BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
@@ -11887,9 +12223,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                     cast<StoreSDNode>(N)->getMemOperand());
     }
 
+    // STORE Constant:i32<0> -> STORE Constant:i64<0>
+    // This can increase the chance of CSE for constant construction.
+    EVT VT = N->getOperand(1).getValueType();
+    if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
+        isa<ConstantSDNode>(N->getOperand(1)) && VT == MVT::i32) {
+      // Need to sign-extend to 64 bits to handle negative values.
+      EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();
+      uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),
+                                    MemVT.getSizeInBits());
+      SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64);
+
+      // DAG.getTruncStore() can't be used here because it doesn't accept
+      // the general (base + offset) addressing mode.
+      // So we use UpdateNodeOperands and setTruncatingStore instead.
+      DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2),
+                             N->getOperand(3));
+      cast<StoreSDNode>(N)->setTruncatingStore(true);
+      return SDValue(N, 0);
+    }
+
     // For little endian, VSX stores require generating xxswapd/lxvd2x.
     // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
-    EVT VT = N->getOperand(1).getValueType();
     if (VT.isSimple()) {
       MVT StoreVT = VT.getSimpleVT();
       if (Subtarget.needsSwapsForVSXMemOps() &&
@@ -12690,6 +13045,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
         return std::make_pair(0U, &PPC::QSRCRegClass);
       if (Subtarget.hasAltivec())
         return std::make_pair(0U, &PPC::VRRCRegClass);
+      break;
     case 'y':   // crrc
       return std::make_pair(0U, &PPC::CRRCRegClass);
     }
@@ -12810,7 +13166,7 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
 // by AM is legal for this target, for a load/store of the specified type.
 bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                               const AddrMode &AM, Type *Ty,
-                                              unsigned AS) const {
+                                              unsigned AS, Instruction *I) const {
   // PPC does not allow r+i addressing modes for vectors!
   if (Ty->isVectorTy() && AM.BaseOffs != 0)
     return false;
@@ -12895,7 +13251,7 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
   // Naked functions never have a frame pointer, and so we use r1. For all
   // other functions, this decision must be delayed until during PEI.
   unsigned FrameReg;
-  if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+  if (MF.getFunction().hasFnAttribute(Attribute::Naked))
     FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
   else
     FrameReg = isPPC64 ?
PPC::FP8 : PPC::FP; @@ -12940,6 +13296,7 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + MachineFunction &MF, unsigned Intrinsic) const { switch (Intrinsic) { case Intrinsic::ppc_qpx_qvlfd: @@ -12992,9 +13349,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; Info.align = 1; - Info.vol = false; - Info.readMem = true; - Info.writeMem = false; + Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::ppc_qpx_qvlfda: @@ -13028,9 +13383,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.offset = 0; Info.size = VT.getStoreSize(); Info.align = 1; - Info.vol = false; - Info.readMem = true; - Info.writeMem = false; + Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::ppc_qpx_qvstfd: @@ -13082,9 +13435,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; Info.align = 1; - Info.vol = false; - Info.readMem = false; - Info.writeMem = true; + Info.flags = MachineMemOperand::MOStore; return true; } case Intrinsic::ppc_qpx_qvstfda: @@ -13117,9 +13468,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.offset = 0; Info.size = VT.getStoreSize(); Info.align = 1; - Info.vol = false; - Info.readMem = false; - Info.writeMem = true; + Info.flags = MachineMemOperand::MOStore; return true; } default: @@ -13146,12 +13495,12 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, bool MemcpyStrSrc, MachineFunction &MF) const { if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { - const Function *F = MF.getFunction(); + const Function &F = MF.getFunction(); // When expanding a memset, require at least two QPX instructions to cover // the cost of loading the value to be stored from the constant pool. if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) && (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) && - !F->hasFnAttribute(Attribute::NoImplicitFloat)) { + !F.hasFnAttribute(Attribute::NoImplicitFloat)) { return MVT::v4f64; } @@ -13216,8 +13565,9 @@ bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return TargetLowering::isZExtFree(Val, VT2); } -bool PPCTargetLowering::isFPExtFree(EVT VT) const { - assert(VT.isFloatingPoint()); +bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const { + assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() && + "invalid fpext types"); return true; } @@ -13369,7 +13719,7 @@ void PPCTargetLowering::insertCopiesSplitCSR( // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. - assert(Entry->getParent()->getFunction()->hasFnAttribute( + assert(Entry->getParent()->getFunction().hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); @@ -13467,3 +13817,38 @@ SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const { return SDValue(); } + +bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { + // Only duplicate to increase tail-calls for the 64bit SysV ABIs. + if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64()) + return false; + + // If not a tail call then no need to proceed. 
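+  // (Illustrative IR for the calls this hook screens; names are hypothetical:
+  //   %r = tail call fastcc i64 @callee(i64 %x)
+  //   ret i64 %r
+  // Only calls already marked "tail", as above, are worth duplicating
+  // returns for.)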
+  if (!CI->isTailCall())
+    return false;
+
+  // If tail calls are disabled for the caller then we are done.
+  const Function *Caller = CI->getParent()->getParent();
+  auto Attr = Caller->getFnAttribute("disable-tail-calls");
+  if (Attr.getValueAsString() == "true")
+    return false;
+
+  // If sibling calls have been disabled and tail-calls aren't guaranteed,
+  // there is no reason to duplicate.
+  auto &TM = getTargetMachine();
+  if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)
+    return false;
+
+  // Can't tail call a function called indirectly, or if it has variadic args.
+  const Function *Callee = CI->getCalledFunction();
+  if (!Callee || Callee->isVarArg())
+    return false;
+
+  // Make sure the callee and caller calling conventions are eligible for TCO.
+  if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(),
+                                           CI->getCallingConv()))
+    return false;
+
+  // If the function is local then we have a good chance at tail-calling it.
+  return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
+}
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 49d7d8220af16..b119e5b4a5649 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -23,6 +23,7 @@
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
@@ -30,13 +31,20 @@
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Type.h"
-#include "llvm/Target/TargetLowering.h"
 #include <utility>
 
 namespace llvm {
 
   namespace PPCISD {
 
+    // When adding a NEW PPCISD node please add it to the correct position in
+    // the enum. The order of elements in this enum matters!
+    // Values that are added after this entry:
+    //   STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE
+    // are considered memory opcodes and are treated differently than entries
+    // that come before it. For example, ADD or MUL should be placed before
+    // the ISD::FIRST_TARGET_MEMORY_OPCODE while a LOAD or STORE should come
+    // after it.
     enum NodeType : unsigned {
       // Start the numbering where the builtin ops and target ops leave off.
       FIRST_NUMBER = ISD::BUILTIN_OP_END,
@@ -86,15 +94,15 @@ namespace llvm {
       ///
       XXSPLT,
 
-      /// XXINSERT - The PPC VSX insert instruction
+      /// VECINSERT - The PPC vector insert instruction
       ///
-      XXINSERT,
+      VECINSERT,
 
       /// XXREVERSE - The PPC VSX reverse instruction
       ///
       XXREVERSE,
 
-      /// VECSHL - The PPC VSX shift left instruction
+      /// VECSHL - The PPC vector shift left instruction
       ///
       VECSHL,
@@ -254,7 +262,7 @@ namespace llvm {
      /// local dynamic TLS on PPC32.
      PPC32_PICGOT,
 
-     /// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec
+     /// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec
      /// TLS model, produces an ADDIS8 instruction that adds the GOT
      /// base to sym\@got\@tprel\@ha.
      ADDIS_GOT_TPREL_HA,
@@ -273,18 +281,18 @@ namespace llvm {
      /// TLS sequence.
      ADD_TLS,
 
-     /// G8RC = ADDIS_TLSGD_HA %X2, Symbol - For the general-dynamic TLS
+     /// G8RC = ADDIS_TLSGD_HA %x2, Symbol - For the general-dynamic TLS
      /// model, produces an ADDIS8 instruction that adds the GOT base
      /// register to sym\@got\@tlsgd\@ha.
ADDIS_TLSGD_HA, - /// %X3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS + /// %x3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS /// model, produces an ADDI8 instruction that adds G8RReg to /// sym\@got\@tlsgd\@l and stores the result in X3. Hidden by /// ADDIS_TLSGD_L_ADDR until after register assignment. ADDI_TLSGD_L, - /// %X3 = GET_TLS_ADDR %X3, Symbol - For the general-dynamic TLS + /// %x3 = GET_TLS_ADDR %x3, Symbol - For the general-dynamic TLS /// model, produces a call to __tls_get_addr(sym\@tlsgd). Hidden by /// ADDIS_TLSGD_L_ADDR until after register assignment. GET_TLS_ADDR, @@ -294,18 +302,18 @@ namespace llvm { /// register assignment. ADDI_TLSGD_L_ADDR, - /// G8RC = ADDIS_TLSLD_HA %X2, Symbol - For the local-dynamic TLS + /// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS /// model, produces an ADDIS8 instruction that adds the GOT base /// register to sym\@got\@tlsld\@ha. ADDIS_TLSLD_HA, - /// %X3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS + /// %x3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS /// model, produces an ADDI8 instruction that adds G8RReg to /// sym\@got\@tlsld\@l and stores the result in X3. Hidden by /// ADDIS_TLSLD_L_ADDR until after register assignment. ADDI_TLSLD_L, - /// %X3 = GET_TLSLD_ADDR %X3, Symbol - For the local-dynamic TLS + /// %x3 = GET_TLSLD_ADDR %x3, Symbol - For the local-dynamic TLS /// model, produces a call to __tls_get_addr(sym\@tlsld). Hidden by /// ADDIS_TLSLD_L_ADDR until after register assignment. GET_TLSLD_ADDR, @@ -315,7 +323,7 @@ namespace llvm { /// following register assignment. ADDI_TLSLD_L_ADDR, - /// G8RC = ADDIS_DTPREL_HA %X3, Symbol - For the local-dynamic TLS + /// G8RC = ADDIS_DTPREL_HA %x3, Symbol - For the local-dynamic TLS /// model, produces an ADDIS8 instruction that adds X3 to /// sym\@dtprel\@ha. ADDIS_DTPREL_HA, @@ -578,8 +586,8 @@ namespace llvm { bool supportSplitCSR(MachineFunction *MF) const override { return - MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && - MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction().hasFnAttribute(Attribute::NoUnwind); } void initializeSplitCSR(MachineBasicBlock *Entry) const override; @@ -727,7 +735,8 @@ namespace llvm { /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, - Type *Ty, unsigned AS) const override; + Type *Ty, unsigned AS, + Instruction *I = nullptr) const override; /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can @@ -749,14 +758,14 @@ namespace llvm { bool isZExtFree(SDValue Val, EVT VT2) const override; - bool isFPExtFree(EVT VT) const override; + bool isFPExtFree(EVT DestVT, EVT SrcVT) const override; /// \brief Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. 
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; - bool convertSelectOfConstantsToMath() const override { + bool convertSelectOfConstantsToMath(EVT VT) const override { return true; } @@ -764,6 +773,7 @@ namespace llvm { bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + MachineFunction &MF, unsigned Intrinsic) const override; /// getOptimalMemOpType - Returns the target specific optimal type for load @@ -898,7 +908,7 @@ namespace llvm { IsEligibleForTailCallOptimization_64SVR4( SDValue Callee, CallingConv::ID CalleeCC, - ImmutableCallSite *CS, + ImmutableCallSite CS, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &Ins, @@ -944,6 +954,7 @@ namespace llvm { SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; @@ -964,7 +975,7 @@ namespace llvm { SDValue &Callee, int SPDiff, unsigned NumBytes, const SmallVectorImpl &Ins, SmallVectorImpl &InVals, - ImmutableCallSite *CS) const; + ImmutableCallSite CS) const; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -1015,7 +1026,7 @@ namespace llvm { const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, - ImmutableCallSite *CS) const; + ImmutableCallSite CS) const; SDValue LowerCall_64SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, bool isPatchPoint, @@ -1024,7 +1035,7 @@ namespace llvm { const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, - ImmutableCallSite *CS) const; + ImmutableCallSite CS) const; SDValue LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, bool isPatchPoint, @@ -1033,7 +1044,7 @@ namespace llvm { const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, - ImmutableCallSite *CS) const; + ImmutableCallSite CS) const; SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; @@ -1063,7 +1074,23 @@ namespace llvm { SDValue combineElementTruncationToVectorTruncation(SDNode *N, DAGCombinerInfo &DCI) const; - }; + + /// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be + /// handled by the VINSERTH instruction introduced in ISA 3.0. This is + /// essentially any shuffle of v8i16 vectors that just inserts one element + /// from one vector into the other. + SDValue lowerToVINSERTH(ShuffleVectorSDNode *N, SelectionDAG &DAG) const; + + /// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be + /// handled by the VINSERTB instruction introduced in ISA 3.0. This is + /// essentially v16i8 vector version of VINSERTH. + SDValue lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const; + + // Return whether the call instruction can potentially be optimized to a + // tail call. This will cause the optimizers to attempt to move, or + // duplicate return instructions to help enable tail call optimizations. 
+ bool mayBeEmittedAsTailCall(const CallInst *CI) const override; + }; // end class PPCTargetLowering namespace PPC { diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index e2af5e5295445..fdd28c2ff03f2 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -194,6 +194,11 @@ def : Pat<(PPCcall_nop (i64 texternalsym:$dst)), (BL8_NOP texternalsym:$dst)>; // Atomic operations +// FIXME: some of these might be used with constant operands. This will result +// in constant materialization instructions that may be redundant. We currently +// clean this up in PPCMIPeephole with calls to +// PPCInstrInfo::convertToImmediateForm() but we should probably not emit them +// in the first place. let usesCustomInserter = 1 in { let Defs = [CR0] in { def ATOMIC_LOAD_ADD_I64 : Pseudo< @@ -642,8 +647,13 @@ def EXTSW_32 : XForm_11<31, 986, (outs gprc:$rA), (ins gprc:$rS), defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH), "sradi", "$rA, $rS, $SH", IIC_IntRotateDI, [(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64; + +defm EXTSWSLI : XSForm_1r<31, 445, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH), + "extswsli", "$rA, $rS, $SH", IIC_IntRotateDI, + []>, isPPC64; + // For fast-isel: -let isCodeGenOnly = 1 in +let isCodeGenOnly = 1, Defs = [CARRY] in def SRADI_32 : XSForm_1<31, 413, (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH), "sradi $rA, $rS, $SH", IIC_IntRotateDI, []>, isPPC64; @@ -673,6 +683,9 @@ def POPCNTW : XForm_11<31, 378, (outs gprc:$rA), (ins gprc:$rS), "popcntw $rA, $rS", IIC_IntGeneral, [(set i32:$rA, (ctpop i32:$rS))]>; +def POPCNTB : XForm_11<31, 122, (outs gprc:$rA), (ins gprc:$rS), + "popcntb $rA, $rS", IIC_IntGeneral, []>; + defm DIVD : XOForm_1rcr<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), "divd", "$rT, $rA, $rB", IIC_IntDivD, [(set i64:$rT, (sdiv i64:$rA, i64:$rB))]>, isPPC64; @@ -685,6 +698,18 @@ def DIVDE : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), isPPC64, Requires<[HasExtDiv]>; let Predicates = [IsISA3_0] in { +def MADDHD : VAForm_1a<48, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), + "maddhd $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64; +def MADDHDU : VAForm_1a<49, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), + "maddhdu $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64; +def MADDLD : VAForm_1a<51, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC), + "maddld $RT, $RA, $RB, $RC", IIC_IntMulHD, []>, isPPC64; +def SETB : XForm_44<31, 128, (outs g8rc:$RT), (ins crrc:$BFA), + "setb $RT, $BFA", IIC_IntGeneral>, isPPC64; +def DARN : XForm_45<31, 755, (outs g8rc:$RT), (ins i32imm:$L), + "darn $RT, $L", IIC_LdStLD>, isPPC64; +def ADDPCIS : DXForm<19, 2, (outs g8rc:$RT), (ins i32imm:$D), + "addpcis $RT, $D", IIC_BrB, []>, isPPC64; def MODSD : XForm_8<31, 777, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), "modsd $rT, $rA, $rB", IIC_IntDivW, [(set i64:$rT, (srem i64:$rA, i64:$rB))]>; diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index 5465b5f2d66cd..e751c149b0b32 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -477,10 +477,10 @@ def VPERM : VA1a_Int_Ty3<43, "vperm", int_ppc_altivec_vperm, def VSEL : VA1a_Int_Ty<42, "vsel", int_ppc_altivec_vsel, v4i32>; // Shuffles. 
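// (Example of the vsldoi semantics assumed by the pattern below: with
//  $SH = 4, "vsldoi vD, vA, vB, 4" selects bytes 4..19 of the 32-byte
//  concatenation vA||vB, i.e. the pair shifted left by 4 bytes.)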
-def VSLDOI : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u5imm:$SH), +def VSLDOI : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u4imm:$SH), "vsldoi $vD, $vA, $vB, $SH", IIC_VecFP, - [(set v16i8:$vD, - (vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB))]>; + [(set v16i8:$vD, + (PPCvecshl v16i8:$vA, v16i8:$vB, imm32SExt16:$SH))]>; // VX-Form instructions. AltiVec arithmetic ops. let isCommutable = 1 in { @@ -908,6 +908,9 @@ def:Pat<(vpkuwum_unary_shuffle v16i8:$vA, undef), (VPKUWUM $vA, $vA)>; def:Pat<(vpkuhum_unary_shuffle v16i8:$vA, undef), (VPKUHUM $vA, $vA)>; +def:Pat<(vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB), + (VSLDOI v16i8:$vA, v16i8:$vB, (VSLDOI_get_imm $SH))>; + // Match vsldoi(y,x), vpkuwum(y,x), vpkuhum(y,x), i.e., swapped operands. // These fragments are matched for little-endian, where the inputs must @@ -1309,8 +1312,18 @@ def VEXTUWLX : VX1_RT5_RA5_VB5<1677, "vextuwlx", []>; def VEXTUWRX : VX1_RT5_RA5_VB5<1933, "vextuwrx", []>; // Vector Insert Element Instructions -def VINSERTB : VX1_VT5_UIM5_VB5<781, "vinsertb", []>; -def VINSERTH : VX1_VT5_UIM5_VB5<845, "vinserth", []>; +def VINSERTB : VXForm_1<781, (outs vrrc:$vD), + (ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB), + "vinsertb $vD, $vB, $UIM", IIC_VecGeneral, + [(set v16i8:$vD, (PPCvecinsert v16i8:$vDi, v16i8:$vB, + imm32SExt16:$UIM))]>, + RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; +def VINSERTH : VXForm_1<845, (outs vrrc:$vD), + (ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB), + "vinserth $vD, $vB, $UIM", IIC_VecGeneral, + [(set v8i16:$vD, (PPCvecinsert v8i16:$vDi, v8i16:$vB, + imm32SExt16:$UIM))]>, + RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; def VINSERTW : VX1_VT5_UIM5_VB5<909, "vinsertw", []>; def VINSERTD : VX1_VT5_UIM5_VB5<973, "vinsertd", []>; @@ -1488,4 +1501,19 @@ def VABSDUH : VXForm_1<1091, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), def VABSDUW : VXForm_1<1155, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vabsduw $vD, $vA, $vB", IIC_VecGeneral, [(set v4i32:$vD, (int_ppc_altivec_vabsduw v4i32:$vA, v4i32:$vB))]>; + +def : Pat<(v16i8:$vD (abs v16i8:$vA)), + (v16i8 (VABSDUB $vA, (V_SET0B)))>; +def : Pat<(v8i16:$vD (abs v8i16:$vA)), + (v8i16 (VABSDUH $vA, (V_SET0H)))>; +def : Pat<(v4i32:$vD (abs v4i32:$vA)), + (v4i32 (VABSDUW $vA, (V_SET0)))>; + +def : Pat<(v16i8:$vD (abs (sub v16i8:$vA, v16i8:$vB))), + (v16i8 (VABSDUB $vA, $vB))>; +def : Pat<(v8i16:$vD (abs (sub v8i16:$vA, v8i16:$vB))), + (v8i16 (VABSDUH $vA, $vB))>; +def : Pat<(v4i32:$vD (abs (sub v4i32:$vA, v4i32:$vB))), + (v4i32 (VABSDUW $vA, $vB))>; + } // end HasP9Altivec diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index ef7d2012a2332..f2845415ecb5a 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -386,6 +386,22 @@ class DSForm_1 opcode, bits<2> xo, dag OOL, dag IOL, string asmstr, let Inst{30-31} = xo; } +// ISA V3.0B 1.6.6 DX-Form +class DXForm opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : I { + bits<5> RT; + bits<16> D; + + let Pattern = pattern; + + let Inst{6-10} = RT; + let Inst{11-15} = D{5-1}; // d1 + let Inst{16-25} = D{15-6}; // d0 + let Inst{26-30} = xo; + let Inst{31} = D{0}; // d2 +} + // DQ-Form: [PO T RA DQ TX XO] or [PO S RA DQ SX XO] class DQ_RD6_RS5_DQ12 opcode, bits<3> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> @@ -725,6 +741,96 @@ class XForm_43 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = RC; } +class XForm_44 opcode, bits<10> xo, dag OOL, dag 
IOL, string asmstr, + InstrItinClass itin> + : I { + bits<5> RT; + bits<3> BFA; + + let Inst{6-10} = RT; + let Inst{11-13} = BFA; + let Inst{14-15} = 0; + let Inst{16-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XForm_45 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I { + bits<5> RT; + bits<2> L; + + let Inst{6-10} = RT; + let Inst{11-13} = 0; + let Inst{14-15} = L; + let Inst{16-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class X_FRT5_XO2_XO3_XO10 opcode, bits<2> xo1, bits<3> xo2, bits<10> xo, + dag OOL, dag IOL, string asmstr, InstrItinClass itin, + list pattern> + : XForm_base_r3xo { + let Pattern = pattern; + + let Inst{6-10} = RST; + let Inst{11-12} = xo1; + let Inst{13-15} = xo2; + let Inst{16-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class X_FRT5_XO2_XO3_FRB5_XO10 opcode, bits<2> xo1, bits<3> xo2, + bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : XForm_base_r3xo { + let Pattern = pattern; + bits<5> FRB; + + let Inst{6-10} = RST; + let Inst{11-12} = xo1; + let Inst{13-15} = xo2; + let Inst{16-20} = FRB; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class X_FRT5_XO2_XO3_DRM3_XO10 opcode, bits<2> xo1, bits<3> xo2, + bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : XForm_base_r3xo { + let Pattern = pattern; + bits<3> DRM; + + let Inst{6-10} = RST; + let Inst{11-12} = xo1; + let Inst{13-15} = xo2; + let Inst{16-17} = 0; + let Inst{18-20} = DRM; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class X_FRT5_XO2_XO3_RM2_X10 opcode, bits<2> xo1, bits<3> xo2, + bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : XForm_base_r3xo { + let Pattern = pattern; + bits<2> RM; + + let Inst{6-10} = RST; + let Inst{11-12} = xo1; + let Inst{13-15} = xo2; + let Inst{16-18} = 0; + let Inst{19-20} = RM; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + + class XForm_0 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> : XForm_base_r3xo { @@ -1995,4 +2101,5 @@ class Pseudo pattern> let PPC64 = 0; let Pattern = pattern; let Inst{31-0} = 0; + let hasNoSchedulingInfo = 1; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index e74ba38c351f0..ffb5cc8757f25 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -20,7 +20,7 @@ #include "PPCTargetMachine.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -46,6 +46,16 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "PPCGenInstrInfo.inc" +STATISTIC(NumStoreSPILLVSRRCAsVec, + "Number of spillvsrrc spilled to stack as vec"); +STATISTIC(NumStoreSPILLVSRRCAsGpr, + "Number of spillvsrrc spilled to stack as gpr"); +STATISTIC(NumGPRtoVSRSpill, "Number of gpr spills to spillvsrrc"); +STATISTIC(CmpIselsConverted, + "Number of ISELs that depend on comparison of constants converted"); +STATISTIC(MissedConvertibleImmediateInstrs, + "Number of compare-immediate instructions fed by constants"); + static cl:: opt DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden, cl::desc("Disable analysis for CTR loops")); @@ -254,6 +264,7 @@ bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, switch 
(MI.getOpcode()) { default: return false; case PPC::EXTSW: + case PPC::EXTSW_32: case PPC::EXTSW_32_64: SrcReg = MI.getOperand(1).getReg(); DstReg = MI.getOperand(0).getReg(); @@ -275,11 +286,12 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, case PPC::RESTORE_CRBIT: case PPC::LVX: case PPC::LXVD2X: - case PPC::LXVX: + case PPC::LXV: case PPC::QVLFDX: case PPC::QVLFSXs: case PPC::QVLFDXb: case PPC::RESTORE_VRSAVE: + case PPC::SPILLTOVSR_LD: // Check for the operands added by addFrameReference (the immediate is the // offset which defaults to 0). if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() && @@ -328,11 +340,12 @@ unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI, case PPC::SPILL_CRBIT: case PPC::STVX: case PPC::STXVD2X: - case PPC::STXVX: + case PPC::STXV: case PPC::QVSTFDX: case PPC::QVSTFSXs: case PPC::QVSTFDXb: case PPC::SPILL_VRSAVE: + case PPC::SPILLTOVSR_ST: // Check for the operands added by addFrameReference (the immediate is the // offset which defaults to 0). if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() && @@ -486,6 +499,20 @@ bool PPCInstrInfo::analyzeBranch(MachineBasicBlock &MBB, if (!isUnpredicatedTerminator(*I)) return false; + if (AllowModify) { + // If the BB ends with an unconditional branch to the fallthrough BB, + // we eliminate the branch instruction. + if (I->getOpcode() == PPC::B && + MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { + I->eraseFromParent(); + + // We update iterator after deleting the last branch. + I = MBB.getLastNonDebugInstr(); + if (I == MBB.end() || !isUnpredicatedTerminator(*I)) + return false; + } + } + // Get the last instruction in the block. MachineInstr &LastInst = *I; @@ -917,7 +944,18 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg).addReg(SrcReg); getKillRegState(KillSrc); return; - } + } else if (PPC::G8RCRegClass.contains(SrcReg) && + PPC::VSFRCRegClass.contains(DestReg)) { + BuildMI(MBB, I, DL, get(PPC::MTVSRD), DestReg).addReg(SrcReg); + NumGPRtoVSRSpill++; + getKillRegState(KillSrc); + return; + } else if (PPC::VSFRCRegClass.contains(SrcReg) && + PPC::G8RCRegClass.contains(DestReg)) { + BuildMI(MBB, I, DL, get(PPC::MFVSRD), DestReg).addReg(SrcReg); + getKillRegState(KillSrc); + return; + } unsigned Opc; if (PPC::GPRCRegClass.contains(DestReg, SrcReg)) @@ -1015,7 +1053,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, FrameIdx)); NonRI = true; } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) { - unsigned Op = Subtarget.hasP9Vector() ? PPC::STXVX : PPC::STXVD2X; + unsigned Op = Subtarget.hasP9Vector() ? PPC::STXV : PPC::STXVD2X; NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Op)) .addReg(SrcReg, getKillRegState(isKill)), @@ -1061,6 +1099,11 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, getKillRegState(isKill)), FrameIdx)); NonRI = true; + } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILLTOVSR_ST)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); } else { llvm_unreachable("Unknown regclass!"); } @@ -1148,7 +1191,7 @@ bool PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL, FrameIdx)); NonRI = true; } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) { - unsigned Op = Subtarget.hasP9Vector() ? PPC::LXVX : PPC::LXVD2X; + unsigned Op = Subtarget.hasP9Vector() ? 
PPC::LXV : PPC::LXVD2X; NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Op), DestReg), FrameIdx)); NonRI = true; @@ -1182,6 +1225,9 @@ bool PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL, NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFDXb), DestReg), FrameIdx)); NonRI = true; + } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILLTOVSR_LD), + DestReg), FrameIdx)); } else { llvm_unreachable("Unknown regclass!"); } @@ -1592,37 +1638,20 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // Get the unique definition of SrcReg. MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); if (!MI) return false; - int MIOpC = MI->getOpcode(); bool equalityOnly = false; bool noSub = false; if (isPPC64) { if (is32BitSignedCompare) { // We can perform this optimization only if MI is sign-extending. - if (MIOpC == PPC::SRAW || MIOpC == PPC::SRAWo || - MIOpC == PPC::SRAWI || MIOpC == PPC::SRAWIo || - MIOpC == PPC::EXTSB || MIOpC == PPC::EXTSBo || - MIOpC == PPC::EXTSH || MIOpC == PPC::EXTSHo || - MIOpC == PPC::EXTSW || MIOpC == PPC::EXTSWo) { + if (isSignExtended(*MI)) noSub = true; - } else + else return false; } else if (is32BitUnsignedCompare) { - // 32-bit rotate and mask instructions are zero extending only if MB <= ME - bool isZeroExtendingRotate = - (MIOpC == PPC::RLWINM || MIOpC == PPC::RLWINMo || - MIOpC == PPC::RLWNM || MIOpC == PPC::RLWNMo) - && MI->getOperand(3).getImm() <= MI->getOperand(4).getImm(); - // We can perform this optimization, equality only, if MI is // zero-extending. - // FIXME: Other possible target instructions include ANDISo and - // RLWINM aliases, such as ROTRWI, EXTLWI, SLWI and SRWI. - if (MIOpC == PPC::CNTLZW || MIOpC == PPC::CNTLZWo || - MIOpC == PPC::SLW || MIOpC == PPC::SLWo || - MIOpC == PPC::SRW || MIOpC == PPC::SRWo || - MIOpC == PPC::ANDIo || - isZeroExtendingRotate) { + if (isZeroExtended(*MI)) { noSub = true; equalityOnly = true; } else @@ -1640,8 +1669,10 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, I != IE; ++I) { MachineInstr *UseMI = &*I; if (UseMI->getOpcode() == PPC::BCC) { - unsigned Pred = UseMI->getOperand(0).getImm(); - if (Pred != PPC::PRED_EQ && Pred != PPC::PRED_NE) + PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm(); + unsigned PredCond = PPC::getPredicateCondition(Pred); + // We ignore hint bits when checking for non-equality comparisons. + if (PredCond != PPC::PRED_EQ && PredCond != PPC::PRED_NE) return false; } else if (UseMI->getOpcode() == PPC::ISEL || UseMI->getOpcode() == PPC::ISEL8) { @@ -1688,34 +1719,47 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, else if (MI->getParent() != CmpInstr.getParent()) return false; else if (Value != 0) { - // The record-form instructions set CR bit based on signed comparison against 0. - // We try to convert a compare against 1 or -1 into a compare against 0. 
-    bool Success = false;
-    if (!equalityOnly && MRI->hasOneUse(CRReg)) {
-      MachineInstr *UseMI = &*MRI->use_instr_begin(CRReg);
-      if (UseMI->getOpcode() == PPC::BCC) {
-        PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm();
-        int16_t Immed = (int16_t)Value;
-
-        if (Immed == -1 && Pred == PPC::PRED_GT) {
-          // We convert "greater than -1" into "greater than or equal to 0",
-          // since we are assuming signed comparison by !equalityOnly
-          PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
-                                                 PPC::PRED_GE));
-          Success = true;
-        }
-        else if (Immed == 1 && Pred == PPC::PRED_LT) {
-          // We convert "less than 1" into "less than or equal to 0".
-          PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
-                                                 PPC::PRED_LE));
-          Success = true;
-        }
-      }
-    }
+    // The record-form instructions set CR bit based on signed comparison
+    // against 0. We try to convert a compare against 1 or -1 into a compare
+    // against 0 to exploit record-form instructions. For example, we change
+    // the condition "greater than -1" into "greater than or equal to 0"
+    // and "less than 1" into "less than or equal to 0".
+
+    // Since we optimize comparison based on a specific branch condition,
+    // we don't optimize if the condition code is used more than once.
+    if (equalityOnly || !MRI->hasOneUse(CRReg))
+      return false;
+
+    MachineInstr *UseMI = &*MRI->use_instr_begin(CRReg);
+    if (UseMI->getOpcode() != PPC::BCC)
+      return false;
 
-    // PPC does not have a record-form SUBri.
-    if (!Success)
+    PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm();
+    PPC::Predicate NewPred = Pred;
+    unsigned PredCond = PPC::getPredicateCondition(Pred);
+    unsigned PredHint = PPC::getPredicateHint(Pred);
+    int16_t Immed = (int16_t)Value;
+
+    // When modifying the condition in the predicate, we propagate hint bits
+    // from the original predicate to the new one.
+    if (Immed == -1 && PredCond == PPC::PRED_GT)
+      // We convert "greater than -1" into "greater than or equal to 0",
+      // since we are assuming signed comparison by !equalityOnly
+      NewPred = PPC::getPredicate(PPC::PRED_GE, PredHint);
+    else if (Immed == -1 && PredCond == PPC::PRED_LE)
+      // We convert "less than or equal to -1" into "less than 0".
+      NewPred = PPC::getPredicate(PPC::PRED_LT, PredHint);
+    else if (Immed == 1 && PredCond == PPC::PRED_LT)
+      // We convert "less than 1" into "less than or equal to 0".
+      NewPred = PPC::getPredicate(PPC::PRED_LE, PredHint);
+    else if (Immed == 1 && PredCond == PPC::PRED_GE)
+      // We convert "greater than or equal to 1" into "greater than 0".
+      NewPred = PPC::getPredicate(PPC::PRED_GT, PredHint);
+    else
       return false;
+
+    PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
+                                           NewPred));
   }
 
   // Search for Sub.
@@ -1763,7 +1807,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
   if (!MI) MI = Sub;
 
   int NewOpC = -1;
-  MIOpC = MI->getOpcode();
+  int MIOpC = MI->getOpcode();
   if (MIOpC == PPC::ANDIo || MIOpC == PPC::ANDIo8)
     NewOpC = MIOpC;
   else {
@@ -1804,9 +1848,11 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
     MachineInstr *UseMI = &*I;
     if (UseMI->getOpcode() == PPC::BCC) {
       PPC::Predicate Pred = (PPC::Predicate) UseMI->getOperand(0).getImm();
+      unsigned PredCond = PPC::getPredicateCondition(Pred);
       assert((!equalityOnly ||
-              Pred == PPC::PRED_EQ || Pred == PPC::PRED_NE) &&
+              PredCond == PPC::PRED_EQ || PredCond == PPC::PRED_NE) &&
              "Invalid predicate for equality-only optimization");
+      (void)PredCond; // To suppress warning in release build.
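   // (Example of the swap performed below: if the compare tested "a < b" but
   // Sub computes b - a, each user's condition must flip, e.g. PRED_LT
   // becomes PRED_GT, while PRED_EQ/PRED_NE are unaffected.)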
PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)), PPC::getSwappedPredicate(Pred))); } else if (UseMI->getOpcode() == PPC::ISEL || @@ -1935,29 +1981,13 @@ PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { return makeArrayRef(TargetFlags); } -bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { - auto &MBB = *MI.getParent(); - auto DL = MI.getDebugLoc(); - switch (MI.getOpcode()) { - case TargetOpcode::LOAD_STACK_GUARD: { - assert(Subtarget.isTargetLinux() && - "Only Linux target is expected to contain LOAD_STACK_GUARD"); - const int64_t Offset = Subtarget.isPPC64() ? -0x7010 : -0x7008; - const unsigned Reg = Subtarget.isPPC64() ? PPC::X13 : PPC::R2; - MI.setDesc(get(Subtarget.isPPC64() ? PPC::LD : PPC::LWZ)); - MachineInstrBuilder(*MI.getParent()->getParent(), MI) - .addImm(Offset) - .addReg(Reg); - return true; - } - case PPC::DFLOADf32: - case PPC::DFLOADf64: - case PPC::DFSTOREf32: - case PPC::DFSTOREf64: { - assert(Subtarget.hasP9Vector() && - "Invalid D-Form Pseudo-ops on non-P9 target."); - assert(MI.getOperand(2).isReg() && MI.getOperand(1).isImm() && - "D-form op must have register and immediate operands"); +// Expand VSX Memory Pseudo instruction to either a VSX or a FP instruction. +// The VSX versions have the advantage of a full 64-register target whereas +// the FP ones have the advantage of lower latency and higher throughput. So +// what we are after is using the faster instructions in low register pressure +// situations and using the larger register file in high register pressure +// situations. +bool PPCInstrInfo::expandVSXMemPseudo(MachineInstr &MI) const { unsigned UpperOpcode, LowerOpcode; switch (MI.getOpcode()) { case PPC::DFLOADf32: @@ -1976,7 +2006,38 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { UpperOpcode = PPC::STXSD; LowerOpcode = PPC::STFD; break; + case PPC::XFLOADf32: + UpperOpcode = PPC::LXSSPX; + LowerOpcode = PPC::LFSX; + break; + case PPC::XFLOADf64: + UpperOpcode = PPC::LXSDX; + LowerOpcode = PPC::LFDX; + break; + case PPC::XFSTOREf32: + UpperOpcode = PPC::STXSSPX; + LowerOpcode = PPC::STFSX; + break; + case PPC::XFSTOREf64: + UpperOpcode = PPC::STXSDX; + LowerOpcode = PPC::STFDX; + break; + case PPC::LIWAX: + UpperOpcode = PPC::LXSIWAX; + LowerOpcode = PPC::LFIWAX; + break; + case PPC::LIWZX: + UpperOpcode = PPC::LXSIWZX; + LowerOpcode = PPC::LFIWZX; + break; + case PPC::STIWX: + UpperOpcode = PPC::STXSIWX; + LowerOpcode = PPC::STFIWX; + break; + default: + llvm_unreachable("Unknown Operation!"); } + unsigned TargetReg = MI.getOperand(0).getReg(); unsigned Opcode; if ((TargetReg >= PPC::F0 && TargetReg <= PPC::F31) || @@ -1986,7 +2047,95 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { Opcode = UpperOpcode; MI.setDesc(get(Opcode)); return true; +} + +bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + auto &MBB = *MI.getParent(); + auto DL = MI.getDebugLoc(); + + switch (MI.getOpcode()) { + case TargetOpcode::LOAD_STACK_GUARD: { + assert(Subtarget.isTargetLinux() && + "Only Linux target is expected to contain LOAD_STACK_GUARD"); + const int64_t Offset = Subtarget.isPPC64() ? -0x7010 : -0x7008; + const unsigned Reg = Subtarget.isPPC64() ? PPC::X13 : PPC::R2; + MI.setDesc(get(Subtarget.isPPC64() ? 
PPC::LD : PPC::LWZ)); + MachineInstrBuilder(*MI.getParent()->getParent(), MI) + .addImm(Offset) + .addReg(Reg); + return true; } + case PPC::DFLOADf32: + case PPC::DFLOADf64: + case PPC::DFSTOREf32: + case PPC::DFSTOREf64: { + assert(Subtarget.hasP9Vector() && + "Invalid D-Form Pseudo-ops on Pre-P9 target."); + assert(MI.getOperand(2).isReg() && MI.getOperand(1).isImm() && + "D-form op must have register and immediate operands"); + return expandVSXMemPseudo(MI); + } + case PPC::XFLOADf32: + case PPC::XFSTOREf32: + case PPC::LIWAX: + case PPC::LIWZX: + case PPC::STIWX: { + assert(Subtarget.hasP8Vector() && + "Invalid X-Form Pseudo-ops on Pre-P8 target."); + assert(MI.getOperand(2).isReg() && MI.getOperand(1).isReg() && + "X-form op must have register and register operands"); + return expandVSXMemPseudo(MI); + } + case PPC::XFLOADf64: + case PPC::XFSTOREf64: { + assert(Subtarget.hasVSX() && + "Invalid X-Form Pseudo-ops on target that has no VSX."); + assert(MI.getOperand(2).isReg() && MI.getOperand(1).isReg() && + "X-form op must have register and register operands"); + return expandVSXMemPseudo(MI); + } + case PPC::SPILLTOVSR_LD: { + unsigned TargetReg = MI.getOperand(0).getReg(); + if (PPC::VSFRCRegClass.contains(TargetReg)) { + MI.setDesc(get(PPC::DFLOADf64)); + return expandPostRAPseudo(MI); + } + else + MI.setDesc(get(PPC::LD)); + return true; + } + case PPC::SPILLTOVSR_ST: { + unsigned SrcReg = MI.getOperand(0).getReg(); + if (PPC::VSFRCRegClass.contains(SrcReg)) { + NumStoreSPILLVSRRCAsVec++; + MI.setDesc(get(PPC::DFSTOREf64)); + return expandPostRAPseudo(MI); + } else { + NumStoreSPILLVSRRCAsGpr++; + MI.setDesc(get(PPC::STD)); + } + return true; + } + case PPC::SPILLTOVSR_LDX: { + unsigned TargetReg = MI.getOperand(0).getReg(); + if (PPC::VSFRCRegClass.contains(TargetReg)) + MI.setDesc(get(PPC::LXSDX)); + else + MI.setDesc(get(PPC::LDX)); + return true; + } + case PPC::SPILLTOVSR_STX: { + unsigned SrcReg = MI.getOperand(0).getReg(); + if (PPC::VSFRCRegClass.contains(SrcReg)) { + NumStoreSPILLVSRRCAsVec++; + MI.setDesc(get(PPC::STXSDX)); + } else { + NumStoreSPILLVSRRCAsGpr++; + MI.setDesc(get(PPC::STDX)); + } + return true; + } + case PPC::CFENCE8: { auto Val = MI.getOperand(0).getReg(); BuildMI(MBB, MI, DL, get(PPC::CMPD), PPC::CR7).addReg(Val).addReg(Val); @@ -2002,6 +2151,829 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return false; } +unsigned PPCInstrInfo::lookThruCopyLike(unsigned SrcReg, + const MachineRegisterInfo *MRI) { + while (true) { + MachineInstr *MI = MRI->getVRegDef(SrcReg); + if (!MI->isCopyLike()) + return SrcReg; + + unsigned CopySrcReg; + if (MI->isCopy()) + CopySrcReg = MI->getOperand(1).getReg(); + else { + assert(MI->isSubregToReg() && "Bad opcode for lookThruCopyLike"); + CopySrcReg = MI->getOperand(2).getReg(); + } + + if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) + return CopySrcReg; + + SrcReg = CopySrcReg; + } +} + +// Essentially a compile-time implementation of a compare->isel sequence. +// It takes two constants to compare, along with the true/false registers +// and the comparison type (as a subreg to a CR field) and returns one +// of the true/false registers, depending on the comparison results. +static unsigned selectReg(int64_t Imm1, int64_t Imm2, unsigned CompareOpc, + unsigned TrueReg, unsigned FalseReg, + unsigned CRSubReg) { + // Signed comparisons. The immediates are assumed to be sign-extended. 
+ if (CompareOpc == PPC::CMPWI || CompareOpc == PPC::CMPDI) {
+ switch (CRSubReg) {
+ default: llvm_unreachable("Unknown integer comparison type.");
+ case PPC::sub_lt:
+ return Imm1 < Imm2 ? TrueReg : FalseReg;
+ case PPC::sub_gt:
+ return Imm1 > Imm2 ? TrueReg : FalseReg;
+ case PPC::sub_eq:
+ return Imm1 == Imm2 ? TrueReg : FalseReg;
+ }
+ }
+ // Unsigned comparisons.
+ else if (CompareOpc == PPC::CMPLWI || CompareOpc == PPC::CMPLDI) {
+ switch (CRSubReg) {
+ default: llvm_unreachable("Unknown integer comparison type.");
+ case PPC::sub_lt:
+ return (uint64_t)Imm1 < (uint64_t)Imm2 ? TrueReg : FalseReg;
+ case PPC::sub_gt:
+ return (uint64_t)Imm1 > (uint64_t)Imm2 ? TrueReg : FalseReg;
+ case PPC::sub_eq:
+ return Imm1 == Imm2 ? TrueReg : FalseReg;
+ }
+ }
+ return PPC::NoRegister;
+}
+
+// Replace an instruction with one that materializes a constant (and sets
+// CR0 if the original instruction was a record-form instruction).
+void PPCInstrInfo::replaceInstrWithLI(MachineInstr &MI,
+ const LoadImmediateInfo &LII) const {
+ // Remove existing operands.
+ int OperandToKeep = LII.SetCR ? 1 : 0;
+ for (int i = MI.getNumOperands() - 1; i > OperandToKeep; i--)
+ MI.RemoveOperand(i);
+
+ // Replace the instruction.
+ if (LII.SetCR) {
+ MI.setDesc(get(LII.Is64Bit ? PPC::ANDIo8 : PPC::ANDIo));
+ // Set the immediate.
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(LII.Imm).addReg(PPC::CR0, RegState::ImplicitDefine);
+ return;
+ }
+ else
+ MI.setDesc(get(LII.Is64Bit ? PPC::LI8 : PPC::LI));
+
+ // Set the immediate.
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+ .addImm(LII.Imm);
+}
+
+MachineInstr *PPCInstrInfo::getConstantDefMI(MachineInstr &MI,
+ unsigned &ConstOp,
+ bool &SeenIntermediateUse) const {
+ ConstOp = ~0U;
+ MachineInstr *DefMI = nullptr;
+ MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo();
+ // If we're in SSA, get the defs through the MRI. Otherwise, only look
+ // within the basic block to see if the register is defined using an LI/LI8.
+ if (MRI->isSSA()) {
+ for (int i = 1, e = MI.getNumOperands(); i < e; i++) {
+ if (!MI.getOperand(i).isReg())
+ continue;
+ unsigned Reg = MI.getOperand(i).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ unsigned TrueReg = lookThruCopyLike(Reg, MRI);
+ if (TargetRegisterInfo::isVirtualRegister(TrueReg)) {
+ DefMI = MRI->getVRegDef(TrueReg);
+ if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8) {
+ ConstOp = i;
+ break;
+ }
+ }
+ }
+ } else {
+ // Looking back through the definition for each operand could be expensive,
+ // so exit early if this isn't an instruction that either has an immediate
+ // form or is already an immediate form that we can handle.
+ ImmInstrInfo III;
+ unsigned Opc = MI.getOpcode();
+ bool ConvertibleImmForm =
+ Opc == PPC::CMPWI || Opc == PPC::CMPLWI ||
+ Opc == PPC::CMPDI || Opc == PPC::CMPLDI ||
+ Opc == PPC::ADDI || Opc == PPC::ADDI8 ||
+ Opc == PPC::ORI || Opc == PPC::ORI8 ||
+ Opc == PPC::XORI || Opc == PPC::XORI8 ||
+ Opc == PPC::RLDICL || Opc == PPC::RLDICLo ||
+ Opc == PPC::RLDICL_32 || Opc == PPC::RLDICL_32_64 ||
+ Opc == PPC::RLWINM || Opc == PPC::RLWINMo ||
+ Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o;
+ if (!instrHasImmForm(MI, III) && !ConvertibleImmForm)
+ return nullptr;
+
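// selectReg() above is effectively a compile-time compare-plus-isel. A
// self-contained model of that folding, with illustrative names (SubLT and
// friends stand in for the PPC::sub_lt subregister indices; this is a sketch,
// not the LLVM API):
#include <cassert>
#include <cstdint>

enum CmpKind { SignedCmp, UnsignedCmp };
enum CRField { SubLT, SubGT, SubEQ };

static unsigned foldIsel(int64_t Imm1, int64_t Imm2, CmpKind K, CRField Bit,
                         unsigned TrueReg, unsigned FalseReg) {
  bool Taken = false;
  switch (Bit) {
  case SubLT: Taken = K == SignedCmp ? Imm1 < Imm2
                                     : (uint64_t)Imm1 < (uint64_t)Imm2; break;
  case SubGT: Taken = K == SignedCmp ? Imm1 > Imm2
                                     : (uint64_t)Imm1 > (uint64_t)Imm2; break;
  case SubEQ: Taken = Imm1 == Imm2; break;
  }
  return Taken ? TrueReg : FalseReg;
}

int main() {
  // An isel on the lt bit fed by a signed compare of constants -1 and 1
  // always picks its true input ...
  assert(foldIsel(-1, 1, SignedCmp, SubLT, /*TrueReg=*/4, /*FalseReg=*/5) == 4);
  // ... while the unsigned compare of the same bit patterns picks the false one.
  assert(foldIsel(-1, 1, UnsignedCmp, SubLT, 4, 5) == 5);
  return 0;
}
+ // Don't convert or %X, %Y, %Y since that's just a register move.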
+ if ((Opc == PPC::OR || Opc == PPC::OR8) &&
+ MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
+ return nullptr;
+ for (int i = 1, e = MI.getNumOperands(); i < e; i++) {
+ MachineOperand &MO = MI.getOperand(i);
+ SeenIntermediateUse = false;
+ if (MO.isReg() && MO.isUse() && !MO.isImplicit()) {
+ MachineBasicBlock::reverse_iterator E = MI.getParent()->rend(), It = MI;
+ It++;
+ unsigned Reg = MI.getOperand(i).getReg();
+ // MachineInstr::readsRegister only returns true if the machine
+ // instruction reads the exact register or its super-register. It
+ // does not consider uses of sub-registers which seems like strange
+ // behaviour. Nonetheless, if we end up with a 64-bit register here,
+ // get the corresponding 32-bit register to check.
+ if (PPC::G8RCRegClass.contains(Reg))
+ Reg = Reg - PPC::X0 + PPC::R0;
+
+ // Is this register defined by a load-immediate in this block?
+ for ( ; It != E; ++It) {
+ if (It->modifiesRegister(Reg, &getRegisterInfo())) {
+ if (It->getOpcode() == PPC::LI || It->getOpcode() == PPC::LI8) {
+ ConstOp = i;
+ return &*It;
+ } else
+ break;
+ } else if (It->readsRegister(Reg, &getRegisterInfo()))
+ // If we see another use of this reg between the def and the MI,
+ // we want to flag it so the def isn't deleted.
+ SeenIntermediateUse = true;
+ }
+ }
+ }
+ }
+ return ConstOp == ~0U ? nullptr : DefMI;
+}
+
+// If this instruction has an immediate form and one of its operands is a
+// result of a load-immediate, convert it to the immediate form if the constant
+// is in range.
+bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
+ MachineInstr **KilledDef) const {
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ bool PostRA = !MRI->isSSA();
+ bool SeenIntermediateUse = true;
+ unsigned ConstantOperand = ~0U;
+ MachineInstr *DefMI = getConstantDefMI(MI, ConstantOperand,
+ SeenIntermediateUse);
+ if (!DefMI || !DefMI->getOperand(1).isImm())
+ return false;
+ assert(ConstantOperand < MI.getNumOperands() &&
+ "The constant operand needs to be valid at this point");
+
+ int64_t Immediate = DefMI->getOperand(1).getImm();
+ // Sign-extend to 64-bits.
+ int64_t SExtImm = ((uint64_t)Immediate & ~0x7FFFuLL) != 0 ?
+ (Immediate | 0xFFFFFFFFFFFF0000) : Immediate;
+
+ if (KilledDef && MI.getOperand(ConstantOperand).isKill() &&
+ !SeenIntermediateUse)
+ *KilledDef = DefMI;
+
+ // If this is a reg+reg instruction that has a reg+imm form, convert it now.
+ ImmInstrInfo III;
+ if (instrHasImmForm(MI, III))
+ return transformToImmForm(MI, III, ConstantOperand, SExtImm);
+
+ bool ReplaceWithLI = false;
+ bool Is64BitLI = false;
+ int64_t NewImm = 0;
+ bool SetCR = false;
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default: return false;
+
+ // FIXME: Any branches conditional on such a comparison can be made
+ // unconditional. At this time, this happens too infrequently to be worth
+ // the implementation effort, but if that ever changes, we could convert
+ // such a pattern here.
+ case PPC::CMPWI:
+ case PPC::CMPLWI:
+ case PPC::CMPDI:
+ case PPC::CMPLDI: {
+ // Doing this post-RA would require dataflow analysis to reliably find uses
+ // of the CR register set by the compare.
+ if (PostRA)
+ return false;
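// The SExtImm computation above is the usual 16-bit sign-extension idiom. A
// standalone check (sketch only, not LLVM code) that it agrees with a plain
// int16_t cast over every 16-bit pattern:
#include <cassert>
#include <cstdint>

static int64_t sextImm16(int64_t Imm) {
  return ((uint64_t)Imm & ~0x7FFFuLL) != 0 ? (Imm | 0xFFFFFFFFFFFF0000)
                                           : Imm;
}

int main() {
  for (int32_t V = -32768; V <= 32767; ++V)
    assert(sextImm16((uint16_t)V) == (int64_t)(int16_t)V);
  return 0;
}
+ // If a compare-immediate is fed by an immediate and is itself an input of
+ // an ISEL (the most common case), convert the ISEL into a COPY of the
+ // correct register.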
+ bool Changed = false;
+ unsigned DefReg = MI.getOperand(0).getReg();
+ int64_t Comparand = MI.getOperand(2).getImm();
+ int64_t SExtComparand = ((uint64_t)Comparand & ~0x7FFFuLL) != 0 ?
+ (Comparand | 0xFFFFFFFFFFFF0000) : Comparand;
+
+ for (auto &CompareUseMI : MRI->use_instructions(DefReg)) {
+ unsigned UseOpc = CompareUseMI.getOpcode();
+ if (UseOpc != PPC::ISEL && UseOpc != PPC::ISEL8)
+ continue;
+ unsigned CRSubReg = CompareUseMI.getOperand(3).getSubReg();
+ unsigned TrueReg = CompareUseMI.getOperand(1).getReg();
+ unsigned FalseReg = CompareUseMI.getOperand(2).getReg();
+ unsigned RegToCopy = selectReg(SExtImm, SExtComparand, Opc, TrueReg,
+ FalseReg, CRSubReg);
+ if (RegToCopy == PPC::NoRegister)
+ continue;
+ // Can't use PPC::COPY to copy PPC::ZERO[8]. Convert it to LI[8] 0.
+ if (RegToCopy == PPC::ZERO || RegToCopy == PPC::ZERO8) {
+ CompareUseMI.setDesc(get(UseOpc == PPC::ISEL8 ? PPC::LI8 : PPC::LI));
+ CompareUseMI.getOperand(1).ChangeToImmediate(0);
+ CompareUseMI.RemoveOperand(3);
+ CompareUseMI.RemoveOperand(2);
+ continue;
+ }
+ DEBUG(dbgs() << "Found LI -> CMPI -> ISEL, replacing with a copy.\n");
+ DEBUG(DefMI->dump(); MI.dump(); CompareUseMI.dump());
+ DEBUG(dbgs() << "Is converted to:\n");
+ // Convert to copy and remove unneeded operands.
+ CompareUseMI.setDesc(get(PPC::COPY));
+ CompareUseMI.RemoveOperand(3);
+ CompareUseMI.RemoveOperand(RegToCopy == TrueReg ? 2 : 1);
+ CmpIselsConverted++;
+ Changed = true;
+ DEBUG(CompareUseMI.dump());
+ }
+ if (Changed)
+ return true;
+ // This may end up incremented multiple times since this function is called
+ // during a fixed-point transformation, but it is only meant to indicate the
+ // presence of this opportunity.
+ MissedConvertibleImmediateInstrs++;
+ return false;
+ }
+
+ // Immediate forms - may simply be convertible to an LI.
+ case PPC::ADDI:
+ case PPC::ADDI8: {
+ // Does the sum fit in a 16-bit signed field?
+ int64_t Addend = MI.getOperand(2).getImm();
+ if (isInt<16>(Addend + SExtImm)) {
+ ReplaceWithLI = true;
+ Is64BitLI = Opc == PPC::ADDI8;
+ NewImm = Addend + SExtImm;
+ break;
+ }
+ return false;
+ }
+ case PPC::RLDICL:
+ case PPC::RLDICLo:
+ case PPC::RLDICL_32:
+ case PPC::RLDICL_32_64: {
+ // Use APInt's rotate function.
+ int64_t SH = MI.getOperand(2).getImm();
+ int64_t MB = MI.getOperand(3).getImm();
+ APInt InVal(Opc == PPC::RLDICL ? 64 : 32, SExtImm, true);
+ InVal = InVal.rotl(SH);
+ uint64_t Mask = (1LU << (63 - MB + 1)) - 1;
+ InVal &= Mask;
+ // Can't replace negative values with an LI as that will sign-extend
+ // and not clear the left bits. If we're setting the CR bit, we will use
+ // ANDIo which won't sign extend, so that's safe.
+ if (isUInt<15>(InVal.getSExtValue()) ||
+ (Opc == PPC::RLDICLo && isUInt<16>(InVal.getSExtValue()))) {
+ ReplaceWithLI = true;
+ Is64BitLI = Opc != PPC::RLDICL_32;
+ NewImm = InVal.getSExtValue();
+ SetCR = Opc == PPC::RLDICLo;
+ break;
+ }
+ return false;
+ }
+ case PPC::RLWINM:
+ case PPC::RLWINM8:
+ case PPC::RLWINMo:
+ case PPC::RLWINM8o: {
+ int64_t SH = MI.getOperand(2).getImm();
+ int64_t MB = MI.getOperand(3).getImm();
+ int64_t ME = MI.getOperand(4).getImm();
+ APInt InVal(32, SExtImm, true);
+ InVal = InVal.rotl(SH);
+ // Set the bits (MB + 32) to (ME + 32).
+ uint64_t Mask = ((1 << (32 - MB)) - 1) & ~((1 << (31 - ME)) - 1);
+ InVal &= Mask;
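// The RLWINM folding above rotates the known value and applies the MB..ME
// mask, using PowerPC's big-endian bit numbering (bit 0 is the MSB). A
// standalone model with plain integers instead of APInt, assuming a
// non-wrapping mask (MB <= ME); a sketch, not the in-tree code:
#include <cassert>
#include <cstdint>

static uint32_t foldRLWINM(uint32_t Val, unsigned SH, unsigned MB,
                           unsigned ME) {
  uint32_t Rot = SH ? (Val << SH) | (Val >> (32 - SH)) : Val;
  uint64_t Mask = ((1ULL << (32 - MB)) - 1) & ~((1ULL << (31 - ME)) - 1);
  return Rot & (uint32_t)Mask;
}

int main() {
  // rlwinm r, r, 0, 28, 31 keeps just the low nibble.
  assert(foldRLWINM(0x12345678, 0, 28, 31) == 0x8);
  // slwi r, r, 4 is rlwinm r, r, 4, 0, 27: the mask clears the rotated-in bits.
  assert(foldRLWINM(0x12345678, 4, 0, 27) == 0x23456780);
  return 0;
}
+ // Can't replace negative values with an LI as that will sign-extend
+ // and not clear the left bits. If we're setting the CR bit, we will use
+ // ANDIo which won't sign extend, so that's safe.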
+ bool ValueFits = isUInt<15>(InVal.getSExtValue()); + ValueFits |= ((Opc == PPC::RLWINMo || Opc == PPC::RLWINM8o) && + isUInt<16>(InVal.getSExtValue())); + if (ValueFits) { + ReplaceWithLI = true; + Is64BitLI = Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o; + NewImm = InVal.getSExtValue(); + SetCR = Opc == PPC::RLWINMo || Opc == PPC::RLWINM8o; + break; + } + return false; + } + case PPC::ORI: + case PPC::ORI8: + case PPC::XORI: + case PPC::XORI8: { + int64_t LogicalImm = MI.getOperand(2).getImm(); + int64_t Result = 0; + if (Opc == PPC::ORI || Opc == PPC::ORI8) + Result = LogicalImm | SExtImm; + else + Result = LogicalImm ^ SExtImm; + if (isInt<16>(Result)) { + ReplaceWithLI = true; + Is64BitLI = Opc == PPC::ORI8 || Opc == PPC::XORI8; + NewImm = Result; + break; + } + return false; + } + } + + if (ReplaceWithLI) { + DEBUG(dbgs() << "Replacing instruction:\n"); + DEBUG(MI.dump()); + DEBUG(dbgs() << "Fed by:\n"); + DEBUG(DefMI->dump()); + LoadImmediateInfo LII; + LII.Imm = NewImm; + LII.Is64Bit = Is64BitLI; + LII.SetCR = SetCR; + // If we're setting the CR, the original load-immediate must be kept (as an + // operand to ANDIo/ANDI8o). + if (KilledDef && SetCR) + *KilledDef = nullptr; + replaceInstrWithLI(MI, LII); + DEBUG(dbgs() << "With:\n"); + DEBUG(MI.dump()); + return true; + } + return false; +} + +bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI, + ImmInstrInfo &III) const { + unsigned Opc = MI.getOpcode(); + // The vast majority of the instructions would need their operand 2 replaced + // with an immediate when switching to the reg+imm form. A marked exception + // are the update form loads/stores for which a constant operand 2 would need + // to turn into a displacement and move operand 1 to the operand 2 position. + III.ImmOpNo = 2; + III.ConstantOpNo = 2; + III.ImmWidth = 16; + III.ImmMustBeMultipleOf = 1; + switch (Opc) { + default: return false; + case PPC::ADD4: + case PPC::ADD8: + III.SignedImm = true; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 1; + III.IsCommutative = true; + III.ImmOpcode = Opc == PPC::ADD4 ? PPC::ADDI : PPC::ADDI8; + break; + case PPC::ADDC: + case PPC::ADDC8: + III.SignedImm = true; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = true; + III.ImmOpcode = Opc == PPC::ADDC ? PPC::ADDIC : PPC::ADDIC8; + break; + case PPC::ADDCo: + III.SignedImm = true; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = true; + III.ImmOpcode = PPC::ADDICo; + break; + case PPC::SUBFC: + case PPC::SUBFC8: + III.SignedImm = true; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = false; + III.ImmOpcode = Opc == PPC::SUBFC ? PPC::SUBFIC : PPC::SUBFIC8; + break; + case PPC::CMPW: + case PPC::CMPD: + III.SignedImm = true; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = false; + III.ImmOpcode = Opc == PPC::CMPW ? PPC::CMPWI : PPC::CMPDI; + break; + case PPC::CMPLW: + case PPC::CMPLD: + III.SignedImm = false; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = false; + III.ImmOpcode = Opc == PPC::CMPLW ? 
PPC::CMPLWI : PPC::CMPLDI; + break; + case PPC::ANDo: + case PPC::AND8o: + case PPC::OR: + case PPC::OR8: + case PPC::XOR: + case PPC::XOR8: + III.SignedImm = false; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = true; + switch(Opc) { + default: llvm_unreachable("Unknown opcode"); + case PPC::ANDo: III.ImmOpcode = PPC::ANDIo; break; + case PPC::AND8o: III.ImmOpcode = PPC::ANDIo8; break; + case PPC::OR: III.ImmOpcode = PPC::ORI; break; + case PPC::OR8: III.ImmOpcode = PPC::ORI8; break; + case PPC::XOR: III.ImmOpcode = PPC::XORI; break; + case PPC::XOR8: III.ImmOpcode = PPC::XORI8; break; + } + break; + case PPC::RLWNM: + case PPC::RLWNM8: + case PPC::RLWNMo: + case PPC::RLWNM8o: + case PPC::RLDCL: + case PPC::RLDCLo: + case PPC::RLDCR: + case PPC::RLDCRo: + case PPC::SLW: + case PPC::SLW8: + case PPC::SLWo: + case PPC::SLW8o: + case PPC::SRW: + case PPC::SRW8: + case PPC::SRWo: + case PPC::SRW8o: + case PPC::SRAW: + case PPC::SRAWo: + case PPC::SLD: + case PPC::SLDo: + case PPC::SRD: + case PPC::SRDo: + case PPC::SRAD: + case PPC::SRADo: + III.SignedImm = false; + III.ZeroIsSpecialOrig = 0; + III.ZeroIsSpecialNew = 0; + III.IsCommutative = false; + // This isn't actually true, but the instructions ignore any of the + // upper bits, so any immediate loaded with an LI is acceptable. + III.ImmWidth = 16; + switch(Opc) { + default: llvm_unreachable("Unknown opcode"); + case PPC::RLWNM: III.ImmOpcode = PPC::RLWINM; break; + case PPC::RLWNM8: III.ImmOpcode = PPC::RLWINM8; break; + case PPC::RLWNMo: III.ImmOpcode = PPC::RLWINMo; break; + case PPC::RLWNM8o: III.ImmOpcode = PPC::RLWINM8o; break; + case PPC::RLDCL: III.ImmOpcode = PPC::RLDICL; break; + case PPC::RLDCLo: III.ImmOpcode = PPC::RLDICLo; break; + case PPC::RLDCR: III.ImmOpcode = PPC::RLDICR; break; + case PPC::RLDCRo: III.ImmOpcode = PPC::RLDICRo; break; + case PPC::SLW: III.ImmOpcode = PPC::RLWINM; break; + case PPC::SLW8: III.ImmOpcode = PPC::RLWINM8; break; + case PPC::SLWo: III.ImmOpcode = PPC::RLWINMo; break; + case PPC::SLW8o: III.ImmOpcode = PPC::RLWINM8o; break; + case PPC::SRW: III.ImmOpcode = PPC::RLWINM; break; + case PPC::SRW8: III.ImmOpcode = PPC::RLWINM8; break; + case PPC::SRWo: III.ImmOpcode = PPC::RLWINMo; break; + case PPC::SRW8o: III.ImmOpcode = PPC::RLWINM8o; break; + case PPC::SRAW: III.ImmOpcode = PPC::SRAWI; break; + case PPC::SRAWo: III.ImmOpcode = PPC::SRAWIo; break; + case PPC::SLD: III.ImmOpcode = PPC::RLDICR; break; + case PPC::SLDo: III.ImmOpcode = PPC::RLDICRo; break; + case PPC::SRD: III.ImmOpcode = PPC::RLDICL; break; + case PPC::SRDo: III.ImmOpcode = PPC::RLDICLo; break; + case PPC::SRAD: III.ImmOpcode = PPC::SRADI; break; + case PPC::SRADo: III.ImmOpcode = PPC::SRADIo; break; + } + break; + // Loads and stores: + case PPC::LBZX: + case PPC::LBZX8: + case PPC::LHZX: + case PPC::LHZX8: + case PPC::LHAX: + case PPC::LHAX8: + case PPC::LWZX: + case PPC::LWZX8: + case PPC::LWAX: + case PPC::LDX: + case PPC::LFSX: + case PPC::LFDX: + case PPC::STBX: + case PPC::STBX8: + case PPC::STHX: + case PPC::STHX8: + case PPC::STWX: + case PPC::STWX8: + case PPC::STDX: + case PPC::STFSX: + case PPC::STFDX: + III.SignedImm = true; + III.ZeroIsSpecialOrig = 1; + III.ZeroIsSpecialNew = 2; + III.IsCommutative = true; + III.ImmOpNo = 1; + III.ConstantOpNo = 2; + switch(Opc) { + default: llvm_unreachable("Unknown opcode"); + case PPC::LBZX: III.ImmOpcode = PPC::LBZ; break; + case PPC::LBZX8: III.ImmOpcode = PPC::LBZ8; break; + case PPC::LHZX: III.ImmOpcode = PPC::LHZ; break; + case PPC::LHZX8: 
III.ImmOpcode = PPC::LHZ8; break; + case PPC::LHAX: III.ImmOpcode = PPC::LHA; break; + case PPC::LHAX8: III.ImmOpcode = PPC::LHA8; break; + case PPC::LWZX: III.ImmOpcode = PPC::LWZ; break; + case PPC::LWZX8: III.ImmOpcode = PPC::LWZ8; break; + case PPC::LWAX: + III.ImmOpcode = PPC::LWA; + III.ImmMustBeMultipleOf = 4; + break; + case PPC::LDX: III.ImmOpcode = PPC::LD; III.ImmMustBeMultipleOf = 4; break; + case PPC::LFSX: III.ImmOpcode = PPC::LFS; break; + case PPC::LFDX: III.ImmOpcode = PPC::LFD; break; + case PPC::STBX: III.ImmOpcode = PPC::STB; break; + case PPC::STBX8: III.ImmOpcode = PPC::STB8; break; + case PPC::STHX: III.ImmOpcode = PPC::STH; break; + case PPC::STHX8: III.ImmOpcode = PPC::STH8; break; + case PPC::STWX: III.ImmOpcode = PPC::STW; break; + case PPC::STWX8: III.ImmOpcode = PPC::STW8; break; + case PPC::STDX: + III.ImmOpcode = PPC::STD; + III.ImmMustBeMultipleOf = 4; + break; + case PPC::STFSX: III.ImmOpcode = PPC::STFS; break; + case PPC::STFDX: III.ImmOpcode = PPC::STFD; break; + } + break; + case PPC::LBZUX: + case PPC::LBZUX8: + case PPC::LHZUX: + case PPC::LHZUX8: + case PPC::LHAUX: + case PPC::LHAUX8: + case PPC::LWZUX: + case PPC::LWZUX8: + case PPC::LDUX: + case PPC::LFSUX: + case PPC::LFDUX: + case PPC::STBUX: + case PPC::STBUX8: + case PPC::STHUX: + case PPC::STHUX8: + case PPC::STWUX: + case PPC::STWUX8: + case PPC::STDUX: + case PPC::STFSUX: + case PPC::STFDUX: + III.SignedImm = true; + III.ZeroIsSpecialOrig = 2; + III.ZeroIsSpecialNew = 3; + III.IsCommutative = false; + III.ImmOpNo = 2; + III.ConstantOpNo = 3; + switch(Opc) { + default: llvm_unreachable("Unknown opcode"); + case PPC::LBZUX: III.ImmOpcode = PPC::LBZU; break; + case PPC::LBZUX8: III.ImmOpcode = PPC::LBZU8; break; + case PPC::LHZUX: III.ImmOpcode = PPC::LHZU; break; + case PPC::LHZUX8: III.ImmOpcode = PPC::LHZU8; break; + case PPC::LHAUX: III.ImmOpcode = PPC::LHAU; break; + case PPC::LHAUX8: III.ImmOpcode = PPC::LHAU8; break; + case PPC::LWZUX: III.ImmOpcode = PPC::LWZU; break; + case PPC::LWZUX8: III.ImmOpcode = PPC::LWZU8; break; + case PPC::LDUX: + III.ImmOpcode = PPC::LDU; + III.ImmMustBeMultipleOf = 4; + break; + case PPC::LFSUX: III.ImmOpcode = PPC::LFSU; break; + case PPC::LFDUX: III.ImmOpcode = PPC::LFDU; break; + case PPC::STBUX: III.ImmOpcode = PPC::STBU; break; + case PPC::STBUX8: III.ImmOpcode = PPC::STBU8; break; + case PPC::STHUX: III.ImmOpcode = PPC::STHU; break; + case PPC::STHUX8: III.ImmOpcode = PPC::STHU8; break; + case PPC::STWUX: III.ImmOpcode = PPC::STWU; break; + case PPC::STWUX8: III.ImmOpcode = PPC::STWU8; break; + case PPC::STDUX: + III.ImmOpcode = PPC::STDU; + III.ImmMustBeMultipleOf = 4; + break; + case PPC::STFSUX: III.ImmOpcode = PPC::STFSU; break; + case PPC::STFDUX: III.ImmOpcode = PPC::STFDU; break; + } + break; + // Power9 only. 
+ case PPC::LXVX:
+ case PPC::LXSSPX:
+ case PPC::LXSDX:
+ case PPC::STXVX:
+ case PPC::STXSSPX:
+ case PPC::STXSDX:
+ if (!Subtarget.hasP9Vector())
+ return false;
+ III.SignedImm = true;
+ III.ZeroIsSpecialOrig = 1;
+ III.ZeroIsSpecialNew = 2;
+ III.IsCommutative = true;
+ III.ImmOpNo = 1;
+ III.ConstantOpNo = 2;
+ switch(Opc) {
+ default: llvm_unreachable("Unknown opcode");
+ case PPC::LXVX:
+ III.ImmOpcode = PPC::LXV;
+ III.ImmMustBeMultipleOf = 16;
+ break;
+ case PPC::LXSSPX:
+ III.ImmOpcode = PPC::LXSSP;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ case PPC::LXSDX:
+ III.ImmOpcode = PPC::LXSD;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ case PPC::STXVX:
+ III.ImmOpcode = PPC::STXV;
+ III.ImmMustBeMultipleOf = 16;
+ break;
+ case PPC::STXSSPX:
+ III.ImmOpcode = PPC::STXSSP;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ case PPC::STXSDX:
+ III.ImmOpcode = PPC::STXSD;
+ III.ImmMustBeMultipleOf = 4;
+ break;
+ }
+ break;
+ }
+ return true;
+}
+
+// Utility function for swapping two arbitrary operands of an instruction.
+static void swapMIOperands(MachineInstr &MI, unsigned Op1, unsigned Op2) {
+ assert(Op1 != Op2 && "Cannot swap operand with itself.");
+
+ unsigned MaxOp = std::max(Op1, Op2);
+ unsigned MinOp = std::min(Op1, Op2);
+ MachineOperand MOp1 = MI.getOperand(MinOp);
+ MachineOperand MOp2 = MI.getOperand(MaxOp);
+ MI.RemoveOperand(std::max(Op1, Op2));
+ MI.RemoveOperand(std::min(Op1, Op2));
+
+ // If the operands we are swapping are the two at the end (the common case)
+ // we can just remove both and add them in the opposite order.
+ if (MaxOp - MinOp == 1 && MI.getNumOperands() == MinOp) {
+ MI.addOperand(MOp2);
+ MI.addOperand(MOp1);
+ } else {
+ // Store all operands in a temporary vector, remove them and re-add in the
+ // right order.
+ SmallVector<MachineOperand, 2> MOps;
+ unsigned TotalOps = MI.getNumOperands() + 2; // We've already removed 2 ops.
+ for (unsigned i = MI.getNumOperands() - 1; i >= MinOp; i--) {
+ MOps.push_back(MI.getOperand(i));
+ MI.RemoveOperand(i);
+ }
+ // MOp2 needs to be added next.
+ MI.addOperand(MOp2);
+ // Now add the rest.
+ for (unsigned i = MI.getNumOperands(); i < TotalOps; i++) {
+ if (i == MaxOp)
+ MI.addOperand(MOp1);
+ else {
+ MI.addOperand(MOps.back());
+ MOps.pop_back();
+ }
+ }
+ }
+}
+
+bool PPCInstrInfo::transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III,
+ unsigned ConstantOpNo,
+ int64_t Imm) const {
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ bool PostRA = !MRI.isSSA();
+ // Exit early if we can't convert this.
+ if ((ConstantOpNo != III.ConstantOpNo) && !III.IsCommutative)
+ return false;
+ if (Imm % III.ImmMustBeMultipleOf)
+ return false;
+ if (III.SignedImm) {
+ APInt ActualValue(64, Imm, true);
+ if (!ActualValue.isSignedIntN(III.ImmWidth))
+ return false;
+ } else {
+ uint64_t UnsignedMax = (1 << III.ImmWidth) - 1;
+ if ((uint64_t)Imm > UnsignedMax)
+ return false;
+ }
+
+ // If we're post-RA and the instructions don't agree on whether register zero
+ // is special, we can transform this as long as the register operand that
+ // will end up in the location where zero is special isn't R0.
+ if (PostRA && III.ZeroIsSpecialOrig != III.ZeroIsSpecialNew) {
+ unsigned PosForOrigZero = III.ZeroIsSpecialOrig ?
III.ZeroIsSpecialOrig :
+ III.ZeroIsSpecialNew + 1;
+ unsigned OrigZeroReg = MI.getOperand(PosForOrigZero).getReg();
+ unsigned NewZeroReg = MI.getOperand(III.ZeroIsSpecialNew).getReg();
+ // If R0 is in the operand where zero is special for the new instruction,
+ // it is unsafe to transform if the constant operand isn't that operand.
+ if ((NewZeroReg == PPC::R0 || NewZeroReg == PPC::X0) &&
+ ConstantOpNo != III.ZeroIsSpecialNew)
+ return false;
+ if ((OrigZeroReg == PPC::R0 || OrigZeroReg == PPC::X0) &&
+ ConstantOpNo != PosForOrigZero)
+ return false;
+ }
+
+ unsigned Opc = MI.getOpcode();
+ bool SpecialShift32 =
+ Opc == PPC::SLW || Opc == PPC::SLWo || Opc == PPC::SRW || Opc == PPC::SRWo;
+ bool SpecialShift64 =
+ Opc == PPC::SLD || Opc == PPC::SLDo || Opc == PPC::SRD || Opc == PPC::SRDo;
+ bool SetCR = Opc == PPC::SLWo || Opc == PPC::SRWo ||
+ Opc == PPC::SLDo || Opc == PPC::SRDo;
+ bool RightShift =
+ Opc == PPC::SRW || Opc == PPC::SRWo || Opc == PPC::SRD || Opc == PPC::SRDo;
+
+ MI.setDesc(get(III.ImmOpcode));
+ if (ConstantOpNo == III.ConstantOpNo) {
+ // Converting shifts to immediate form is a bit tricky since they may do
+ // one of three things:
+ // 1. If the shift amount is between OpSize and 2*OpSize, the result is zero
+ // 2. If the shift amount is zero, the result is unchanged (save for maybe
+ // setting CR0)
+ // 3. If the shift amount is in [1, OpSize), it's just a shift
+ if (SpecialShift32 || SpecialShift64) {
+ LoadImmediateInfo LII;
+ LII.Imm = 0;
+ LII.SetCR = SetCR;
+ LII.Is64Bit = SpecialShift64;
+ uint64_t ShAmt = Imm & (SpecialShift32 ? 0x1F : 0x3F);
+ if (Imm & (SpecialShift32 ? 0x20 : 0x40))
+ replaceInstrWithLI(MI, LII);
+ // Shifts by zero don't change the value. If we don't need to set CR0,
+ // just convert this to a COPY. Can't do this post-RA since we've already
+ // cleaned up the copies.
+ else if (!SetCR && ShAmt == 0 && !PostRA) {
+ MI.RemoveOperand(2);
+ MI.setDesc(get(PPC::COPY));
+ } else {
+ // The 32 bit and 64 bit instructions are quite different.
+ if (SpecialShift32) {
+ // Left shifts use (N, 0, 31-N), right shifts use (32-N, N, 31).
+ uint64_t SH = RightShift ? 32 - ShAmt : ShAmt;
+ uint64_t MB = RightShift ? ShAmt : 0;
+ uint64_t ME = RightShift ? 31 : 31 - ShAmt;
+ MI.getOperand(III.ConstantOpNo).ChangeToImmediate(SH);
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(MB)
+ .addImm(ME);
+ } else {
+ // Left shifts use (N, 63-N), right shifts use (64-N, N).
+ uint64_t SH = RightShift ? 64 - ShAmt : ShAmt;
+ uint64_t ME = RightShift ? ShAmt : 63 - ShAmt;
+ MI.getOperand(III.ConstantOpNo).ChangeToImmediate(SH);
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(ME);
+ }
+ }
+ } else
+ MI.getOperand(ConstantOpNo).ChangeToImmediate(Imm);
+ }
+ // Convert commutative instructions (switch the operands and convert the
+ // desired one to an immediate).
+ else if (III.IsCommutative) {
+ MI.getOperand(ConstantOpNo).ChangeToImmediate(Imm);
+ swapMIOperands(MI, ConstantOpNo, III.ConstantOpNo);
+ } else
+ llvm_unreachable("Should have exited early!");
+
+ // For instructions for which the constant register replaces a different
+ // operand than where the immediate goes, we need to swap them.
+ if (III.ConstantOpNo != III.ImmOpNo)
+ swapMIOperands(MI, III.ConstantOpNo, III.ImmOpNo);
+
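// The 64-bit (SH, MB/ME) encodings chosen above can be checked against plain
// C shift semantics: sldi N is rldicr(N, 63-N) and srdi N is rldicl(64-N, N).
// A standalone sketch of the rotate-and-mask semantics (illustrative helpers,
// not LLVM code):
#include <cassert>
#include <cstdint>

static uint64_t rotl64(uint64_t V, unsigned SH) {
  return SH ? (V << SH) | (V >> (64 - SH)) : V;
}
// rldicl: rotate left, then keep big-endian bits MB..63 (clear to the left).
static uint64_t rldicl(uint64_t V, unsigned SH, unsigned MB) {
  uint64_t Mask = MB ? (1ULL << (64 - MB)) - 1 : ~0ULL;
  return rotl64(V, SH) & Mask;
}
// rldicr: rotate left, then keep big-endian bits 0..ME (clear to the right).
static uint64_t rldicr(uint64_t V, unsigned SH, unsigned ME) {
  uint64_t Mask = ~((1ULL << (63 - ME)) - 1);
  return rotl64(V, SH) & Mask;
}

int main() {
  uint64_t V = 0x0123456789ABCDEFULL;
  for (unsigned N = 1; N < 64; ++N) {
    assert(rldicr(V, N, 63 - N) == V << N); // sldi N
    assert(rldicl(V, 64 - N, N) == V >> N); // srdi N
  }
  return 0;
}
+ // If the R0/X0 register is special for the original instruction and not for
+ // the new instruction (or vice versa), we need to fix up the register class.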
+ if (!PostRA && III.ZeroIsSpecialOrig != III.ZeroIsSpecialNew) { + if (!III.ZeroIsSpecialOrig) { + unsigned RegToModify = MI.getOperand(III.ZeroIsSpecialNew).getReg(); + const TargetRegisterClass *NewRC = + MRI.getRegClass(RegToModify)->hasSuperClassEq(&PPC::GPRCRegClass) ? + &PPC::GPRC_and_GPRC_NOR0RegClass : &PPC::G8RC_and_G8RC_NOX0RegClass; + MRI.setRegClass(RegToModify, NewRC); + } + } + return true; +} + const TargetRegisterClass * PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const { if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass) @@ -2012,3 +2984,290 @@ PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const { int PPCInstrInfo::getRecordFormOpcode(unsigned Opcode) { return PPC::getRecordFormOpcode(Opcode); } + +// This function returns true if the machine instruction +// always outputs a value by sign-extending a 32 bit value, +// i.e. 0 to 31-th bits are same as 32-th bit. +static bool isSignExtendingOp(const MachineInstr &MI) { + int Opcode = MI.getOpcode(); + if (Opcode == PPC::LI || Opcode == PPC::LI8 || + Opcode == PPC::LIS || Opcode == PPC::LIS8 || + Opcode == PPC::SRAW || Opcode == PPC::SRAWo || + Opcode == PPC::SRAWI || Opcode == PPC::SRAWIo || + Opcode == PPC::LWA || Opcode == PPC::LWAX || + Opcode == PPC::LWA_32 || Opcode == PPC::LWAX_32 || + Opcode == PPC::LHA || Opcode == PPC::LHAX || + Opcode == PPC::LHA8 || Opcode == PPC::LHAX8 || + Opcode == PPC::LBZ || Opcode == PPC::LBZX || + Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 || + Opcode == PPC::LBZU || Opcode == PPC::LBZUX || + Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8 || + Opcode == PPC::LHZ || Opcode == PPC::LHZX || + Opcode == PPC::LHZ8 || Opcode == PPC::LHZX8 || + Opcode == PPC::LHZU || Opcode == PPC::LHZUX || + Opcode == PPC::LHZU8 || Opcode == PPC::LHZUX8 || + Opcode == PPC::EXTSB || Opcode == PPC::EXTSBo || + Opcode == PPC::EXTSH || Opcode == PPC::EXTSHo || + Opcode == PPC::EXTSB8 || Opcode == PPC::EXTSH8 || + Opcode == PPC::EXTSW || Opcode == PPC::EXTSWo || + Opcode == PPC::EXTSH8_32_64 || Opcode == PPC::EXTSW_32_64 || + Opcode == PPC::EXTSB8_32_64) + return true; + + if (Opcode == PPC::RLDICL && MI.getOperand(3).getImm() >= 33) + return true; + + if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINMo || + Opcode == PPC::RLWNM || Opcode == PPC::RLWNMo) && + MI.getOperand(3).getImm() > 0 && + MI.getOperand(3).getImm() <= MI.getOperand(4).getImm()) + return true; + + return false; +} + +// This function returns true if the machine instruction +// always outputs zeros in higher 32 bits. +static bool isZeroExtendingOp(const MachineInstr &MI) { + int Opcode = MI.getOpcode(); + // The 16-bit immediate is sign-extended in li/lis. + // If the most significant bit is zero, all higher bits are zero. + if (Opcode == PPC::LI || Opcode == PPC::LI8 || + Opcode == PPC::LIS || Opcode == PPC::LIS8) { + int64_t Imm = MI.getOperand(1).getImm(); + if (((uint64_t)Imm & ~0x7FFFuLL) == 0) + return true; + } + + // We have some variations of rotate-and-mask instructions + // that clear higher 32-bits. 
+ if ((Opcode == PPC::RLDICL || Opcode == PPC::RLDICLo ||
+ Opcode == PPC::RLDCL || Opcode == PPC::RLDCLo ||
+ Opcode == PPC::RLDICL_32_64) &&
+ MI.getOperand(3).getImm() >= 32)
+ return true;
+
+ if ((Opcode == PPC::RLDIC || Opcode == PPC::RLDICo) &&
+ MI.getOperand(3).getImm() >= 32 &&
+ MI.getOperand(3).getImm() <= 63 - MI.getOperand(2).getImm())
+ return true;
+
+ if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINMo ||
+ Opcode == PPC::RLWNM || Opcode == PPC::RLWNMo ||
+ Opcode == PPC::RLWINM8 || Opcode == PPC::RLWNM8) &&
+ MI.getOperand(3).getImm() <= MI.getOperand(4).getImm())
+ return true;
+
+ // There are other instructions that clear higher 32-bits.
+ if (Opcode == PPC::CNTLZW || Opcode == PPC::CNTLZWo ||
+ Opcode == PPC::CNTTZW || Opcode == PPC::CNTTZWo ||
+ Opcode == PPC::CNTLZW8 || Opcode == PPC::CNTTZW8 ||
+ Opcode == PPC::CNTLZD || Opcode == PPC::CNTLZDo ||
+ Opcode == PPC::CNTTZD || Opcode == PPC::CNTTZDo ||
+ Opcode == PPC::POPCNTD || Opcode == PPC::POPCNTW ||
+ Opcode == PPC::SLW || Opcode == PPC::SLWo ||
+ Opcode == PPC::SRW || Opcode == PPC::SRWo ||
+ Opcode == PPC::SLW8 || Opcode == PPC::SRW8 ||
+ Opcode == PPC::SLWI || Opcode == PPC::SLWIo ||
+ Opcode == PPC::SRWI || Opcode == PPC::SRWIo ||
+ Opcode == PPC::LWZ || Opcode == PPC::LWZX ||
+ Opcode == PPC::LWZU || Opcode == PPC::LWZUX ||
+ Opcode == PPC::LWBRX || Opcode == PPC::LHBRX ||
+ Opcode == PPC::LHZ || Opcode == PPC::LHZX ||
+ Opcode == PPC::LHZU || Opcode == PPC::LHZUX ||
+ Opcode == PPC::LBZ || Opcode == PPC::LBZX ||
+ Opcode == PPC::LBZU || Opcode == PPC::LBZUX ||
+ Opcode == PPC::LWZ8 || Opcode == PPC::LWZX8 ||
+ Opcode == PPC::LWZU8 || Opcode == PPC::LWZUX8 ||
+ Opcode == PPC::LWBRX8 || Opcode == PPC::LHBRX8 ||
+ Opcode == PPC::LHZ8 || Opcode == PPC::LHZX8 ||
+ Opcode == PPC::LHZU8 || Opcode == PPC::LHZUX8 ||
+ Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 ||
+ Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8 ||
+ Opcode == PPC::ANDIo || Opcode == PPC::ANDISo ||
+ Opcode == PPC::ROTRWI || Opcode == PPC::ROTRWIo ||
+ Opcode == PPC::EXTLWI || Opcode == PPC::EXTLWIo ||
+ Opcode == PPC::MFVSRWZ)
+ return true;
+
+ return false;
+}
+
+// This function returns true if the input MachineInstr is a TOC save
+// instruction.
+bool PPCInstrInfo::isTOCSaveMI(const MachineInstr &MI) const {
+ if (!MI.getOperand(1).isImm() || !MI.getOperand(2).isReg())
+ return false;
+ unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
+ unsigned StackOffset = MI.getOperand(1).getImm();
+ unsigned StackReg = MI.getOperand(2).getReg();
+ if (StackReg == PPC::X1 && StackOffset == TOCSaveOffset)
+ return true;
+
+ return false;
+}
+
+// We limit the max depth to track incoming values of PHIs or binary ops
+// (e.g. AND) to avoid excessive cost.
+const unsigned MAX_DEPTH = 1;
+
+bool
+PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
+ const unsigned Depth) const {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineRegisterInfo *MRI = &MF->getRegInfo();
+
+ // If we know this instruction returns a sign- or zero-extended result,
+ // return true.
+ if (SignExt ? isSignExtendingOp(MI) :
+ isZeroExtendingOp(MI))
+ return true;
+
+ switch (MI.getOpcode()) {
+ case PPC::COPY: {
+ unsigned SrcReg = MI.getOperand(1).getReg();
+
+ // In both ELFv1 and v2 ABI, method parameters and the return value
+ // are sign- or zero-extended.
+ if (MF->getSubtarget<PPCSubtarget>().isSVR4ABI()) {
+ const PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
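// Two of the claims above, sanity-checked on plain integers: an li immediate
// whose sign bit is clear leaves all higher bits zero, and rldicl with
// MB >= 32 cannot leave anything set in the upper word. A standalone sketch,
// mirroring only the semantics:
#include <cassert>
#include <cstdint>

int main() {
  // li sign-extends its 16-bit immediate; a clear bit 15 means zero upper bits.
  for (uint32_t I = 0; I <= 0x7FFF; ++I)
    assert(((uint64_t)(int64_t)(int16_t)I >> 32) == 0);
  // rldicl keeps big-endian bits MB..63; MB >= 32 leaves at most 32 low bits.
  uint64_t V = 0xFFFFFFFFFFFFFFFFULL;
  for (unsigned MB = 32; MB <= 63; ++MB) {
    uint64_t Res = V & ((1ULL << (64 - MB)) - 1);
    assert((Res >> 32) == 0);
  }
  return 0;
}
+ // We check the ZExt/SExt flags for a method parameter.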
+ if (MI.getParent()->getBasicBlock() ==
+ &MF->getFunction().getEntryBlock()) {
+ unsigned VReg = MI.getOperand(0).getReg();
+ if (MF->getRegInfo().isLiveIn(VReg))
+ return SignExt ? FuncInfo->isLiveInSExt(VReg) :
+ FuncInfo->isLiveInZExt(VReg);
+ }
+
+ // For a method return value, we check the ZExt/SExt flags in the attribute.
+ // We assume the following code sequence for a method call.
+ // ADJCALLSTACKDOWN 32, implicit dead %r1, implicit %r1
+ // BL8_NOP @func,...
+ // ADJCALLSTACKUP 32, 0, implicit dead %r1, implicit %r1
+ // %5 = COPY %x3; G8RC:%5
+ if (SrcReg == PPC::X3) {
+ const MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::const_instr_iterator II =
+ MachineBasicBlock::const_instr_iterator(&MI);
+ if (II != MBB->instr_begin() &&
+ (--II)->getOpcode() == PPC::ADJCALLSTACKUP) {
+ const MachineInstr &CallMI = *(--II);
+ if (CallMI.isCall() && CallMI.getOperand(0).isGlobal()) {
+ const Function *CalleeFn =
+ dyn_cast<Function>(CallMI.getOperand(0).getGlobal());
+ if (!CalleeFn)
+ return false;
+ const IntegerType *IntTy =
+ dyn_cast<IntegerType>(CalleeFn->getReturnType());
+ const AttributeSet &Attrs =
+ CalleeFn->getAttributes().getRetAttributes();
+ if (IntTy && IntTy->getBitWidth() <= 32)
+ return Attrs.hasAttribute(SignExt ? Attribute::SExt :
+ Attribute::ZExt);
+ }
+ }
+ }
+ }
+
+ // If this is a copy from another register, we recursively check the source.
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ return false;
+ const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+ if (SrcMI != NULL)
+ return isSignOrZeroExtended(*SrcMI, SignExt, Depth);
+
+ return false;
+ }
+
+ case PPC::ANDIo:
+ case PPC::ANDISo:
+ case PPC::ORI:
+ case PPC::ORIS:
+ case PPC::XORI:
+ case PPC::XORIS:
+ case PPC::ANDIo8:
+ case PPC::ANDISo8:
+ case PPC::ORI8:
+ case PPC::ORIS8:
+ case PPC::XORI8:
+ case PPC::XORIS8: {
+ // A logical operation with a 16-bit immediate does not change the upper
+ // bits. So, we track the operand register as we do for a register copy.
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ return false;
+ const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+ if (SrcMI != NULL)
+ return isSignOrZeroExtended(*SrcMI, SignExt, Depth);
+
+ return false;
+ }
+
+ // If all incoming values are sign-/zero-extended,
+ // the output of OR, ISEL or PHI is also sign-/zero-extended.
+ case PPC::OR:
+ case PPC::OR8:
+ case PPC::ISEL:
+ case PPC::PHI: {
+ if (Depth >= MAX_DEPTH)
+ return false;
+
+ // The input registers for PHI are operand 1, 3, ...
+ // The input registers for others are operand 1 and 2.
+ unsigned E = 3, D = 1;
+ if (MI.getOpcode() == PPC::PHI) {
+ E = MI.getNumOperands();
+ D = 2;
+ }
+
+ for (unsigned I = 1; I != E; I += D) {
+ if (MI.getOperand(I).isReg()) {
+ unsigned SrcReg = MI.getOperand(I).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ return false;
+ const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+ if (SrcMI == NULL || !isSignOrZeroExtended(*SrcMI, SignExt, Depth+1))
+ return false;
+ }
+ else
+ return false;
+ }
+ return true;
+ }
+
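// The AND case handled next relies on two facts about 64-bit integers: if
// either input has its upper 32 bits clear, so does the AND; and if both
// inputs are sign-extensions of 32-bit values, so is the AND. A standalone
// sanity check (sketch, not LLVM code):
#include <cassert>
#include <cstdint>

static bool isZExt64(uint64_t V) { return (V >> 32) == 0; }
static bool isSExt64(uint64_t V) {
  return (int64_t)V == (int64_t)(int32_t)(uint32_t)V;
}

int main() {
  uint64_t Z = 0x00000000DEADBEEFULL; // upper word clear (zero-extended)
  uint64_t S = 0xFFFFFFFF80000000ULL; // sign-extension of a 32-bit value
  uint64_t X = 0x123456789ABCDEF0ULL; // neither
  assert(isZExt64(Z & X));            // one zero-extended input is enough
  assert(isSExt64(S & 0xFFFFFFFFFFFFFFFEULL)); // both sign-extended inputs
  return 0;
}
+ // If at least one of the incoming values of an AND is zero-extended,
+ // then the output is also zero-extended. If both of the incoming values
+ // are sign-extended, then the output is also sign-extended.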
+ case PPC::AND: + case PPC::AND8: { + if (Depth >= MAX_DEPTH) + return false; + + assert(MI.getOperand(1).isReg() && MI.getOperand(2).isReg()); + + unsigned SrcReg1 = MI.getOperand(1).getReg(); + unsigned SrcReg2 = MI.getOperand(2).getReg(); + + if (!TargetRegisterInfo::isVirtualRegister(SrcReg1) || + !TargetRegisterInfo::isVirtualRegister(SrcReg2)) + return false; + + const MachineInstr *MISrc1 = MRI->getVRegDef(SrcReg1); + const MachineInstr *MISrc2 = MRI->getVRegDef(SrcReg2); + if (!MISrc1 || !MISrc2) + return false; + + if(SignExt) + return isSignOrZeroExtended(*MISrc1, SignExt, Depth+1) && + isSignOrZeroExtended(*MISrc2, SignExt, Depth+1); + else + return isSignOrZeroExtended(*MISrc1, SignExt, Depth+1) || + isSignOrZeroExtended(*MISrc2, SignExt, Depth+1); + } + + default: + break; + } + return false; +} diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index b0629c88cf57b..4271c50127a1d 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -16,7 +16,7 @@ #include "PPC.h" #include "PPCRegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "PPCGenInstrInfo.inc" @@ -72,6 +72,41 @@ enum { }; } // end namespace PPCII +// Instructions that have an immediate form might be convertible to that +// form if the correct input is a result of a load immediate. In order to +// know whether the transformation is special, we might need to know some +// of the details of the two forms. +struct ImmInstrInfo { + // Is the immediate field in the immediate form signed or unsigned? + uint64_t SignedImm : 1; + // Does the immediate need to be a multiple of some value? + uint64_t ImmMustBeMultipleOf : 5; + // Is R0/X0 treated specially by the original r+r instruction? + // If so, in which operand? + uint64_t ZeroIsSpecialOrig : 3; + // Is R0/X0 treated specially by the new r+i instruction? + // If so, in which operand? + uint64_t ZeroIsSpecialNew : 3; + // Is the operation commutative? + uint64_t IsCommutative : 1; + // The operand number to check for load immediate. + uint64_t ConstantOpNo : 3; + // The operand number for the immediate. + uint64_t ImmOpNo : 3; + // The opcode of the new instruction. + uint64_t ImmOpcode : 16; + // The size of the immediate. + uint64_t ImmWidth : 5; +}; + +// Information required to convert an instruction to just a materialized +// immediate. +struct LoadImmediateInfo { + unsigned Imm : 16; + unsigned Is64Bit : 1; + unsigned SetCR : 1; +}; + class PPCSubtarget; class PPCInstrInfo : public PPCGenInstrInfo { PPCSubtarget &Subtarget; @@ -87,6 +122,10 @@ class PPCInstrInfo : public PPCGenInstrInfo { const TargetRegisterClass *RC, SmallVectorImpl &NewMIs, bool &NonRI, bool &SpillsVRS) const; + bool transformToImmForm(MachineInstr &MI, const ImmInstrInfo &III, + unsigned ConstantOpNo, int64_t Imm) const; + MachineInstr *getConstantDefMI(MachineInstr &MI, unsigned &ConstOp, + bool &SeenIntermediateUse) const; virtual void anchor(); protected: @@ -282,6 +321,9 @@ public: ArrayRef> getSerializableBitmaskMachineOperandTargetFlags() const override; + // Expand VSX Memory Pseudo instruction to either a VSX or a FP instruction. + bool expandVSXMemPseudo(MachineInstr &MI) const; + // Lower pseudo instructions after register allocation. 
bool expandPostRAPseudo(MachineInstr &MI) const override; @@ -293,6 +335,36 @@ public: } const TargetRegisterClass *updatedRC(const TargetRegisterClass *RC) const; static int getRecordFormOpcode(unsigned Opcode); + + bool isTOCSaveMI(const MachineInstr &MI) const; + + bool isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, + const unsigned PhiDepth) const; + + /// Return true if the output of the instruction is always a sign-extended, + /// i.e. 0 to 31-th bits are same as 32-th bit. + bool isSignExtended(const MachineInstr &MI, const unsigned depth = 0) const { + return isSignOrZeroExtended(MI, true, depth); + } + + /// Return true if the output of the instruction is always zero-extended, + /// i.e. 0 to 31-th bits are all zeros + bool isZeroExtended(const MachineInstr &MI, const unsigned depth = 0) const { + return isSignOrZeroExtended(MI, false, depth); + } + + bool convertToImmediateForm(MachineInstr &MI, + MachineInstr **KilledDef = nullptr) const; + void replaceInstrWithLI(MachineInstr &MI, const LoadImmediateInfo &LII) const; + + // This is used to find the "true" source register for n + // Machine instruction. Returns the original SrcReg unless it is the target + // of a copy-like operation, in which case we chain backwards through all + // such operations to the ultimate source register. If a + // physical register is encountered, we stop the search. + static unsigned lookThruCopyLike(unsigned SrcReg, + const MachineRegisterInfo *MRI); + bool instrHasImmForm(const MachineInstr &MI, ImmInstrInfo &III) const; }; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index dd7fc2659102a..a932d05b24eef 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -181,7 +181,7 @@ def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>; def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>; -def PPCxxinsert : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>; +def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>; def PPCxxreverse : SDNode<"PPCISD::XXREVERSE", SDT_PPCVecReverse, []>; def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>; def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>; @@ -1057,6 +1057,20 @@ multiclass XSForm_1rc opcode, bits<9> xo, dag OOL, dag IOL, } } +multiclass XSForm_1r opcode, bits<9> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list pattern> { + let BaseName = asmbase in { + def NAME : XSForm_1, RecFormRel; + let Defs = [CR0] in + def o : XSForm_1, isDOT, RecFormRel; + } +} + multiclass XForm_26r opcode, bits<10> xo, dag OOL, dag IOL, string asmbase, string asmstr, InstrItinClass itin, list pattern> { @@ -1576,6 +1590,11 @@ def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 0)), (ICBT 0, xoaddr:$dst)>, Requires<[HasICBT]>; // inst prefetch (for read) // Atomic operations +// FIXME: some of these might be used with constant operands. This will result +// in constant materialization instructions that may be redundant. We currently +// clean this up in PPCMIPeephole with calls to +// PPCInstrInfo::convertToImmediateForm() but we should probably not emit them +// in the first place. let usesCustomInserter = 1 in { let Defs = [CR0] in { def ATOMIC_LOAD_ADD_I8 : Pseudo< @@ -2571,6 +2590,35 @@ let Uses = [RM] in { let Defs = [CR1] in def MFFSo : XForm_42<63, 583, (outs f8rc:$rT), (ins), "mffs. 
$rT", IIC_IntMFFS, []>, isDOT; + + def MFFSCE : X_FRT5_XO2_XO3_XO10<63, 0, 1, 583, (outs f8rc:$rT), (ins), + "mffsce $rT", IIC_IntMFFS, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + + def MFFSCDRN : X_FRT5_XO2_XO3_FRB5_XO10<63, 2, 4, 583, (outs f8rc:$rT), + (ins f8rc:$FRB), "mffscdrn $rT, $FRB", + IIC_IntMFFS, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + + def MFFSCDRNI : X_FRT5_XO2_XO3_DRM3_XO10<63, 2, 5, 583, (outs f8rc:$rT), + (ins u3imm:$DRM), + "mffscdrni $rT, $DRM", + IIC_IntMFFS, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + + def MFFSCRN : X_FRT5_XO2_XO3_FRB5_XO10<63, 2, 6, 583, (outs f8rc:$rT), + (ins f8rc:$FRB), "mffscrn $rT, $FRB", + IIC_IntMFFS, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + + def MFFSCRNI : X_FRT5_XO2_XO3_RM2_X10<63, 2, 7, 583, (outs f8rc:$rT), + (ins u2imm:$RM), "mffscrni $rT, $RM", + IIC_IntMFFS, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + + def MFFSL : X_FRT5_XO2_XO3_XO10<63, 3, 0, 583, (outs f8rc:$rT), (ins), + "mffsl $rT", IIC_IntMFFS, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; } let Predicates = [IsISA3_0] in { @@ -3890,6 +3938,63 @@ def STWCIX : XForm_base_r3xo<31, 917, (outs), (ins gprc:$RST, gprc:$A, gprc:$B), def STDCIX : XForm_base_r3xo<31, 1013, (outs), (ins gprc:$RST, gprc:$A, gprc:$B), "stdcix $RST, $A, $B", IIC_LdStLoad, []>; +// External PID Load Store Instructions + +def LBEPX : XForm_1<31, 95, (outs gprc:$rD), (ins memrr:$src), + "lbepx $rD, $src", IIC_LdStLoad, []>, + Requires<[IsE500]>; + +def LFDEPX : XForm_25<31, 607, (outs f8rc:$frD), (ins memrr:$src), + "lfdepx $frD, $src", IIC_LdStLFD, []>, + Requires<[IsE500]>; + +def LHEPX : XForm_1<31, 287, (outs gprc:$rD), (ins memrr:$src), + "lhepx $rD, $src", IIC_LdStLoad, []>, + Requires<[IsE500]>; + +def LWEPX : XForm_1<31, 31, (outs gprc:$rD), (ins memrr:$src), + "lwepx $rD, $src", IIC_LdStLoad, []>, + Requires<[IsE500]>; + +def STBEPX : XForm_8<31, 223, (outs), (ins gprc:$rS, memrr:$dst), + "stbepx $rS, $dst", IIC_LdStStore, []>, + Requires<[IsE500]>; + +def STFDEPX : XForm_28<31, 735, (outs), (ins f8rc:$frS, memrr:$dst), + "stfdepx $frS, $dst", IIC_LdStSTFD, []>, + Requires<[IsE500]>; + +def STHEPX : XForm_8<31, 415, (outs), (ins gprc:$rS, memrr:$dst), + "sthepx $rS, $dst", IIC_LdStStore, []>, + Requires<[IsE500]>; + +def STWEPX : XForm_8<31, 159, (outs), (ins gprc:$rS, memrr:$dst), + "stwepx $rS, $dst", IIC_LdStStore, []>, + Requires<[IsE500]>; + +def DCBFEP : DCB_Form<127, 0, (outs), (ins memrr:$dst), "dcbfep $dst", + IIC_LdStDCBF, []>, Requires<[IsE500]>; + +def DCBSTEP : DCB_Form<63, 0, (outs), (ins memrr:$dst), "dcbstep $dst", + IIC_LdStDCBF, []>, Requires<[IsE500]>; + +def DCBTEP : DCB_Form_hint<319, (outs), (ins memrr:$dst, u5imm:$TH), + "dcbtep $TH, $dst", IIC_LdStDCBF, []>, + Requires<[IsE500]>; + +def DCBTSTEP : DCB_Form_hint<255, (outs), (ins memrr:$dst, u5imm:$TH), + "dcbtstep $TH, $dst", IIC_LdStDCBF, []>, + Requires<[IsE500]>; + +def DCBZEP : DCB_Form<1023, 0, (outs), (ins memrr:$dst), "dcbzep $dst", + IIC_LdStDCBF, []>, Requires<[IsE500]>; + +def DCBZLEP : DCB_Form<1023, 1, (outs), (ins memrr:$dst), "dcbzlep $dst", + IIC_LdStDCBF, []>, Requires<[IsE500]>; + +def ICBIEP : XForm_1a<31, 991, (outs), (ins memrr:$src), "icbiep $src", + IIC_LdStICBI, []>, Requires<[IsE500]>; + //===----------------------------------------------------------------------===// // PowerPC Assembler Instruction Aliases // @@ -3908,6 +4013,7 @@ class PPCAsmPseudo let AsmString = asm; let isAsmParserOnly = 1; let isPseudo = 1; + let hasNoSchedulingInfo = 1; } def : InstAlias<"sc", (SC 0)>; @@ 
-4208,6 +4314,7 @@ def CLRLSLDI : PPCAsmPseudo<"clrlsldi $rA, $rS, $b, $n", (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>; def CLRLSLDIo : PPCAsmPseudo<"clrlsldi. $rA, $rS, $b, $n", (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>; +def SUBPCIS : PPCAsmPseudo<"subpcis $RT, $D", (ins g8rc:$RT, s16imm:$D)>; def : InstAlias<"rotldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>; def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>; @@ -4215,8 +4322,9 @@ def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; def : InstAlias<"clrldi $rA, $rS, $n", - (RLDICL_32 gprc:$rA, gprc:$rS, 0, u6imm:$n)>; + (RLDICL_32_64 g8rc:$rA, gprc:$rS, 0, u6imm:$n)>; def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; +def : InstAlias<"lnia $RT", (ADDPCIS g8rc:$RT, 0)>; def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>; @@ -4233,7 +4341,7 @@ def RLWNMobm : PPCAsmPseudo<"rlwnm. $rA, $rS, $n, $b", // These generic branch instruction forms are used for the assembler parser only. // Defs and Uses are conservative, since we don't know the BO value. -let PPC970_Unit = 7 in { +let PPC970_Unit = 7, isBranch = 1 in { let Defs = [CTR], Uses = [CTR, RM] in { def gBC : BForm_3<16, 0, 0, (outs), (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst), @@ -4550,7 +4658,7 @@ def : Pat<(i32 (bitreverse i32:$A)), // n = ((n >> 2) & 0x3333333333333333) | ((n << 2) & 0xCCCCCCCCCCCCCCCC); // Step 3: 4-bit swap (swap odd 4-bit and even 4-bit): // n = ((n >> 4) & 0x0F0F0F0F0F0F0F0F) | ((n << 4) & 0xF0F0F0F0F0F0F0F0); -// Step 4: byte reverse (Suppose n = [B1,B2,B3,B4,B5,B6,B7,B8]): +// Step 4: byte reverse (Suppose n = [B0,B1,B2,B3,B4,B5,B6,B7]): // Apply the same byte reverse algorithm mentioned above for the fast 32-bit // reverse to both the high 32 bit and low 32 bit of the 64 bit value. And // then OR them together to get the final result. @@ -4572,92 +4680,55 @@ def DWMaskValues { dag Hi4 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi4, 32, 31), 0xF0F0), 0xF0F0); } -def DWShift1 { - dag Right = (RLDICL $A, 63, 1); - dag Left = (RLDICR $A, 1, 62); -} - -def DWSwap1 { - dag Bit = (OR8 (AND8 DWShift1.Right, DWMaskValues.Lo1), - (AND8 DWShift1.Left, DWMaskValues.Hi1)); -} - -def DWShift2 { - dag Right = (RLDICL DWSwap1.Bit, 62, 2); - dag Left = (RLDICR DWSwap1.Bit, 2, 61); -} - -def DWSwap2 { - dag Bits = (OR8 (AND8 DWShift2.Right, DWMaskValues.Lo2), - (AND8 DWShift2.Left, DWMaskValues.Hi2)); -} - -def DWShift4 { - dag Right = (RLDICL DWSwap2.Bits, 60, 4); - dag Left = (RLDICR DWSwap2.Bits, 4, 59); -} - -def DWSwap4 { - dag Bits = (OR8 (AND8 DWShift4.Right, DWMaskValues.Lo4), - (AND8 DWShift4.Left, DWMaskValues.Hi4)); -} - -// Bit swap is done, now start byte swap. 
-def DWExtractLo32 { - dag SubReg = (i32 (EXTRACT_SUBREG DWSwap4.Bits, sub_32)); -} - -def DWRotateLo32 { - dag Left24 = (RLWINM DWExtractLo32.SubReg, 24, 0, 31); -} - -def DWLo32RotateInsertByte3 { - dag Left = (RLWIMI DWRotateLo32.Left24, DWExtractLo32.SubReg, 8, 8, 15); -} - -// Lower 32 bits in the right order -def DWLo32RotateInsertByte1 { - dag Left = - (RLWIMI DWLo32RotateInsertByte3.Left, DWExtractLo32.SubReg, 8, 24, 31); +def DWSwapInByte { + dag Swap1 = (OR8 (AND8 (RLDICL $A, 63, 1), DWMaskValues.Lo1), + (AND8 (RLDICR $A, 1, 62), DWMaskValues.Hi1)); + dag Swap2 = (OR8 (AND8 (RLDICL DWSwapInByte.Swap1, 62, 2), DWMaskValues.Lo2), + (AND8 (RLDICR DWSwapInByte.Swap1, 2, 61), DWMaskValues.Hi2)); + dag Swap4 = (OR8 (AND8 (RLDICL DWSwapInByte.Swap2, 60, 4), DWMaskValues.Lo4), + (AND8 (RLDICR DWSwapInByte.Swap2, 4, 59), DWMaskValues.Hi4)); } -def ExtendLo32 { - dag To64Bit = - (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - DWLo32RotateInsertByte1.Left, sub_32)); +// Intra-byte swap is done, now start inter-byte swap. +def DWBytes4567 { + dag Word = (i32 (EXTRACT_SUBREG DWSwapInByte.Swap4, sub_32)); } -def DWShiftHi32 { // SRDI DWSwap4.Bits, 32) - dag ToLo32 = (RLDICL DWSwap4.Bits, 32, 32); +def DWBytes7456 { + dag Word = (RLWINM DWBytes4567.Word, 24, 0, 31); } -def DWExtractHi32 { - dag SubReg = (i32 (EXTRACT_SUBREG DWShiftHi32.ToLo32, sub_32)); +def DWBytes7656 { + dag Word = (RLWIMI DWBytes7456.Word, DWBytes4567.Word, 8, 8, 15); } -def DWRotateHi32 { - dag Left24 = (RLWINM DWExtractHi32.SubReg, 24, 0, 31); +// B7 B6 B5 B4 in the right order +def DWBytes7654 { + dag Word = (RLWIMI DWBytes7656.Word, DWBytes4567.Word, 8, 24, 31); + dag DWord = + (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), DWBytes7654.Word, sub_32)); } -def DWHi32RotateInsertByte3 { - dag Left = (RLWIMI DWRotateHi32.Left24, DWExtractHi32.SubReg, 8, 8, 15); +def DWBytes0123 { + dag Word = (i32 (EXTRACT_SUBREG (RLDICL DWSwapInByte.Swap4, 32, 32), sub_32)); } -// High 32 bits in the right order, but in the low 32-bit position -def DWHi32RotateInsertByte1 { - dag Left = - (RLWIMI DWHi32RotateInsertByte3.Left, DWExtractHi32.SubReg, 8, 24, 31); +def DWBytes3012 { + dag Word = (RLWINM DWBytes0123.Word, 24, 0, 31); } -def ExtendHi32 { - dag To64Bit = - (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - DWHi32RotateInsertByte1.Left, sub_32)); +def DWBytes3212 { + dag Word = (RLWIMI DWBytes3012.Word, DWBytes0123.Word, 8, 8, 15); } -def DWShiftLo32 { // SLDI ExtendHi32.To64Bit, 32 - dag ToHi32 = (RLDICR ExtendHi32.To64Bit, 32, 31); +// B3 B2 B1 B0 in the right order +def DWBytes3210 { + dag Word = (RLWIMI DWBytes3212.Word, DWBytes0123.Word, 8, 24, 31); + dag DWord = + (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), DWBytes3210.Word, sub_32)); } +// Now both high word and low word are reversed, next +// swap the high word and low word. def : Pat<(i64 (bitreverse i64:$A)), - (OR8 DWShiftLo32.ToHi32, ExtendLo32.To64Bit)>; + (OR8 (RLDICR DWBytes7654.DWord, 32, 31), DWBytes3210.DWord)>; diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 942e8b392b82b..6f719784eb7c6 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -47,6 +47,13 @@ def vssrc : RegisterOperand { let ParserMatchClass = PPCRegVSSRCAsmOperand; } +def PPCRegSPILLTOVSRRCAsmOperand : AsmOperandClass { + let Name = "RegSPILLTOVSRRC"; let PredicateMethod = "isVSRegNumber"; +} + +def spilltovsrrc : RegisterOperand { + let ParserMatchClass = PPCRegSPILLTOVSRRCAsmOperand; +} // Little-endian-specific nodes. 
def SDT_PPClxvd2x : SDTypeProfile<1, 1, [ SDTCisVT<0, v2f64>, SDTCisPtrTy<1> @@ -124,6 +131,12 @@ let Uses = [RM] in { "lxsdx $XT, $src", IIC_LdStLFD, [(set f64:$XT, (load xoaddr:$src))]>; + // Pseudo instruction XFLOADf64 will be expanded to LXSDX or LFDX later + let isPseudo = 1, CodeSize = 3 in + def XFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrr:$src), + "#XFLOADf64", + [(set f64:$XT, (load xoaddr:$src))]>; + let Predicates = [HasVSX, HasOnlySwappingMemOps] in def LXVD2X : XX1Form<31, 844, (outs vsrc:$XT), (ins memrr:$src), @@ -149,6 +162,12 @@ let Uses = [RM] in { "stxsdx $XT, $dst", IIC_LdStSTFD, [(store f64:$XT, xoaddr:$dst)]>; + // Pseudo instruction XFSTOREf64 will be expanded to STXSDX or STFDX later + let isPseudo = 1, CodeSize = 3 in + def XFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrr:$dst), + "#XFSTOREf64", + [(store f64:$XT, xoaddr:$dst)]>; + let Predicates = [HasVSX, HasOnlySwappingMemOps] in { // The behaviour of this instruction is endianness-specific so we provide no // pattern to match it without considering endianness. @@ -1208,32 +1227,59 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. let mayLoad = 1, mayStore = 0 in { let CodeSize = 3 in def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src), - "lxsspx $XT, $src", IIC_LdStLFD, - [(set f32:$XT, (load xoaddr:$src))]>; + "lxsspx $XT, $src", IIC_LdStLFD, []>; def LXSIWAX : XX1Form<31, 76, (outs vsfrc:$XT), (ins memrr:$src), - "lxsiwax $XT, $src", IIC_LdStLFD, - [(set f64:$XT, (PPClfiwax xoaddr:$src))]>; + "lxsiwax $XT, $src", IIC_LdStLFD, []>; def LXSIWZX : XX1Form<31, 12, (outs vsfrc:$XT), (ins memrr:$src), - "lxsiwzx $XT, $src", IIC_LdStLFD, - [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>; + "lxsiwzx $XT, $src", IIC_LdStLFD, []>; + + // Please note let isPseudo = 1 is not part of class Pseudo<>. Missing it + // would cause these Pseudos are not expanded in expandPostRAPseudos() + let isPseudo = 1 in { + // Pseudo instruction XFLOADf32 will be expanded to LXSSPX or LFSX later + let CodeSize = 3 in + def XFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrr:$src), + "#XFLOADf32", + [(set f32:$XT, (load xoaddr:$src))]>; + // Pseudo instruction LIWAX will be expanded to LXSIWAX or LFIWAX later + def LIWAX : Pseudo<(outs vsfrc:$XT), (ins memrr:$src), + "#LIWAX", + [(set f64:$XT, (PPClfiwax xoaddr:$src))]>; + // Pseudo instruction LIWZX will be expanded to LXSIWZX or LFIWZX later + def LIWZX : Pseudo<(outs vsfrc:$XT), (ins memrr:$src), + "#LIWZX", + [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>; + } } // mayLoad // VSX scalar stores introduced in ISA 2.07 let mayStore = 1, mayLoad = 0 in { let CodeSize = 3 in def STXSSPX : XX1Form<31, 652, (outs), (ins vssrc:$XT, memrr:$dst), - "stxsspx $XT, $dst", IIC_LdStSTFD, - [(store f32:$XT, xoaddr:$dst)]>; + "stxsspx $XT, $dst", IIC_LdStSTFD, []>; def STXSIWX : XX1Form<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst), - "stxsiwx $XT, $dst", IIC_LdStSTFD, - [(PPCstfiwx f64:$XT, xoaddr:$dst)]>; + "stxsiwx $XT, $dst", IIC_LdStSTFD, []>; + + // Please note let isPseudo = 1 is not part of class Pseudo<>. 
Missing it + // would cause these pseudo instructions not to be expanded in expandPostRAPseudos() + let isPseudo = 1 in { + // Pseudo instruction XFSTOREf32 will be expanded to STXSSPX or STFSX later + let CodeSize = 3 in + def XFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrr:$dst), + "#XFSTOREf32", + [(store f32:$XT, xoaddr:$dst)]>; + // Pseudo instruction STIWX will be expanded to STXSIWX or STFIWX later + def STIWX : Pseudo<(outs), (ins vsfrc:$XT, memrr:$dst), + "#STIWX", + [(PPCstfiwx f64:$XT, xoaddr:$dst)]>; + } } // mayStore } // UseVSXReg = 1 def : Pat<(f64 (extloadf32 xoaddr:$src)), - (COPY_TO_REGCLASS (LXSSPX xoaddr:$src), VSFRC)>; + (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$src), VSFRC)>; def : Pat<(f32 (fpround (extloadf32 xoaddr:$src))), - (f32 (LXSSPX xoaddr:$src))>; + (f32 (XFLOADf32 xoaddr:$src))>; def : Pat<(f64 (fpextend f32:$src)), (COPY_TO_REGCLASS $src, VSFRC)>; @@ -1407,7 +1453,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>; } def : Pat<(v4i32 (scalar_to_vector ScalarLoads.Li32)), - (v4i32 (XXSPLTWs (LXSIWAX xoaddr:$src), 1))>; + (v4i32 (XXSPLTWs (LIWAX xoaddr:$src), 1))>; } // AddedComplexity = 400 } // HasP8Vector @@ -1769,6 +1815,7 @@ def VectorExtractions { dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC); } +def NoP9Altivec : Predicate<"!PPCSubTarget->hasP9Altivec()">; let AddedComplexity = 400 in { // v4f32 scalar <-> vector conversions (BE) let Predicates = [IsBigEndian, HasP8Vector] in { @@ -1801,6 +1848,17 @@ let Predicates = [IsBigEndian, HasDirectMove] in { (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>; def : Pat<(v2i64 (scalar_to_vector i64:$A)), (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>; + + // v2i64 scalar <-> vector conversions (BE) + def : Pat<(i64 (vector_extract v2i64:$S, 0)), + (i64 VectorExtractions.LE_DWORD_1)>; + def : Pat<(i64 (vector_extract v2i64:$S, 1)), + (i64 VectorExtractions.LE_DWORD_0)>; + def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), + (i64 VectorExtractions.BE_VARIABLE_DWORD)>; +} // IsBigEndian, HasDirectMove + +let Predicates = [IsBigEndian, HasDirectMove, NoP9Altivec] in { def : Pat<(i32 (vector_extract v16i8:$S, 0)), (i32 VectorExtractions.LE_BYTE_15)>; def : Pat<(i32 (vector_extract v16i8:$S, 1)), @@ -1867,15 +1925,7 @@ let Predicates = [IsBigEndian, HasDirectMove] in { (i32 VectorExtractions.LE_WORD_0)>; def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), (i32 VectorExtractions.BE_VARIABLE_WORD)>; - - // v2i64 scalar <-> vector conversions (BE) - def : Pat<(i64 (vector_extract v2i64:$S, 0)), - (i64 VectorExtractions.LE_DWORD_1)>; - def : Pat<(i64 (vector_extract v2i64:$S, 1)), - (i64 VectorExtractions.LE_DWORD_0)>; - def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), - (i64 VectorExtractions.BE_VARIABLE_DWORD)>; -} // IsBigEndian, HasDirectMove +} // IsBigEndian, HasDirectMove, NoP9Altivec // v4f32 scalar <-> vector conversions (LE) let Predicates = [IsLittleEndian, HasP8Vector] in { @@ -1931,8 +1981,10 @@ let Predicates = [HasP9Altivec, IsLittleEndian] in { (VEXTUWRX (LI8 0), $S)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))), (VEXTUWRX (LI8 4), $S)>; + // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))), - (VEXTUWRX (LI8 8), $S)>; + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (i32 VectorExtractions.LE_WORD_2), sub_32)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))), (VEXTUWRX (LI8 12), $S)>; @@
-1942,11 +1994,82 @@ let Predicates = [HasP9Altivec, IsLittleEndian] in { (EXTSW (VEXTUWRX (LI8 0), $S))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))), (EXTSW (VEXTUWRX (LI8 4), $S))>; + // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))), - (EXTSW (VEXTUWRX (LI8 8), $S))>; + (EXTSW (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (i32 VectorExtractions.LE_WORD_2), sub_32))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))), (EXTSW (VEXTUWRX (LI8 12), $S))>; + + def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), + (i32 (EXTRACT_SUBREG (VEXTUBRX $Idx, $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 0)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 0), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 1)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 1), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 2)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 2), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 3)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 3), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 4)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 4), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 5)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 5), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 6)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 6), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 7)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 7), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 8)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 8), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 9)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 9), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 10)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 10), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 11)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 11), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 12)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 12), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 13)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 13), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 14)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 14), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 15)), + (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 15), $S), sub_32))>; + + def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), + (i32 (EXTRACT_SUBREG (VEXTUHRX + (RLWINM8 $Idx, 1, 28, 30), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 0)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 0), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 1)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 2), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 2)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 4), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 3)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 6), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 4)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 8), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 5)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 10), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 6)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 12), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 7)), + (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 14), $S), sub_32))>; + + def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), + (i32 (EXTRACT_SUBREG (VEXTUWRX + (RLWINM8 $Idx, 2, 28, 29), $S), sub_32))>; + def : Pat<(i32 (vector_extract v4i32:$S, 0)), + (i32 (EXTRACT_SUBREG
(VEXTUWRX (LI8 0), $S), sub_32))>; + def : Pat<(i32 (vector_extract v4i32:$S, 1)), + (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 4), $S), sub_32))>; + // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX + def : Pat<(i32 (vector_extract v4i32:$S, 2)), + (i32 VectorExtractions.LE_WORD_2)>; + def : Pat<(i32 (vector_extract v4i32:$S, 3)), + (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 12), $S), sub_32))>; } + let Predicates = [HasP9Altivec, IsBigEndian] in { def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))), (VEXTUBLX $Idx, $S)>; @@ -1974,8 +2097,11 @@ let Predicates = [HasP9Altivec, IsBigEndian] in { (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))), (VEXTUWLX (LI8 0), $S)>; + + // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))), - (VEXTUWLX (LI8 4), $S)>; + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (i32 VectorExtractions.LE_WORD_2), sub_32)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))), (VEXTUWLX (LI8 8), $S)>; def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))), @@ -1985,12 +2111,82 @@ let Predicates = [HasP9Altivec, IsBigEndian] in { (EXTSW (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))), (EXTSW (VEXTUWLX (LI8 0), $S))>; + // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))), - (EXTSW (VEXTUWLX (LI8 4), $S))>; + (EXTSW (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (i32 VectorExtractions.LE_WORD_2), sub_32))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))), (EXTSW (VEXTUWLX (LI8 8), $S))>; def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))), (EXTSW (VEXTUWLX (LI8 12), $S))>; + + def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), + (i32 (EXTRACT_SUBREG (VEXTUBLX $Idx, $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 0)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 0), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 1)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 1), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 2)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 2), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 3)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 3), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 4)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 4), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 5)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 5), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 6)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 6), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 7)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 7), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 8)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 8), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 9)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 9), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 10)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 10), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 11)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 11), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 12)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 12), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 13)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 13), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 14)), + (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 14), $S), sub_32))>; + def : Pat<(i32 (vector_extract v16i8:$S, 15)), + (i32 
(EXTRACT_SUBREG (VEXTUBLX (LI8 15), $S), sub_32))>; + + def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), + (i32 (EXTRACT_SUBREG (VEXTUHLX + (RLWINM8 $Idx, 1, 28, 30), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 0)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 0), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 1)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 2), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 2)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 4), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 3)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 6), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 4)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 8), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 5)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 10), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 6)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 12), $S), sub_32))>; + def : Pat<(i32 (vector_extract v8i16:$S, 7)), + (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 14), $S), sub_32))>; + + def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), + (i32 (EXTRACT_SUBREG (VEXTUWLX + (RLWINM8 $Idx, 2, 28, 29), $S), sub_32))>; + def : Pat<(i32 (vector_extract v4i32:$S, 0)), + (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 0), $S), sub_32))>; + // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX + def : Pat<(i32 (vector_extract v4i32:$S, 1)), + (i32 VectorExtractions.LE_WORD_2)>; + def : Pat<(i32 (vector_extract v4i32:$S, 2)), + (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 8), $S), sub_32))>; + def : Pat<(i32 (vector_extract v4i32:$S, 3)), + (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 12), $S), sub_32))>; } let Predicates = [IsLittleEndian, HasDirectMove] in { @@ -2003,6 +2199,16 @@ let Predicates = [IsLittleEndian, HasDirectMove] in { (v4i32 MovesToVSR.LE_WORD_0)>; def : Pat<(v2i64 (scalar_to_vector i64:$A)), (v2i64 MovesToVSR.LE_DWORD_0)>; + // v2i64 scalar <-> vector conversions (LE) + def : Pat<(i64 (vector_extract v2i64:$S, 0)), + (i64 VectorExtractions.LE_DWORD_0)>; + def : Pat<(i64 (vector_extract v2i64:$S, 1)), + (i64 VectorExtractions.LE_DWORD_1)>; + def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), + (i64 VectorExtractions.LE_VARIABLE_DWORD)>; +} // IsLittleEndian, HasDirectMove + +let Predicates = [IsLittleEndian, HasDirectMove, NoP9Altivec] in { def : Pat<(i32 (vector_extract v16i8:$S, 0)), (i32 VectorExtractions.LE_BYTE_0)>; def : Pat<(i32 (vector_extract v16i8:$S, 1)), @@ -2069,15 +2275,7 @@ let Predicates = [IsLittleEndian, HasDirectMove] in { (i32 VectorExtractions.LE_WORD_3)>; def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), (i32 VectorExtractions.LE_VARIABLE_WORD)>; - - // v2i64 scalar <-> vector conversions (LE) - def : Pat<(i64 (vector_extract v2i64:$S, 0)), - (i64 VectorExtractions.LE_DWORD_0)>; - def : Pat<(i64 (vector_extract v2i64:$S, 1)), - (i64 VectorExtractions.LE_DWORD_1)>; - def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), - (i64 VectorExtractions.LE_VARIABLE_DWORD)>; -} // IsLittleEndian, HasDirectMove +} // IsLittleEndian, HasDirectMove, NoP9Altivec let Predicates = [HasDirectMove, HasVSX] in { // bitconvert f32 -> i32 @@ -2344,7 +2542,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { XX2_RD6_UIM5_RS6<60, 181, (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XB, u4imm:$UIM), "xxinsertw $XT, $XB, $UIM", IIC_VecFP, - [(set v4i32:$XT, (PPCxxinsert v4i32:$XTi, v4i32:$XB, + [(set v4i32:$XT, (PPCvecinsert v4i32:$XTi, v4i32:$XB, imm32SExt16:$UIM))]>, RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">; @@ -2550,6 +2748,51 @@ let AddedComplexity =
400, Predicates = [HasP9Vector] in { UseVSXReg; } // mayStore + let Predicates = [IsLittleEndian] in { + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>; + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>; + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>; + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>; + } + + let Predicates = [IsBigEndian] in { + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>; + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>; + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>; + def: Pat<(f32 (PPCfcfids (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))), + (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 0))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 1))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 2))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>; + def: Pat<(f64 (PPCfcfid (PPCmtvsra (i32 (extractelt v4i32:$A, 3))))), + (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>; + } + + // Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead + // of f64 + def : Pat<(v8i16 (PPCmtvsrz i32:$A)), + (v8i16 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>; + def : Pat<(v16i8 (PPCmtvsrz i32:$A)), + (v16i8 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>; + // Patterns for which instructions from ISA 3.0 are a better match let Predicates = [IsLittleEndian, HasP9Vector] in { def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))), @@ -2560,6 +2803,14 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>; def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))), (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>; def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)), (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>; def : Pat<(v4i32 (insertelt 
v4i32:$A, i32:$B, 1)), @@ -2587,6 +2838,14 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>; def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))), (f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>; + def : Pat<(f64 (PPCfcfidu (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))), + (f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>; def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)), (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>; def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)), @@ -2809,6 +3068,23 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (f32 (DFLOADf32 ixaddr:$src))>; } // end HasP9Vector, AddedComplexity +let Predicates = [HasP9Vector] in { + let isPseudo = 1 in { + let mayStore = 1 in { + def SPILLTOVSR_STX : Pseudo<(outs), (ins spilltovsrrc:$XT, memrr:$dst), + "#SPILLTOVSR_STX", []>; + def SPILLTOVSR_ST : Pseudo<(outs), (ins spilltovsrrc:$XT, memrix:$dst), + "#SPILLTOVSR_ST", []>; + } + let mayLoad = 1 in { + def SPILLTOVSR_LDX : Pseudo<(outs spilltovsrrc:$XT), (ins memrr:$src), + "#SPILLTOVSR_LDX", []>; + def SPILLTOVSR_LD : Pseudo<(outs spilltovsrrc:$XT), (ins memrix:$src), + "#SPILLTOVSR_LD", []>; + + } + } +} // Integer extend helper dags 32 -> 64 def AnyExts { dag A = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32); @@ -2962,10 +3238,10 @@ let AddedComplexity = 400 in { (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC), 0))>; def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPSXWSs (LXSSPX xoaddr:$A)), VSRC), 1))>; + (XSCVDPSXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>; def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPUXWSs (LXSSPX xoaddr:$A)), VSRC), 1))>; + (XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>; def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)), (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>; @@ -2983,19 +3259,19 @@ let AddedComplexity = 400 in { } let Predicates = [HasVSX, NoP9Vector] in { - // Load-and-splat with fp-to-int conversion (using X-Form VSX loads). + // Load-and-splat with fp-to-int conversion (using X-Form VSX/FP loads). 
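// [Editor's note -- not part of the original patch] The patterns below now
// select the XFLOADf32/XFLOADf64 pseudos instead of LXSSPX/LXSDX directly.
// This defers the choice of the real load opcode until after register
// allocation; a plausible sketch of the post-RA expansion (the actual logic
// lives in PPCInstrInfo's pseudo expansion) is:
//   destination assigned to a VSX register -> lxsspx / lxsdx
//   destination assigned to a classic FPR  -> lfsx   / lfdx
// so one pattern serves both register banks.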
def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPSXWS (LXSDX xoaddr:$A)), VSRC), 1))>; + (XSCVDPSXWS (XFLOADf64 xoaddr:$A)), VSRC), 1))>; def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPUXWS (LXSDX xoaddr:$A)), VSRC), 1))>; + (XSCVDPUXWS (XFLOADf64 xoaddr:$A)), VSRC), 1))>; def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)), (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS - (LXSSPX xoaddr:$A), VSFRC)), 0))>; + (XFLOADf32 xoaddr:$A), VSFRC)), 0))>; def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)), (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS - (LXSSPX xoaddr:$A), VSFRC)), 0))>; + (XFLOADf32 xoaddr:$A), VSFRC)), 0))>; } // Big endian, available on all targets with VSX diff --git a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp index a349fa1b40907..cdf544bdfac35 100644 --- a/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ b/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" @@ -61,6 +62,8 @@ static cl::opt<unsigned> MaxVars("ppc-preinc-prep-max-vars", cl::Hidden, cl::init(16), cl::desc("Potential PHI threshold for PPC preinc loop prep")); +STATISTIC(PHINodeAlreadyExists, "PHI node already in pre-increment form"); + namespace llvm { void initializePPCLoopPreIncPrepPass(PassRegistry&); @@ -88,6 +91,9 @@ namespace { AU.addRequired<ScalarEvolutionWrapperPass>(); } + bool alreadyPrepared(Loop *L, Instruction* MemI, + const SCEV *BasePtrStartSCEV, + const SCEVConstant *BasePtrIncSCEV); bool runOnFunction(Function &F) override; bool runOnLoop(Loop *L); @@ -177,6 +183,62 @@ bool PPCLoopPreIncPrep::runOnFunction(Function &F) { return MadeChange; } +// In order to prepare for the pre-increment, a PHI is added. +// This function will check to see if that PHI already exists and will return +// true if it found an existing PHI with the same start and increment as the +// one we wanted to create.
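// [Editor's illustration -- not part of the original patch] The "prepared"
// shape being detected is, in C-like terms:
//   p = base - stride;               // incoming value of the PHI
//   loop: p += stride; use(*p);      // increment == BasePtrIncSCEV
// i.e. a pointer PHI whose start value is one stride before the first
// access (BasePtrStartSCEV), so that update-form loads/stores can be used.
// If such a PHI already exists, creating another would just add a redundant
// induction variable.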
+bool PPCLoopPreIncPrep::alreadyPrepared(Loop *L, Instruction* MemI, + const SCEV *BasePtrStartSCEV, + const SCEVConstant *BasePtrIncSCEV) { + BasicBlock *BB = MemI->getParent(); + if (!BB) + return false; + + BasicBlock *PredBB = L->getLoopPredecessor(); + BasicBlock *LatchBB = L->getLoopLatch(); + + if (!PredBB || !LatchBB) + return false; + + // Run through the PHIs and see if we have some that look like a preparation + iterator_range<BasicBlock::phi_iterator> PHIIter = BB->phis(); + for (auto & CurrentPHI : PHIIter) { + PHINode *CurrentPHINode = dyn_cast<PHINode>(&CurrentPHI); + if (!CurrentPHINode) + continue; + + if (!SE->isSCEVable(CurrentPHINode->getType())) + continue; + + const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L); + + const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast<SCEVAddRecExpr>(PHISCEV); + if (!PHIBasePtrSCEV) + continue; + + const SCEVConstant *PHIBasePtrIncSCEV = + dyn_cast<SCEVConstant>(PHIBasePtrSCEV->getStepRecurrence(*SE)); + if (!PHIBasePtrIncSCEV) + continue; + + if (CurrentPHINode->getNumIncomingValues() == 2) { + if ( (CurrentPHINode->getIncomingBlock(0) == LatchBB && + CurrentPHINode->getIncomingBlock(1) == PredBB) || + (CurrentPHINode->getIncomingBlock(1) == LatchBB && + CurrentPHINode->getIncomingBlock(0) == PredBB) ) { + if (PHIBasePtrSCEV->getStart() == BasePtrStartSCEV && + PHIBasePtrIncSCEV == BasePtrIncSCEV) { + // The existing PHI (CurrentPHINode) has the same start and increment + // as the PHI that we wanted to create. + ++PHINodeAlreadyExists; + return true; + } + } + } + } + return false; +} + bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { bool MadeChange = false; @@ -347,6 +409,9 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); + if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV)) + continue; + PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount, MemI->hasName() ?
MemI->getName() + ".phi" : "", Header->getFirstNonPHI()); diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp index b310493587ae7..1e40711328ece 100644 --- a/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -20,14 +20,14 @@ #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Mangler.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) { @@ -143,45 +143,48 @@ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, OutMI.setOpcode(MI->getOpcode()); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - MCOperand MCOp; - switch (MO.getType()) { - default: - MI->print(errs()); - llvm_unreachable("unknown operand type"); - case MachineOperand::MO_Register: - assert(!MO.getSubReg() && "Subregs should be eliminated!"); - assert(MO.getReg() > PPC::NoRegister && - MO.getReg() < PPC::NUM_TARGET_REGS && - "Invalid register for this target!"); - MCOp = MCOperand::createReg(MO.getReg()); - break; - case MachineOperand::MO_Immediate: - MCOp = MCOperand::createImm(MO.getImm()); - break; - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( - MO.getMBB()->getSymbol(), AP.OutContext)); - break; - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_ExternalSymbol: - MCOp = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin); - break; - case MachineOperand::MO_JumpTableIndex: - MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin); - break; - case MachineOperand::MO_ConstantPoolIndex: - MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin); - break; - case MachineOperand::MO_BlockAddress: - MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP, - isDarwin); - break; - case MachineOperand::MO_RegisterMask: - continue; - } - - OutMI.addOperand(MCOp); + if (LowerPPCMachineOperandToMCOperand(MI->getOperand(i), MCOp, AP, + isDarwin)) + OutMI.addOperand(MCOp); + } +} + +bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, + MCOperand &OutMO, AsmPrinter &AP, + bool isDarwin) { + switch (MO.getType()) { + default: + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Register: + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + assert(MO.getReg() > PPC::NoRegister && + MO.getReg() < PPC::NUM_TARGET_REGS && + "Invalid register for this target!"); + OutMO = MCOperand::createReg(MO.getReg()); + return true; + case MachineOperand::MO_Immediate: + OutMO = MCOperand::createImm(MO.getImm()); + return true; + case MachineOperand::MO_MachineBasicBlock: + OutMO = MCOperand::createExpr( + MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), AP.OutContext)); + return true; + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + OutMO = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin); + return true; + case MachineOperand::MO_JumpTableIndex: + OutMO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin); + return 
true; + case MachineOperand::MO_ConstantPoolIndex: + OutMO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin); + return true; + case MachineOperand::MO_BlockAddress: + OutMO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()), AP, + isDarwin); + return true; + case MachineOperand::MO_RegisterMask: + return false; } } diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp index ff5f17c7628f2..a2640727f8138 100644 --- a/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -23,18 +23,50 @@ #include "PPCInstrBuilder.h" #include "PPCInstrInfo.h" #include "PPCTargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" +#include "MCTargetDesc/PPCPredicates.h" using namespace llvm; #define DEBUG_TYPE "ppc-mi-peepholes" -namespace llvm { - void initializePPCMIPeepholePass(PassRegistry&); -} +STATISTIC(RemoveTOCSave, "Number of TOC saves removed"); +STATISTIC(MultiTOCSaves, + "Number of functions with multiple TOC saves that must be kept"); +STATISTIC(NumEliminatedSExt, "Number of eliminated sign-extensions"); +STATISTIC(NumEliminatedZExt, "Number of eliminated zero-extensions"); +STATISTIC(NumOptADDLIs, "Number of optimized ADD instruction fed by LI"); +STATISTIC(NumConvertedToImmediateForm, + "Number of instructions converted to their immediate form"); +STATISTIC(NumFunctionsEnteredInMIPeephole, + "Number of functions entered in PPC MI Peepholes"); +STATISTIC(NumFixedPointIterations, + "Number of fixed-point iterations converting reg-reg instructions " + "to reg-imm ones"); + +static cl::opt<bool> +FixedPointRegToImm("ppc-reg-to-imm-fixed-point", cl::Hidden, cl::init(true), + cl::desc("Iterate to a fixed point when attempting to " + "convert reg-reg instructions to reg-imm")); + +static cl::opt<bool> +ConvertRegReg("ppc-convert-rr-to-ri", cl::Hidden, cl::init(false), + cl::desc("Convert eligible reg+reg instructions to reg+imm")); + +static cl::opt<bool> + EnableSExtElimination("ppc-eliminate-signext", + cl::desc("enable elimination of sign-extensions"), + cl::init(false), cl::Hidden); + +static cl::opt<bool> + EnableZExtElimination("ppc-eliminate-zeroext", + cl::desc("enable elimination of zero-extensions"), + cl::init(false), cl::Hidden); namespace { @@ -50,20 +82,31 @@ struct PPCMIPeephole : public MachineFunctionPass { } private: + MachineDominatorTree *MDT; + // Initialize class variables. void initialize(MachineFunction &MFParm); // Perform peepholes. bool simplifyCode(void); - // Find the "true" register represented by SrcReg (following chains - // of copies and subreg_to_reg operations). - unsigned lookThruCopyLike(unsigned SrcReg); + // Perform peepholes. + bool eliminateRedundantCompare(void); + bool eliminateRedundantTOCSaves(std::map<MachineInstr *, bool> &TOCSaves); + void UpdateTOCSaves(std::map<MachineInstr *, bool> &TOCSaves, + MachineInstr *MI); public: + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + // Main entry point for this pass.
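// [Editor's note -- not part of the original patch] The cl::opt knobs above
// are internal developer options; assuming the usual cl::opt registration,
// they can be toggled from the llc command line, e.g.
//   llc -mtriple=powerpc64le-unknown-linux-gnu -ppc-convert-rr-to-ri t.ll
// to exercise the reg+reg -> reg+imm conversion that defaults to off.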
bool runOnMachineFunction(MachineFunction &MF) override { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; initialize(MF); return simplifyCode(); @@ -74,15 +117,138 @@ public: void PPCMIPeephole::initialize(MachineFunction &MFParm) { MF = &MFParm; MRI = &MF->getRegInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); TII = MF->getSubtarget().getInstrInfo(); DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n"); DEBUG(MF->dump()); } +static MachineInstr *getVRegDefOrNull(MachineOperand *Op, + MachineRegisterInfo *MRI) { + assert(Op && "Invalid Operand!"); + if (!Op->isReg()) + return nullptr; + + unsigned Reg = Op->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return nullptr; + + return MRI->getVRegDef(Reg); +} + +// This function returns the number of known zero bits in the output of MI +// starting from the most significant bit. +static unsigned +getKnownLeadingZeroCount(MachineInstr *MI, const PPCInstrInfo *TII) { + unsigned Opcode = MI->getOpcode(); + if (Opcode == PPC::RLDICL || Opcode == PPC::RLDICLo || + Opcode == PPC::RLDCL || Opcode == PPC::RLDCLo) + return MI->getOperand(3).getImm(); + + if ((Opcode == PPC::RLDIC || Opcode == PPC::RLDICo) && + MI->getOperand(3).getImm() <= 63 - MI->getOperand(2).getImm()) + return MI->getOperand(3).getImm(); + + if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINMo || + Opcode == PPC::RLWNM || Opcode == PPC::RLWNMo || + Opcode == PPC::RLWINM8 || Opcode == PPC::RLWNM8) && + MI->getOperand(3).getImm() <= MI->getOperand(4).getImm()) + return 32 + MI->getOperand(3).getImm(); + + if (Opcode == PPC::ANDIo) { + uint16_t Imm = MI->getOperand(2).getImm(); + return 48 + countLeadingZeros(Imm); + } + + if (Opcode == PPC::CNTLZW || Opcode == PPC::CNTLZWo || + Opcode == PPC::CNTTZW || Opcode == PPC::CNTTZWo || + Opcode == PPC::CNTLZW8 || Opcode == PPC::CNTTZW8) + // The result ranges from 0 to 32. + return 58; + + if (Opcode == PPC::CNTLZD || Opcode == PPC::CNTLZDo || + Opcode == PPC::CNTTZD || Opcode == PPC::CNTTZDo) + // The result ranges from 0 to 64. + return 57; + + if (Opcode == PPC::LHZ || Opcode == PPC::LHZX || + Opcode == PPC::LHZ8 || Opcode == PPC::LHZX8 || + Opcode == PPC::LHZU || Opcode == PPC::LHZUX || + Opcode == PPC::LHZU8 || Opcode == PPC::LHZUX8) + return 48; + + if (Opcode == PPC::LBZ || Opcode == PPC::LBZX || + Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 || + Opcode == PPC::LBZU || Opcode == PPC::LBZUX || + Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8) + return 56; + + if (TII->isZeroExtended(*MI)) + return 32; + + return 0; +} + +// This function maintains a map of pairs (TOC save instruction, keep flag). +// Each time a new TOC save is encountered, it checks if any of the existing +// ones are dominated by the new one. If so, it marks the existing one as +// redundant by setting its entry in the map as false. It then adds the new +// instruction to the map with either true or false depending on whether any +// existing instructions dominated the new one. +void PPCMIPeephole::UpdateTOCSaves( + std::map<MachineInstr *, bool> &TOCSaves, MachineInstr *MI) { + assert(TII->isTOCSaveMI(*MI) && "Expecting a TOC save instruction here"); + bool Keep = true; + for (auto It = TOCSaves.begin(); It != TOCSaves.end(); It++ ) { + MachineInstr *CurrInst = It->first; + // If the new instruction dominates an existing one, mark the existing one + // as redundant. + if (It->second && MDT->dominates(MI, CurrInst)) + It->second = false; + // Check if the new instruction is redundant. + if (MDT->dominates(CurrInst, MI)) { + Keep = false; + break; + } + } + // Add new instruction to map.
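// [Editor's worked example -- not part of the original patch] With a TOC
// save S1 in the entry block and a second save S2 inside a loop body, S1
// dominates S2: when S2 is visited, the loop above finds the dominating S1,
// so S2 enters the map with Keep == false and is later erased by
// eliminateRedundantTOCSaves(), leaving only S1.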
+ TOCSaves[MI] = Keep; +} + // Perform peephole optimizations. bool PPCMIPeephole::simplifyCode(void) { bool Simplified = false; MachineInstr* ToErase = nullptr; + std::map<MachineInstr *, bool> TOCSaves; + + NumFunctionsEnteredInMIPeephole++; + if (ConvertRegReg) { + // Fixed-point conversion of reg/reg instructions fed by load-immediate + // into reg/imm instructions. FIXME: This is expensive, control it with + // an option. + bool SomethingChanged = false; + do { + NumFixedPointIterations++; + SomethingChanged = false; + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + if (MI.isDebugValue()) + continue; + + if (TII->convertToImmediateForm(MI)) { + // We don't erase anything in case the def has other uses. Let DCE + // remove it if it can be removed. + DEBUG(dbgs() << "Converted instruction to imm form: "); + DEBUG(MI.dump()); + NumConvertedToImmediateForm++; + SomethingChanged = true; + Simplified = true; + continue; + } + } + } + } while (SomethingChanged && FixedPointRegToImm); + } for (MachineBasicBlock &MBB : *MF) { for (MachineInstr &MI : MBB) { @@ -104,6 +270,18 @@ bool PPCMIPeephole::simplifyCode(void) { default: break; + case PPC::STD: { + MachineFrameInfo &MFI = MF->getFrameInfo(); + if (MFI.hasVarSizedObjects() || + !MF->getSubtarget<PPCSubtarget>().isELFv2ABI()) + break; + // When encountering a TOC save instruction, call UpdateTOCSaves + // to add it to the TOCSaves map and mark any existing TOC saves + // it dominates as redundant. + if (TII->isTOCSaveMI(MI)) + UpdateTOCSaves(TOCSaves, &MI); + break; + } case PPC::XXPERMDI: { // Perform simplifications of 2x64 vector swaps and splats. // A swap is identified by an immediate value of 2, and a splat @@ -118,8 +296,10 @@ bool PPCMIPeephole::simplifyCode(void) { // XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed. // We have to look through chains of COPY and SUBREG_TO_REG // to find the real source values for comparison. - unsigned TrueReg1 = lookThruCopyLike(MI.getOperand(1).getReg()); - unsigned TrueReg2 = lookThruCopyLike(MI.getOperand(2).getReg()); + unsigned TrueReg1 = + TII->lookThruCopyLike(MI.getOperand(1).getReg(), MRI); + unsigned TrueReg2 = + TII->lookThruCopyLike(MI.getOperand(2).getReg(), MRI); if (TrueReg1 == TrueReg2 && TargetRegisterInfo::isVirtualRegister(TrueReg1)) { @@ -133,7 +313,8 @@ bool PPCMIPeephole::simplifyCode(void) { auto isConversionOfLoadAndSplat = [=]() -> bool { if (DefOpc != PPC::XVCVDPSXDS && DefOpc != PPC::XVCVDPUXDS) return false; - unsigned DefReg = lookThruCopyLike(DefMI->getOperand(1).getReg()); + unsigned DefReg = + TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); if (TargetRegisterInfo::isVirtualRegister(DefReg)) { MachineInstr *LoadMI = MRI->getVRegDef(DefReg); if (LoadMI && LoadMI->getOpcode() == PPC::LXVDSX) @@ -159,10 +340,10 @@ bool PPCMIPeephole::simplifyCode(void) { // can replace it with a copy. if (DefOpc == PPC::XXPERMDI) { unsigned FeedImmed = DefMI->getOperand(3).getImm(); - unsigned FeedReg1 - = lookThruCopyLike(DefMI->getOperand(1).getReg()); - unsigned FeedReg2 - = lookThruCopyLike(DefMI->getOperand(2).getReg()); + unsigned FeedReg1 = + TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); + unsigned FeedReg2 = + TII->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI); if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) { DEBUG(dbgs() @@ -220,7 +401,8 @@ bool PPCMIPeephole::simplifyCode(void) { case PPC::XXSPLTW: { unsigned MyOpcode = MI.getOpcode(); unsigned OpNo = MyOpcode == PPC::XXSPLTW ?
1 : 2; - unsigned TrueReg = lookThruCopyLike(MI.getOperand(OpNo).getReg()); + unsigned TrueReg = + TII->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI); if (!TargetRegisterInfo::isVirtualRegister(TrueReg)) break; MachineInstr *DefMI = MRI->getVRegDef(TrueReg); @@ -282,7 +464,8 @@ bool PPCMIPeephole::simplifyCode(void) { } case PPC::XVCVDPSP: { // If this is a DP->SP conversion fed by an FRSP, the FRSP is redundant. - unsigned TrueReg = lookThruCopyLike(MI.getOperand(1).getReg()); + unsigned TrueReg = + TII->lookThruCopyLike(MI.getOperand(1).getReg(), MRI); if (!TargetRegisterInfo::isVirtualRegister(TrueReg)) break; MachineInstr *DefMI = MRI->getVRegDef(TrueReg); @@ -290,8 +473,10 @@ bool PPCMIPeephole::simplifyCode(void) { // This can occur when building a vector of single precision or integer // values. if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) { - unsigned DefsReg1 = lookThruCopyLike(DefMI->getOperand(1).getReg()); - unsigned DefsReg2 = lookThruCopyLike(DefMI->getOperand(2).getReg()); + unsigned DefsReg1 = + TII->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); + unsigned DefsReg2 = + TII->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI); if (!TargetRegisterInfo::isVirtualRegister(DefsReg1) || !TargetRegisterInfo::isVirtualRegister(DefsReg2)) break; @@ -336,8 +521,248 @@ bool PPCMIPeephole::simplifyCode(void) { } break; } + case PPC::EXTSH: + case PPC::EXTSH8: + case PPC::EXTSH8_32_64: { + if (!EnableSExtElimination) break; + unsigned NarrowReg = MI.getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(NarrowReg)) + break; + + MachineInstr *SrcMI = MRI->getVRegDef(NarrowReg); + // If we've used a zero-extending load that we will sign-extend, + // just do a sign-extending load. + if (SrcMI->getOpcode() == PPC::LHZ || + SrcMI->getOpcode() == PPC::LHZX) { + if (!MRI->hasOneNonDBGUse(SrcMI->getOperand(0).getReg())) + break; + auto is64Bit = [] (unsigned Opcode) { + return Opcode == PPC::EXTSH8; + }; + auto isXForm = [] (unsigned Opcode) { + return Opcode == PPC::LHZX; + }; + auto getSextLoadOp = [] (bool is64Bit, bool isXForm) { + if (is64Bit) + if (isXForm) return PPC::LHAX8; + else return PPC::LHA8; + else + if (isXForm) return PPC::LHAX; + else return PPC::LHA; + }; + unsigned Opc = getSextLoadOp(is64Bit(MI.getOpcode()), + isXForm(SrcMI->getOpcode())); + DEBUG(dbgs() << "Zero-extending load\n"); + DEBUG(SrcMI->dump()); + DEBUG(dbgs() << "and sign-extension\n"); + DEBUG(MI.dump()); + DEBUG(dbgs() << "are merged into sign-extending load\n"); + SrcMI->setDesc(TII->get(Opc)); + SrcMI->getOperand(0).setReg(MI.getOperand(0).getReg()); + ToErase = &MI; + Simplified = true; + NumEliminatedSExt++; + } + break; + } + case PPC::EXTSW: + case PPC::EXTSW_32: + case PPC::EXTSW_32_64: { + if (!EnableSExtElimination) break; + unsigned NarrowReg = MI.getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(NarrowReg)) + break; + + MachineInstr *SrcMI = MRI->getVRegDef(NarrowReg); + // If we've used a zero-extending load that we will sign-extend, + // just do a sign-extending load. 
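// [Editor's illustration -- not part of the original patch] Example for the
// check below: if instruction selection has already committed to a lwz
// (say, because the load and its use sit in different blocks), a later
//   extsw rD, rS
// whose input is used only by that extension folds with the lwz into a
// single lwa, the sign-extending form of the same load.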
+ if (SrcMI->getOpcode() == PPC::LWZ || + SrcMI->getOpcode() == PPC::LWZX) { + if (!MRI->hasOneNonDBGUse(SrcMI->getOperand(0).getReg())) + break; + auto is64Bit = [] (unsigned Opcode) { + return Opcode == PPC::EXTSW || Opcode == PPC::EXTSW_32_64; + }; + auto isXForm = [] (unsigned Opcode) { + return Opcode == PPC::LWZX; + }; + auto getSextLoadOp = [] (bool is64Bit, bool isXForm) { + if (is64Bit) + if (isXForm) return PPC::LWAX; + else return PPC::LWA; + else + if (isXForm) return PPC::LWAX_32; + else return PPC::LWA_32; + }; + unsigned Opc = getSextLoadOp(is64Bit(MI.getOpcode()), + isXForm(SrcMI->getOpcode())); + DEBUG(dbgs() << "Zero-extending load\n"); + DEBUG(SrcMI->dump()); + DEBUG(dbgs() << "and sign-extension\n"); + DEBUG(MI.dump()); + DEBUG(dbgs() << "are merged into sign-extending load\n"); + SrcMI->setDesc(TII->get(Opc)); + SrcMI->getOperand(0).setReg(MI.getOperand(0).getReg()); + ToErase = &MI; + Simplified = true; + NumEliminatedSExt++; + } else if (MI.getOpcode() == PPC::EXTSW_32_64 && + TII->isSignExtended(*SrcMI)) { + // We can eliminate EXTSW if the input is known to be already + // sign-extended. + DEBUG(dbgs() << "Removing redundant sign-extension\n"); + unsigned TmpReg = + MF->getRegInfo().createVirtualRegister(&PPC::G8RCRegClass); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::IMPLICIT_DEF), + TmpReg); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::INSERT_SUBREG), + MI.getOperand(0).getReg()) + .addReg(TmpReg) + .addReg(NarrowReg) + .addImm(PPC::sub_32); + ToErase = &MI; + Simplified = true; + NumEliminatedSExt++; + } + break; + } + case PPC::RLDICL: { + // We can eliminate RLDICL (e.g. for zero-extension) + // if all bits to clear are already zero in the input. + // This code assumes the following code sequence for zero-extension. + // %6 = COPY %5:sub_32; (optional) + // %8 = IMPLICIT_DEF; + // %7 = INSERT_SUBREG %8, %6, sub_32; + if (!EnableZExtElimination) break; + + if (MI.getOperand(2).getImm() != 0) + break; + + unsigned SrcReg = MI.getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + break; + + MachineInstr *SrcMI = MRI->getVRegDef(SrcReg); + if (!(SrcMI && SrcMI->getOpcode() == PPC::INSERT_SUBREG && + SrcMI->getOperand(0).isReg() && SrcMI->getOperand(1).isReg())) + break; + + MachineInstr *ImpDefMI, *SubRegMI; + ImpDefMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg()); + SubRegMI = MRI->getVRegDef(SrcMI->getOperand(2).getReg()); + if (ImpDefMI->getOpcode() != PPC::IMPLICIT_DEF) break; + + SrcMI = SubRegMI; + if (SubRegMI->getOpcode() == PPC::COPY) { + unsigned CopyReg = SubRegMI->getOperand(1).getReg(); + if (TargetRegisterInfo::isVirtualRegister(CopyReg)) + SrcMI = MRI->getVRegDef(CopyReg); + } + + unsigned KnownZeroCount = getKnownLeadingZeroCount(SrcMI, TII); + if (MI.getOperand(3).getImm() <= KnownZeroCount) { + DEBUG(dbgs() << "Removing redundant zero-extension\n"); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), + MI.getOperand(0).getReg()) + .addReg(SrcReg); + ToErase = &MI; + Simplified = true; + NumEliminatedZExt++; + } + break; + } + + // TODO: Any instruction that has an immediate form fed only by a PHI + // whose operands are all load immediate can be folded away. We currently + // do this for ADD instructions, but should expand it to arithmetic and + // binary instructions with immediate forms in the future.
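// [Editor's illustration -- not part of the original patch] Shape handled
// by the ADD case below:
//   %v = PHI [ (LI 5), %bb1 ], [ (LI 7), %bb2 ]
//   %r = ADD %v, %base
// Each LI is rewritten in place to "ADDI %base, imm" in its own
// predecessor, and the ADD collapses to a COPY of the PHI, saving one
// instruction on every path through the CFG.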
+ case PPC::ADD4: + case PPC::ADD8: { + auto isSingleUsePHI = [&](MachineOperand *PhiOp) { + assert(PhiOp && "Invalid Operand!"); + MachineInstr *DefPhiMI = getVRegDefOrNull(PhiOp, MRI); + + return DefPhiMI && (DefPhiMI->getOpcode() == PPC::PHI) && + MRI->hasOneNonDBGUse(DefPhiMI->getOperand(0).getReg()); + }; + + auto dominatesAllSingleUseLIs = [&](MachineOperand *DominatorOp, + MachineOperand *PhiOp) { + assert(PhiOp && "Invalid Operand!"); + assert(DominatorOp && "Invalid Operand!"); + MachineInstr *DefPhiMI = getVRegDefOrNull(PhiOp, MRI); + MachineInstr *DefDomMI = getVRegDefOrNull(DominatorOp, MRI); + + // Note: the vregs only show up at the odd operand indices of the PHI + // node; the even indices hold the basic block info. + for (unsigned i = 1; i < DefPhiMI->getNumOperands(); i += 2) { + MachineInstr *LiMI = + getVRegDefOrNull(&DefPhiMI->getOperand(i), MRI); + if (!LiMI || + (LiMI->getOpcode() != PPC::LI && LiMI->getOpcode() != PPC::LI8) + || !MRI->hasOneNonDBGUse(LiMI->getOperand(0).getReg()) || + !MDT->dominates(DefDomMI, LiMI)) + return false; + } + + return true; + }; + + MachineOperand Op1 = MI.getOperand(1); + MachineOperand Op2 = MI.getOperand(2); + if (isSingleUsePHI(&Op2) && dominatesAllSingleUseLIs(&Op1, &Op2)) + std::swap(Op1, Op2); + else if (!isSingleUsePHI(&Op1) || !dominatesAllSingleUseLIs(&Op2, &Op1)) + break; // We don't have an ADD fed by LIs that can be transformed + + // Now we know that Op1 is the PHI node and Op2 is the dominator + unsigned DominatorReg = Op2.getReg(); + + const TargetRegisterClass *TRC = MI.getOpcode() == PPC::ADD8 + ? &PPC::G8RC_and_G8RC_NOX0RegClass + : &PPC::GPRC_and_GPRC_NOR0RegClass; + MRI->setRegClass(DominatorReg, TRC); + + // Replace LIs with ADDIs + MachineInstr *DefPhiMI = getVRegDefOrNull(&Op1, MRI); + for (unsigned i = 1; i < DefPhiMI->getNumOperands(); i += 2) { + MachineInstr *LiMI = getVRegDefOrNull(&DefPhiMI->getOperand(i), MRI); + DEBUG(dbgs() << "Optimizing LI to ADDI: "); + DEBUG(LiMI->dump()); + + // There could be repeated registers in the PHI, e.g.: %1 = + // PHI %6, <%bb.2>, %8, <%bb.3>, %8, <%bb.6>; So if we've + // already replaced the def instruction, skip. + if (LiMI->getOpcode() == PPC::ADDI || LiMI->getOpcode() == PPC::ADDI8) + continue; + + assert((LiMI->getOpcode() == PPC::LI || + LiMI->getOpcode() == PPC::LI8) && + "Invalid Opcode!"); + auto LiImm = LiMI->getOperand(1).getImm(); // save the imm of LI + LiMI->RemoveOperand(1); // remove the imm of LI + LiMI->setDesc(TII->get(LiMI->getOpcode() == PPC::LI ? PPC::ADDI + : PPC::ADDI8)); + MachineInstrBuilder(*LiMI->getParent()->getParent(), *LiMI) + .addReg(DominatorReg) + .addImm(LiImm); // restore the imm of LI + DEBUG(LiMI->dump()); + } + + // Replace ADD with COPY + DEBUG(dbgs() << "Optimizing ADD to COPY: "); + DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), + MI.getOperand(0).getReg()) + .add(Op1); + ToErase = &MI; + Simplified = true; + NumOptADDLIs++; + break; + } } } + // If the last instruction was marked for elimination, // remove it now. if (ToErase) { @@ -346,37 +771,502 @@ bool PPCMIPeephole::simplifyCode(void) { } } + // Eliminate all the TOC save instructions which are redundant. + Simplified |= eliminateRedundantTOCSaves(TOCSaves); + // We try to eliminate redundant compare instructions. + Simplified |= eliminateRedundantCompare(); + return Simplified; } -// This is used to find the "true" source register for an -// XXPERMDI instruction, since MachineCSE does not handle the -// "copy-like" operations (Copy and SubregToReg).
Returns -// the original SrcReg unless it is the target of a copy-like -// operation, in which case we chain backwards through all -// such operations to the ultimate source register. If a -// physical register is encountered, we stop the search. -unsigned PPCMIPeephole::lookThruCopyLike(unsigned SrcReg) { +// Helper functions for eliminateRedundantCompare +static bool isEqOrNe(MachineInstr *BI) { + PPC::Predicate Pred = (PPC::Predicate)BI->getOperand(0).getImm(); + unsigned PredCond = PPC::getPredicateCondition(Pred); + return (PredCond == PPC::PRED_EQ || PredCond == PPC::PRED_NE); +} + +static bool isSupportedCmpOp(unsigned opCode) { + return (opCode == PPC::CMPLD || opCode == PPC::CMPD || + opCode == PPC::CMPLW || opCode == PPC::CMPW || + opCode == PPC::CMPLDI || opCode == PPC::CMPDI || + opCode == PPC::CMPLWI || opCode == PPC::CMPWI); +} + +static bool is64bitCmpOp(unsigned opCode) { + return (opCode == PPC::CMPLD || opCode == PPC::CMPD || + opCode == PPC::CMPLDI || opCode == PPC::CMPDI); +} + +static bool isSignedCmpOp(unsigned opCode) { + return (opCode == PPC::CMPD || opCode == PPC::CMPW || + opCode == PPC::CMPDI || opCode == PPC::CMPWI); +} + +static unsigned getSignedCmpOpCode(unsigned opCode) { + if (opCode == PPC::CMPLD) return PPC::CMPD; + if (opCode == PPC::CMPLW) return PPC::CMPW; + if (opCode == PPC::CMPLDI) return PPC::CMPDI; + if (opCode == PPC::CMPLWI) return PPC::CMPWI; + return opCode; +} + +// We can decrement immediate x in (GE x) by changing it to (GT x-1) or +// (LT x) to (LE x-1) +static unsigned getPredicateToDecImm(MachineInstr *BI, MachineInstr *CMPI) { + uint64_t Imm = CMPI->getOperand(2).getImm(); + bool SignedCmp = isSignedCmpOp(CMPI->getOpcode()); + if ((!SignedCmp && Imm == 0) || (SignedCmp && Imm == 0x8000)) + return 0; + + PPC::Predicate Pred = (PPC::Predicate)BI->getOperand(0).getImm(); + unsigned PredCond = PPC::getPredicateCondition(Pred); + unsigned PredHint = PPC::getPredicateHint(Pred); + if (PredCond == PPC::PRED_GE) + return PPC::getPredicate(PPC::PRED_GT, PredHint); + if (PredCond == PPC::PRED_LT) + return PPC::getPredicate(PPC::PRED_LE, PredHint); + + return 0; +} + +// We can increment immediate x in (GT x) by changing it to (GE x+1) or +// (LE x) to (LT x+1) +static unsigned getPredicateToIncImm(MachineInstr *BI, MachineInstr *CMPI) { + uint64_t Imm = CMPI->getOperand(2).getImm(); + bool SignedCmp = isSignedCmpOp(CMPI->getOpcode()); + if ((!SignedCmp && Imm == 0xFFFF) || (SignedCmp && Imm == 0x7FFF)) + return 0; + + PPC::Predicate Pred = (PPC::Predicate)BI->getOperand(0).getImm(); + unsigned PredCond = PPC::getPredicateCondition(Pred); + unsigned PredHint = PPC::getPredicateHint(Pred); + if (PredCond == PPC::PRED_GT) + return PPC::getPredicate(PPC::PRED_GE, PredHint); + if (PredCond == PPC::PRED_LE) + return PPC::getPredicate(PPC::PRED_LT, PredHint); + + return 0; +} + +// This takes a Phi node and returns a register value for the specified BB. +static unsigned getIncomingRegForBlock(MachineInstr *Phi, + MachineBasicBlock *MBB) { + for (unsigned I = 2, E = Phi->getNumOperands() + 1; I != E; I += 2) { + MachineOperand &MO = Phi->getOperand(I); + if (MO.getMBB() == MBB) + return Phi->getOperand(I-1).getReg(); + } + llvm_unreachable("invalid src basic block for this Phi node\n"); + return 0; +} + +// This function tracks the source of the register through register copies. +// If BB1 and BB2 are non-NULL, we also track the PHI instruction in BB2 +// assuming that the control comes from BB1 into BB2.
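// [Editor's worked example -- not part of the original patch] Given
//   %a = ...                            (defined in or above BB1)
//   %p = PHI [ %a, BB1 ], [ %b, BB3 ]   (in BB2)
//   %c = COPY %p
// getSrcVReg(%c, BB1, BB2, MRI) follows the copy to %p, takes the BB1
// incoming value once (yielding %a, then any copies behind it), and stops.
// With BB1/BB2 null, the walk does not look through the PHI at all.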
+static unsigned getSrcVReg(unsigned Reg, MachineBasicBlock *BB1, + MachineBasicBlock *BB2, MachineRegisterInfo *MRI) { + unsigned SrcReg = Reg; + while (1) { + unsigned NextReg = SrcReg; + MachineInstr *Inst = MRI->getVRegDef(SrcReg); + if (BB1 && Inst->getOpcode() == PPC::PHI && Inst->getParent() == BB2) { + NextReg = getIncomingRegForBlock(Inst, BB1); + // We track through PHI only once to avoid an infinite loop. + BB1 = nullptr; + } + else if (Inst->isFullCopy()) + NextReg = Inst->getOperand(1).getReg(); + if (NextReg == SrcReg || !TargetRegisterInfo::isVirtualRegister(NextReg)) + break; + SrcReg = NextReg; + } + return SrcReg; +} + +static bool eligibleForCompareElimination(MachineBasicBlock &MBB, + MachineBasicBlock *&PredMBB, + MachineBasicBlock *&MBBtoMoveCmp, + MachineRegisterInfo *MRI) { + + auto isEligibleBB = [&](MachineBasicBlock &BB) { + auto BII = BB.getFirstInstrTerminator(); + // We optimize BBs ending with a conditional branch. + // We check only for BCC here, not BCCLR, because BCCLR + // will be formed only later in the pipeline. + if (BB.succ_size() == 2 && + BII != BB.instr_end() && + (*BII).getOpcode() == PPC::BCC && + (*BII).getOperand(1).isReg()) { + // We optimize only if the condition code is used only by one BCC. + unsigned CndReg = (*BII).getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(CndReg) || + !MRI->hasOneNonDBGUse(CndReg)) + return false; + + MachineInstr *CMPI = MRI->getVRegDef(CndReg); + // We assume compare and branch are in the same BB for ease of analysis. + if (CMPI->getParent() != &BB) + return false; + + // We skip this BB if a physical register is used in comparison. + for (MachineOperand &MO : CMPI->operands()) + if (MO.isReg() && !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + return false; + + return true; + } + return false; + }; + + // If this BB has more than one successor, we can create a new BB and + // move the compare instruction into the new BB. + // So far, we do not move the compare instruction to a BB having multiple + // successors to avoid potentially increasing code size. + auto isEligibleForMoveCmp = [](MachineBasicBlock &BB) { + return BB.succ_size() == 1; + }; + + if (!isEligibleBB(MBB)) + return false; + + unsigned NumPredBBs = MBB.pred_size(); + if (NumPredBBs == 1) { + MachineBasicBlock *TmpMBB = *MBB.pred_begin(); + if (isEligibleBB(*TmpMBB)) { + PredMBB = TmpMBB; + MBBtoMoveCmp = nullptr; + return true; + } + } + else if (NumPredBBs == 2) { + // We check for the partially redundant case. + // So far, we support cases with only two predecessors + // to avoid increasing the number of instructions. + MachineBasicBlock::pred_iterator PI = MBB.pred_begin(); + MachineBasicBlock *Pred1MBB = *PI; + MachineBasicBlock *Pred2MBB = *(PI+1); + + if (isEligibleBB(*Pred1MBB) && isEligibleForMoveCmp(*Pred2MBB)) { + // We assume Pred1MBB is the BB containing the compare to be merged and + // Pred2MBB is the BB to which we will append a compare instruction. + // Hence we can proceed as is. + } + else if (isEligibleBB(*Pred2MBB) && isEligibleForMoveCmp(*Pred1MBB)) { + // We need to swap Pred1MBB and Pred2MBB to canonicalize. + std::swap(Pred1MBB, Pred2MBB); + } + else return false; + + // Here, Pred2MBB is the BB to which we need to append a compare inst. + // We cannot move the compare instruction if operands are not available + // in Pred2MBB (i.e. defined in MBB by an instruction other than PHI).
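// [Editor's illustration -- not part of the original patch] For the check
// below: if MBB computes "%x = ADD ..." and then compares %x, the compare
// cannot be appended to Pred2MBB, since %x does not exist there.  Operands
// defined outside MBB, or merged into MBB by a PHI (whose incoming value
// for the Pred2MBB edge can be used instead), are safe.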
+ MachineInstr *BI = &*MBB.getFirstInstrTerminator(); + MachineInstr *CMPI = MRI->getVRegDef(BI->getOperand(1).getReg()); + for (int I = 1; I <= 2; I++) + if (CMPI->getOperand(I).isReg()) { + MachineInstr *Inst = MRI->getVRegDef(CMPI->getOperand(I).getReg()); + if (Inst->getParent() == &MBB && Inst->getOpcode() != PPC::PHI) + return false; + } + + PredMBB = Pred1MBB; + MBBtoMoveCmp = Pred2MBB; + return true; + } + + return false; +} + +// This function will iterate over the input map containing pairs of a TOC save +// instruction and a flag. The flag will be set to false if the TOC save is proven +// redundant. This function will erase from the basic block all the TOC saves +// marked as redundant. +bool PPCMIPeephole::eliminateRedundantTOCSaves( + std::map<MachineInstr *, bool> &TOCSaves) { + bool Simplified = false; + int NumKept = 0; + for (auto TOCSave : TOCSaves) { + if (!TOCSave.second) { + TOCSave.first->eraseFromParent(); + RemoveTOCSave++; + Simplified = true; + } else { + NumKept++; + } + } - while (true) { + if (NumKept > 1) + MultiTOCSaves++; - MachineInstr *MI = MRI->getVRegDef(SrcReg); - if (!MI->isCopyLike()) - return SrcReg; + return Simplified; +} - unsigned CopySrcReg; - if (MI->isCopy()) - CopySrcReg = MI->getOperand(1).getReg(); +// If multiple conditional branches are executed based on the (essentially) +// same comparison, we merge compare instructions into one and make multiple +// conditional branches on this comparison. +// For example, +// if (a == 0) { ... } +// else if (a < 0) { ... } +// can be executed by one compare and two conditional branches instead of +// two pairs of a compare and a conditional branch. +// +// This method merges two compare instructions in two MBBs and modifies the +// compare and conditional branch instructions if needed. +// For the above example, the input for this pass looks like: +// cmplwi r3, 0 +// beq 0, .LBB0_3 +// cmpwi r3, -1 +// bgt 0, .LBB0_4 +// So, before merging two compares, we need to modify these instructions as +// cmpwi r3, 0 ; cmplwi and cmpwi yield the same result for beq +// beq 0, .LBB0_3 +// cmpwi r3, 0 ; greater than -1 means greater than or equal to 0 +// bge 0, .LBB0_4 + +bool PPCMIPeephole::eliminateRedundantCompare(void) { + // FIXME: this transformation is causing miscompiles. Disabling it for now + // until we can resolve the issue. + return false; + bool Simplified = false; + + for (MachineBasicBlock &MBB2 : *MF) { + MachineBasicBlock *MBB1 = nullptr, *MBBtoMoveCmp = nullptr; + + // For the fully redundant case, we select two basic blocks MBB1 and MBB2 + // as an optimization target if + // - both MBBs end with a conditional branch, + // - MBB1 is the only predecessor of MBB2, and + // - the compare does not take a physical register as an operand in both MBBs. + // In this case, eligibleForCompareElimination sets MBBtoMoveCmp nullptr. + // + // As a partially redundant case, we additionally handle the case in which + // MBB2 has one additional predecessor, which has only one successor (MBB2). + // In this case, we move the compare instruction originally in MBB2 into + // MBBtoMoveCmp. This partially redundant case typically appears when + // compiling a while loop; here, MBBtoMoveCmp is the loop preheader.
+    //
+    // Overview of CFG of related basic blocks
+    //
+    // Fully redundant case          Partially redundant case
+    //   --------                    ----------------  --------
+    //   | MBB1 | (w/ 2 succ)        | MBBtoMoveCmp |  | MBB1 | (w/ 2 succ)
+    //   --------                    ----------------  --------
+    //      |    \                    (w/ 1 succ) \       |    \
+    //      |     \                                \      |     \
+    //      |                                       \     |
+    //   --------                                   --------
+    //   | MBB2 | (w/ 1 pred                        | MBB2 | (w/ 2 pred
+    //   --------  and 2 succ)                      --------  and 2 succ)
+    //      |    \                                     |    \
+    //      |     \                                    |     \
+    //
+    if (!eligibleForCompareElimination(MBB2, MBB1, MBBtoMoveCmp, MRI))
+      continue;
+
+    MachineInstr *BI1   = &*MBB1->getFirstInstrTerminator();
+    MachineInstr *CMPI1 = MRI->getVRegDef(BI1->getOperand(1).getReg());
+
+    MachineInstr *BI2   = &*MBB2.getFirstInstrTerminator();
+    MachineInstr *CMPI2 = MRI->getVRegDef(BI2->getOperand(1).getReg());
+    bool IsPartiallyRedundant = (MBBtoMoveCmp != nullptr);
+
+    // We cannot optimize an unsupported compare opcode or
+    // a mix of 32-bit and 64-bit comparisons.
+    if (!isSupportedCmpOp(CMPI1->getOpcode()) ||
+        !isSupportedCmpOp(CMPI2->getOpcode()) ||
+        is64bitCmpOp(CMPI1->getOpcode()) != is64bitCmpOp(CMPI2->getOpcode()))
+      continue;
+
+    unsigned NewOpCode = 0;
+    unsigned NewPredicate1 = 0, NewPredicate2 = 0;
+    int16_t Imm1 = 0, NewImm1 = 0, Imm2 = 0, NewImm2 = 0;
+    bool SwapOperands = false;
+
+    if (CMPI1->getOpcode() != CMPI2->getOpcode()) {
+      // Typically, unsigned comparison is used for equality check, but
+      // we replace it with a signed comparison if the comparison
+      // to be merged is a signed comparison.
+      // In other cases of opcode mismatch, we cannot optimize this.
+      if (isEqOrNe(BI2) &&
+          CMPI1->getOpcode() == getSignedCmpOpCode(CMPI2->getOpcode()))
+        NewOpCode = CMPI1->getOpcode();
+      else if (isEqOrNe(BI1) &&
+               getSignedCmpOpCode(CMPI1->getOpcode()) == CMPI2->getOpcode())
+        NewOpCode = CMPI2->getOpcode();
+      else continue;
+    }
+
+    if (CMPI1->getOperand(2).isReg() && CMPI2->getOperand(2).isReg()) {
+      // In case of comparisons between two registers, these two registers
+      // must be the same to merge two comparisons.
+      unsigned Cmp1Operand1 = getSrcVReg(CMPI1->getOperand(1).getReg(),
+                                         nullptr, nullptr, MRI);
+      unsigned Cmp1Operand2 = getSrcVReg(CMPI1->getOperand(2).getReg(),
+                                         nullptr, nullptr, MRI);
+      unsigned Cmp2Operand1 = getSrcVReg(CMPI2->getOperand(1).getReg(),
+                                         MBB1, &MBB2, MRI);
+      unsigned Cmp2Operand2 = getSrcVReg(CMPI2->getOperand(2).getReg(),
+                                         MBB1, &MBB2, MRI);
+
+      if (Cmp1Operand1 == Cmp2Operand1 && Cmp1Operand2 == Cmp2Operand2) {
+        // Same pair of registers in the same order; ready to merge as is.
+      }
+      else if (Cmp1Operand1 == Cmp2Operand2 && Cmp1Operand2 == Cmp2Operand1) {
+        // Same pair of registers in different order.
+        // We reverse the predicate to merge compare instructions.
+        PPC::Predicate Pred = (PPC::Predicate)BI2->getOperand(0).getImm();
+        NewPredicate2 = (unsigned)PPC::getSwappedPredicate(Pred);
+        // In case of partial redundancy, we need to swap operands
+        // in the other compare instruction.
+        SwapOperands = true;
+      }
+      else continue;
+    }
+    else if (CMPI1->getOperand(2).isImm() && CMPI2->getOperand(2).isImm()) {
+      // In case of comparisons between a register and an immediate,
+      // the operand register must be the same for the two compare
+      // instructions.
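+      // E.g. (hypothetical operands): cmpwi %5, 0 in MBB1 and cmpwi %5, -1
+      // in MBB2 test the same register against different immediates; the
+      // code below tries to reconcile such immediates by adjusting the
+      // branch predicates.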
+      unsigned Cmp1Operand1 = getSrcVReg(CMPI1->getOperand(1).getReg(),
+                                         nullptr, nullptr, MRI);
+      unsigned Cmp2Operand1 = getSrcVReg(CMPI2->getOperand(1).getReg(),
+                                         MBB1, &MBB2, MRI);
+      if (Cmp1Operand1 != Cmp2Operand1)
+        continue;
+
+      NewImm1 = Imm1 = (int16_t)CMPI1->getOperand(2).getImm();
+      NewImm2 = Imm2 = (int16_t)CMPI2->getOperand(2).getImm();
+
+      // If the immediates are not the same, we try to adjust by changing the
+      // predicate; e.g. GT imm means GE (imm+1).
+      if (Imm1 != Imm2 && (!isEqOrNe(BI2) || !isEqOrNe(BI1))) {
+        int Diff = Imm1 - Imm2;
+        if (Diff < -2 || Diff > 2)
+          continue;
+
+        unsigned PredToInc1 = getPredicateToIncImm(BI1, CMPI1);
+        unsigned PredToDec1 = getPredicateToDecImm(BI1, CMPI1);
+        unsigned PredToInc2 = getPredicateToIncImm(BI2, CMPI2);
+        unsigned PredToDec2 = getPredicateToDecImm(BI2, CMPI2);
+        if (Diff == 2) {
+          if (PredToInc2 && PredToDec1) {
+            NewPredicate2 = PredToInc2;
+            NewPredicate1 = PredToDec1;
+            NewImm2++;
+            NewImm1--;
+          }
+        }
+        else if (Diff == 1) {
+          if (PredToInc2) {
+            NewImm2++;
+            NewPredicate2 = PredToInc2;
+          }
+          else if (PredToDec1) {
+            NewImm1--;
+            NewPredicate1 = PredToDec1;
+          }
+        }
+        else if (Diff == -1) {
+          if (PredToDec2) {
+            NewImm2--;
+            NewPredicate2 = PredToDec2;
+          }
+          else if (PredToInc1) {
+            NewImm1++;
+            NewPredicate1 = PredToInc1;
+          }
+        }
+        else if (Diff == -2) {
+          if (PredToDec2 && PredToInc1) {
+            NewPredicate2 = PredToDec2;
+            NewPredicate1 = PredToInc1;
+            NewImm2--;
+            NewImm1++;
+          }
+        }
+      }
+
+      // We cannot merge two compares if the immediates are not the same.
+      if (NewImm2 != NewImm1)
+        continue;
+    }
+
+    DEBUG(dbgs() << "Optimize two pairs of compare and branch:\n");
+    DEBUG(CMPI1->dump());
+    DEBUG(BI1->dump());
+    DEBUG(CMPI2->dump());
+    DEBUG(BI2->dump());
+
+    // We adjust opcode, predicates and immediate as we determined above.
+    if (NewOpCode != 0 && NewOpCode != CMPI1->getOpcode()) {
+      CMPI1->setDesc(TII->get(NewOpCode));
+    }
+    if (NewPredicate1) {
+      BI1->getOperand(0).setImm(NewPredicate1);
+    }
+    if (NewPredicate2) {
+      BI2->getOperand(0).setImm(NewPredicate2);
+    }
+    if (NewImm1 != Imm1) {
+      CMPI1->getOperand(2).setImm(NewImm1);
+    }
+
+    if (IsPartiallyRedundant) {
+      // We touch up the compare instruction in MBB2 and move it to
+      // a previous BB to handle the partially redundant case.
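+      // For illustration (hypothetical MIR): if the compare in MBB2 reads a
+      // PHI such as
+      //   %7 = PHI %5, <MBB1>, %6, <MBBtoMoveCmp>
+      // the copy of the compare appended to MBBtoMoveCmp must read %6, the
+      // value incoming from that block, rather than %7.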
+      if (SwapOperands) {
+        unsigned Op1 = CMPI2->getOperand(1).getReg();
+        unsigned Op2 = CMPI2->getOperand(2).getReg();
+        CMPI2->getOperand(1).setReg(Op2);
+        CMPI2->getOperand(2).setReg(Op1);
+      }
+      if (NewImm2 != Imm2)
+        CMPI2->getOperand(2).setImm(NewImm2);
+
+      for (int I = 1; I <= 2; I++) {
+        if (CMPI2->getOperand(I).isReg()) {
+          MachineInstr *Inst = MRI->getVRegDef(CMPI2->getOperand(I).getReg());
+          if (Inst->getParent() != &MBB2)
+            continue;
+
+          assert(Inst->getOpcode() == PPC::PHI &&
+                 "We cannot support if an operand comes from this BB.");
+          unsigned SrcReg = getIncomingRegForBlock(Inst, MBBtoMoveCmp);
+          CMPI2->getOperand(I).setReg(SrcReg);
+        }
+      }
+      auto I = MachineBasicBlock::iterator(MBBtoMoveCmp->getFirstTerminator());
+      MBBtoMoveCmp->splice(I, &MBB2, MachineBasicBlock::iterator(CMPI2));
+
+      DebugLoc DL = CMPI2->getDebugLoc();
+      unsigned NewVReg = MRI->createVirtualRegister(&PPC::CRRCRegClass);
+      BuildMI(MBB2, MBB2.begin(), DL,
+              TII->get(PPC::PHI), NewVReg)
+        .addReg(BI1->getOperand(1).getReg()).addMBB(MBB1)
+        .addReg(BI2->getOperand(1).getReg()).addMBB(MBBtoMoveCmp);
+      BI2->getOperand(1).setReg(NewVReg);
+    } else {
-      assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike");
-      CopySrcReg = MI->getOperand(2).getReg();
+      // We finally eliminate the compare instruction in MBB2.
+      BI2->getOperand(1).setReg(BI1->getOperand(1).getReg());
+      CMPI2->eraseFromParent();
     }
+    BI2->getOperand(1).setIsKill(true);
+    BI1->getOperand(1).setIsKill(false);
-    if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg))
-      return CopySrcReg;
+    DEBUG(dbgs() << "into a compare and two branches:\n");
+    DEBUG(CMPI1->dump());
+    DEBUG(BI1->dump());
+    DEBUG(BI2->dump());
+    if (IsPartiallyRedundant) {
+      DEBUG(dbgs() << "The following compare is moved into "
+                   << printMBBReference(*MBBtoMoveCmp)
+                   << " to handle partial redundancy.\n");
+      DEBUG(CMPI2->dump());
+    }
-    SrcReg = CopySrcReg;
+    Simplified = true;
   }
+
+  return Simplified;
+}
 
 } // end anonymous namespace
 
diff --git a/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h b/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h
new file mode 100644
index 0000000000000..628ea2ab9fe62
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMachineBasicBlockUtils.h
@@ -0,0 +1,198 @@
+//==-- PPCMachineBasicBlockUtils.h - Functions for common MBB operations ---==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines utility functions for commonly used operations on
+// MachineBasicBlocks.
+// NOTE: Include this file after defining DEBUG_TYPE so that the debug messages
+// can be emitted for the pass that is using this.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_PPC_MACHINE_BASIC_BLOCK_UTILS_H
+#define LLVM_LIB_TARGET_PPC_MACHINE_BASIC_BLOCK_UTILS_H
+
+#include "PPCInstrInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#ifndef DEBUG_TYPE
+#define DEBUG_TYPE "ppc-generic-mbb-utilities"
+#endif
+
+using namespace llvm;
+
+/// Given a basic block \p Successor that potentially contains PHIs, this
+/// function will look for any incoming values in the PHIs that are supposed to
+/// be coming from \p OrigMBB but whose definition is actually in \p NewMBB.
+/// Any such PHIs will be updated to reflect reality.
+static void updatePHIs(MachineBasicBlock *Successor, MachineBasicBlock *OrigMBB,
+                       MachineBasicBlock *NewMBB, MachineRegisterInfo *MRI) {
+  for (auto &MI : Successor->instrs()) {
+    if (!MI.isPHI())
+      continue;
+    // This is a really ugly-looking loop, but it was pillaged directly from
+    // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
+    for (unsigned i = 2, e = MI.getNumOperands()+1; i != e; i += 2) {
+      MachineOperand &MO = MI.getOperand(i);
+      if (MO.getMBB() == OrigMBB) {
+        // Check if the instruction is actually defined in NewMBB.
+        if (MI.getOperand(i-1).isReg()) {
+          MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(i-1).getReg());
+          if (DefMI->getParent() == NewMBB ||
+              !OrigMBB->isSuccessor(Successor)) {
+            MO.setMBB(NewMBB);
+            break;
+          }
+        }
+      }
+    }
+  }
+}
+
+/// Given a basic block \p Successor that potentially contains PHIs, this
+/// function will look for PHIs that have an incoming value from \p OrigMBB
+/// and will add the same incoming value from \p NewMBB.
+/// NOTE: This should only be used if \p NewMBB is an immediate dominator of
+/// \p OrigMBB.
+static void addIncomingValuesToPHIs(MachineBasicBlock *Successor,
+                                    MachineBasicBlock *OrigMBB,
+                                    MachineBasicBlock *NewMBB,
+                                    MachineRegisterInfo *MRI) {
+  assert(OrigMBB->isSuccessor(NewMBB) &&
+         "NewMBB must be a successor of OrigMBB");
+  for (auto &MI : Successor->instrs()) {
+    if (!MI.isPHI())
+      continue;
+    // This is a really ugly-looking loop, but it was pillaged directly from
+    // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
+    for (unsigned i = 2, e = MI.getNumOperands()+1; i != e; i += 2) {
+      MachineOperand &MO = MI.getOperand(i);
+      if (MO.getMBB() == OrigMBB) {
+        MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI);
+        MIB.addReg(MI.getOperand(i-1).getReg()).addMBB(NewMBB);
+        break;
+      }
+    }
+  }
+}
+
+struct BlockSplitInfo {
+  MachineInstr *OrigBranch;
+  MachineInstr *SplitBefore;
+  MachineInstr *SplitCond;
+  bool InvertNewBranch;
+  bool InvertOrigBranch;
+  bool BranchToFallThrough;
+  const MachineBranchProbabilityInfo *MBPI;
+  MachineInstr *MIToDelete;
+  MachineInstr *NewCond;
+  bool allInstrsInSameMBB() {
+    if (!OrigBranch || !SplitBefore || !SplitCond)
+      return false;
+    MachineBasicBlock *MBB = OrigBranch->getParent();
+    if (SplitBefore->getParent() != MBB ||
+        SplitCond->getParent() != MBB)
+      return false;
+    if (MIToDelete && MIToDelete->getParent() != MBB)
+      return false;
+    if (NewCond && NewCond->getParent() != MBB)
+      return false;
+    return true;
+  }
+};
+
+/// Splits a MachineBasicBlock to branch before \p SplitBefore. The original
+/// branch is \p OrigBranch. The target of the new branch can either be the
+/// same as the target of the original branch or the fallthrough successor of
+/// the original block as determined by \p BranchToFallThrough. The branch
+/// conditions will be inverted according to \p InvertNewBranch and
+/// \p InvertOrigBranch. If an instruction that previously fed the branch is to
+/// be deleted, it is provided in \p MIToDelete and \p NewCond will be used as
+/// the branch condition. The branch probabilities will be set if the
+/// MachineBranchProbabilityInfo isn't null.
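+/// A hypothetical invocation (field values purely illustrative) fills in a
+/// BlockSplitInfo and calls splitMBB:
+///   BlockSplitInfo BSI{Branch, SplitBefore, SplitCond,
+///                      /*InvertNewBranch=*/true, /*InvertOrigBranch=*/false,
+///                      /*BranchToFallThrough=*/true, MBPI,
+///                      /*MIToDelete=*/nullptr, /*NewCond=*/nullptr};
+///   splitMBB(BSI);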
+static bool splitMBB(BlockSplitInfo &BSI) {
+  assert(BSI.allInstrsInSameMBB() &&
+         "All instructions must be in the same block.");
+
+  MachineBasicBlock *ThisMBB = BSI.OrigBranch->getParent();
+  MachineFunction *MF = ThisMBB->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  assert(MRI->isSSA() && "Can only do this while the function is in SSA form.");
+  if (ThisMBB->succ_size() != 2) {
+    DEBUG(dbgs() << "Don't know how to handle blocks that don't have exactly"
+                 << " two successors.\n");
+    return false;
+  }
+
+  const PPCInstrInfo *TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+  unsigned OrigBROpcode = BSI.OrigBranch->getOpcode();
+  unsigned InvertedOpcode =
+    OrigBROpcode == PPC::BC ? PPC::BCn :
+    OrigBROpcode == PPC::BCn ? PPC::BC :
+    OrigBROpcode == PPC::BCLR ? PPC::BCLRn : PPC::BCLR;
+  unsigned NewBROpcode = BSI.InvertNewBranch ? InvertedOpcode : OrigBROpcode;
+  MachineBasicBlock *OrigTarget = BSI.OrigBranch->getOperand(1).getMBB();
+  MachineBasicBlock *OrigFallThrough =
+    OrigTarget == *ThisMBB->succ_begin() ? *ThisMBB->succ_rbegin() :
+    *ThisMBB->succ_begin();
+  MachineBasicBlock *NewBRTarget =
+    BSI.BranchToFallThrough ? OrigFallThrough : OrigTarget;
+  BranchProbability ProbToNewTarget =
+    !BSI.MBPI ? BranchProbability::getUnknown() :
+    BSI.MBPI->getEdgeProbability(ThisMBB, NewBRTarget);
+
+  // Create a new basic block.
+  MachineBasicBlock::iterator InsertPoint = BSI.SplitBefore;
+  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
+  MachineFunction::iterator It = ThisMBB->getIterator();
+  MachineBasicBlock *NewMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(++It, NewMBB);
+
+  // Move everything after SplitBefore into the new block.
+  NewMBB->splice(NewMBB->end(), ThisMBB, InsertPoint, ThisMBB->end());
+  NewMBB->transferSuccessors(ThisMBB);
+
+  // Add the two successors to ThisMBB. The probabilities come from the
+  // existing blocks if available.
+  ThisMBB->addSuccessor(NewBRTarget, ProbToNewTarget);
+  ThisMBB->addSuccessor(NewMBB, ProbToNewTarget.getCompl());
+
+  // Add the branches to ThisMBB.
+  BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
+          TII->get(NewBROpcode)).addReg(BSI.SplitCond->getOperand(0).getReg())
+    .addMBB(NewBRTarget);
+  BuildMI(*ThisMBB, ThisMBB->end(), BSI.SplitBefore->getDebugLoc(),
+          TII->get(PPC::B)).addMBB(NewMBB);
+  if (BSI.MIToDelete)
+    BSI.MIToDelete->eraseFromParent();
+
+  // Change the condition on the original branch and invert it if requested.
+  auto FirstTerminator = NewMBB->getFirstTerminator();
+  if (BSI.NewCond) {
+    assert(FirstTerminator->getOperand(0).isReg() &&
+           "Can't update condition of unconditional branch.");
+    FirstTerminator->getOperand(0).setReg(BSI.NewCond->getOperand(0).getReg());
+  }
+  if (BSI.InvertOrigBranch)
+    FirstTerminator->setDesc(TII->get(InvertedOpcode));
+
+  // If any of the PHIs in the successors of NewMBB reference values that
+  // now come from NewMBB, they need to be updated.
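+  // For example (hypothetical operands): a PHI in a successor block such as
+  //   %10 = PHI %7, <ThisMBB>, %8, <OtherMBB>
+  // must be rewritten to take %7 from NewMBB whenever the definition of %7
+  // was moved there by the splice above.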
+  for (auto *Succ : NewMBB->successors()) {
+    updatePHIs(Succ, ThisMBB, NewMBB, MRI);
+  }
+  addIncomingValuesToPHIs(NewBRTarget, ThisMBB, NewMBB, MRI);
+
+  DEBUG(dbgs() << "After splitting, ThisMBB:\n"; ThisMBB->dump());
+  DEBUG(dbgs() << "NewMBB:\n"; NewMBB->dump());
+  DEBUG(dbgs() << "New branch-to block:\n"; NewBRTarget->dump());
+  return true;
+}
+
+
+#endif
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
index bc2d9a08b5e86..3923417257e8c 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -43,3 +43,17 @@ MCSymbol *PPCFunctionInfo::getTOCOffsetSymbol() const {
                                            "func_toc" +
                                            Twine(MF.getFunctionNumber()));
 }
+
+bool PPCFunctionInfo::isLiveInSExt(unsigned VReg) const {
+  for (const std::pair<unsigned, ISD::ArgFlagsTy> &LiveIn : LiveInAttrs)
+    if (LiveIn.first == VReg)
+      return LiveIn.second.isSExt();
+  return false;
+}
+
+bool PPCFunctionInfo::isLiveInZExt(unsigned VReg) const {
+  for (const std::pair<unsigned, ISD::ArgFlagsTy> &LiveIn : LiveInAttrs)
+    if (LiveIn.first == VReg)
+      return LiveIn.second.isZExt();
+  return false;
+}
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 202e10058b733..a9b6073106eae 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -16,6 +16,7 @@
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/TargetCallingConv.h"
 
 namespace llvm {
 
@@ -113,6 +114,10 @@ class PPCFunctionInfo : public MachineFunctionInfo {
   /// copies
   bool IsSplitCSR = false;
 
+  /// We keep track of attributes for each live-in virtual register
+  /// to use the SExt/ZExt flags in later optimizations.
+  std::vector<std::pair<unsigned, ISD::ArgFlagsTy>> LiveInAttrs;
+
 public:
   explicit PPCFunctionInfo(MachineFunction &MF) : MF(MF) {}
 
@@ -175,6 +180,19 @@ public:
   unsigned getVarArgsNumFPR() const { return VarArgsNumFPR; }
   void setVarArgsNumFPR(unsigned Num) { VarArgsNumFPR = Num; }
 
+  /// This function associates attributes with each live-in virtual register.
+  void addLiveInAttr(unsigned VReg, ISD::ArgFlagsTy Flags) {
+    LiveInAttrs.push_back(std::make_pair(VReg, Flags));
+  }
+
+  /// This function returns true if the specified vreg is
+  /// a live-in register and sign-extended.
+  bool isLiveInSExt(unsigned VReg) const;
+
+  /// This function returns true if the specified vreg is
+  /// a live-in register and zero-extended.
+  bool isLiveInZExt(unsigned VReg) const;
+
   int getCRSpillFrameIndex() const { return CRSpillFrameIndex; }
   void setCRSpillFrameIndex(int idx) { CRSpillFrameIndex = idx; }
 
diff --git a/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
new file mode 100644
index 0000000000000..9501f0f89b81b
--- /dev/null
+++ b/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -0,0 +1,95 @@
+//===--------- PPCPreEmitPeephole.cpp - Late peephole optimizations -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pre-emit peephole for catching opportunities introduced by late passes
+// such as MachineBlockPlacement.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-pre-emit-peephole"
+
+STATISTIC(NumRRConvertedInPreEmit,
+          "Number of r+r instructions converted to r+i in pre-emit peephole");
+STATISTIC(NumRemovedInPreEmit,
+          "Number of instructions deleted in pre-emit peephole");
+
+static cl::opt<bool>
+RunPreEmitPeephole("ppc-late-peephole", cl::Hidden, cl::init(false),
+                   cl::desc("Run pre-emit peephole optimizations."));
+
+namespace {
+  class PPCPreEmitPeephole : public MachineFunctionPass {
+  public:
+    static char ID;
+    PPCPreEmitPeephole() : MachineFunctionPass(ID) {
+      initializePPCPreEmitPeepholePass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      if (skipFunction(MF.getFunction()) || !RunPreEmitPeephole)
+        return false;
+      bool Changed = false;
+      const PPCInstrInfo *TII = MF.getSubtarget<PPCSubtarget>().getInstrInfo();
+      SmallVector<MachineInstr *, 4> InstrsToErase;
+      for (MachineBasicBlock &MBB : MF) {
+        for (MachineInstr &MI : MBB) {
+          MachineInstr *DefMIToErase = nullptr;
+          if (TII->convertToImmediateForm(MI, &DefMIToErase)) {
+            Changed = true;
+            NumRRConvertedInPreEmit++;
+            DEBUG(dbgs() << "Converted instruction to imm form: ");
+            DEBUG(MI.dump());
+            if (DefMIToErase) {
+              InstrsToErase.push_back(DefMIToErase);
+            }
+          }
+        }
+      }
+      for (MachineInstr *MI : InstrsToErase) {
+        DEBUG(dbgs() << "PPC pre-emit peephole: erasing instruction: ");
+        DEBUG(MI->dump());
+        MI->eraseFromParent();
+        NumRemovedInPreEmit++;
+      }
+      return Changed;
+    }
+  };
+}
+
+INITIALIZE_PASS(PPCPreEmitPeephole, DEBUG_TYPE, "PowerPC Pre-Emit Peephole",
+                false, false)
+char PPCPreEmitPeephole::ID = 0;
+
+FunctionPass *llvm::createPPCPreEmitPeepholePass() {
+  return new PPCPreEmitPeephole();
+}
diff --git a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
index 8a18ab9e0e9a3..25b2b54cbe98a 100644
--- a/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
+++ b/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
@@ -22,9 +22,9 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
 #define DEBUG_TYPE "ppc-qpx-load-splat"
@@ -60,7 +60,7 @@ FunctionPass *llvm::createPPCQPXLoadSplatPass() {
 }
 
 bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
-  if (skipFunction(*MF.getFunction()))
+  if (skipFunction(MF.getFunction()))
     return false;
 
   bool MadeChange = false;
@@ -79,8 +79,8 @@ bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
       }
 
       // We're looking for a sequence like this:
-      // %F0 = LFD 0, %X3, %QF0; mem:LD8[%a](tbaa=!2)
-      // %QF1 = QVESPLATI %QF0, 0, %RM
+      // %f0 = LFD 0, killed %x3, implicit-def %qf0; mem:LD8[%a](tbaa=!2)
+      // %qf1 = QVESPLATI killed %qf0, 0, implicit %rm
 
     for (auto SI = Splats.begin(); SI != Splats.end();) {
       MachineInstr *SMI = *SI;
diff --git a/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
new file mode 100644
index 0000000000000..5b2d7191683c0
--- /dev/null
+++ b/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
@@ -0,0 +1,535 @@
+//===---- PPCReduceCRLogicals.cpp - Reduce CR Bit Logical operations ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass aims to reduce the number of logical operations on bits in the CR
+// register. These instructions have a fairly high latency and only a single
+// pipeline at their disposal in modern PPC cores. Furthermore, they have a
+// tendency to occur in fairly small blocks where there's little opportunity
+// to hide the latency between the CR logical operation and its user.
+//
+//===---------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-reduce-cr-ops"
+#include "PPCMachineBasicBlockUtils.h"
+
+STATISTIC(NumContainedSingleUseBinOps,
+          "Number of single-use binary CR logical ops contained in a block");
+STATISTIC(NumToSplitBlocks,
+          "Number of binary CR logical ops that can be used to split blocks");
+STATISTIC(TotalCRLogicals, "Number of CR logical ops.");
+STATISTIC(TotalNullaryCRLogicals,
+          "Number of nullary CR logical ops (CRSET/CRUNSET).");
+STATISTIC(TotalUnaryCRLogicals, "Number of unary CR logical ops.");
+STATISTIC(TotalBinaryCRLogicals, "Number of binary CR logical ops.");
+STATISTIC(NumBlocksSplitOnBinaryCROp,
+          "Number of blocks split on CR binary logical ops.");
+STATISTIC(NumNotSplitIdenticalOperands,
+          "Number of blocks not split due to operands being identical.");
+STATISTIC(NumNotSplitChainCopies,
+          "Number of blocks not split due to operands being chained copies.");
+STATISTIC(NumNotSplitWrongOpcode,
+          "Number of blocks not split due to the wrong opcode.");
+
+namespace llvm {
+  void initializePPCReduceCRLogicalsPass(PassRegistry&);
+}
+
+namespace {
+
+static bool isBinary(MachineInstr &MI) {
+  return MI.getNumOperands() == 3;
+}
+
+static bool isNullary(MachineInstr &MI) {
+  return MI.getNumOperands() == 1;
+}
+
+/// Given a CR logical operation \p CROp, a branch opcode \p BROp, as well as
+/// a flag to indicate if the first operand of \p CROp is used as the
+/// SplitBefore operand, this function determines whether either of the
+/// branches is to be inverted as well as whether the new target should be
+/// the original fall-through block.
+static void
+computeBranchTargetAndInversion(unsigned CROp, unsigned BROp, bool UsingDef1,
+                                bool &InvertNewBranch, bool &InvertOrigBranch,
+                                bool &TargetIsFallThrough) {
+  // The conditions under which each of the output operands should be [un]set
+  // can certainly be written much more concisely with just 3 if statements or
+  // ternary expressions. However, this provides a much clearer overview to the
+  // reader as to what is set for each combination.
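+  // For instance (one illustrative row of the tables below): a CRAND feeding
+  // a plain BC takes the original target only when both inputs are true, so
+  // the new branch tests one input, is inverted, and targets the fall-through
+  // block; the original branch then only needs to test the remaining input.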
+  if (BROp == PPC::BC || BROp == PPC::BCLR) {
+    // Regular branches.
+    switch (CROp) {
+    default:
+      llvm_unreachable("Don't know how to handle this CR logical.");
+    case PPC::CROR:
+      InvertNewBranch = false;
+      InvertOrigBranch = false;
+      TargetIsFallThrough = false;
+      return;
+    case PPC::CRAND:
+      InvertNewBranch = true;
+      InvertOrigBranch = false;
+      TargetIsFallThrough = true;
+      return;
+    case PPC::CRNAND:
+      InvertNewBranch = true;
+      InvertOrigBranch = true;
+      TargetIsFallThrough = false;
+      return;
+    case PPC::CRNOR:
+      InvertNewBranch = false;
+      InvertOrigBranch = true;
+      TargetIsFallThrough = true;
+      return;
+    case PPC::CRORC:
+      InvertNewBranch = UsingDef1;
+      InvertOrigBranch = !UsingDef1;
+      TargetIsFallThrough = false;
+      return;
+    case PPC::CRANDC:
+      InvertNewBranch = !UsingDef1;
+      InvertOrigBranch = !UsingDef1;
+      TargetIsFallThrough = true;
+      return;
+    }
+  } else if (BROp == PPC::BCn || BROp == PPC::BCLRn) {
+    // Negated branches.
+    switch (CROp) {
+    default:
+      llvm_unreachable("Don't know how to handle this CR logical.");
+    case PPC::CROR:
+      InvertNewBranch = true;
+      InvertOrigBranch = false;
+      TargetIsFallThrough = true;
+      return;
+    case PPC::CRAND:
+      InvertNewBranch = false;
+      InvertOrigBranch = false;
+      TargetIsFallThrough = false;
+      return;
+    case PPC::CRNAND:
+      InvertNewBranch = false;
+      InvertOrigBranch = true;
+      TargetIsFallThrough = true;
+      return;
+    case PPC::CRNOR:
+      InvertNewBranch = true;
+      InvertOrigBranch = true;
+      TargetIsFallThrough = false;
+      return;
+    case PPC::CRORC:
+      InvertNewBranch = !UsingDef1;
+      InvertOrigBranch = !UsingDef1;
+      TargetIsFallThrough = true;
+      return;
+    case PPC::CRANDC:
+      InvertNewBranch = UsingDef1;
+      InvertOrigBranch = !UsingDef1;
+      TargetIsFallThrough = false;
+      return;
+    }
+  } else
+    llvm_unreachable("Don't know how to handle this branch.");
+}
+
+class PPCReduceCRLogicals : public MachineFunctionPass {
+
+public:
+  static char ID;
+  struct CRLogicalOpInfo {
+    MachineInstr *MI;
+    // FIXME: If chains of copies are to be handled, this should be a vector.
+    std::pair<MachineInstr *, MachineInstr *> CopyDefs;
+    std::pair<MachineInstr *, MachineInstr *> TrueDefs;
+    unsigned IsBinary : 1;
+    unsigned IsNullary : 1;
+    unsigned ContainedInBlock : 1;
+    unsigned FeedsISEL : 1;
+    unsigned FeedsBR : 1;
+    unsigned FeedsLogical : 1;
+    unsigned SingleUse : 1;
+    unsigned DefsSingleUse : 1;
+    unsigned SubregDef1;
+    unsigned SubregDef2;
+    CRLogicalOpInfo() : MI(nullptr), IsBinary(0), IsNullary(0),
+                        ContainedInBlock(0), FeedsISEL(0), FeedsBR(0),
+                        FeedsLogical(0), SingleUse(0), DefsSingleUse(1),
+                        SubregDef1(0), SubregDef2(0) { }
+    void dump();
+  };
+
+private:
+  const PPCInstrInfo *TII;
+  MachineFunction *MF;
+  MachineRegisterInfo *MRI;
+  const MachineBranchProbabilityInfo *MBPI;
+
+  // A vector to contain all the CR logical operations.
+  std::vector<CRLogicalOpInfo> AllCRLogicalOps;
+  void initialize(MachineFunction &MFParm);
+  void collectCRLogicals();
+  bool handleCROp(CRLogicalOpInfo &CRI);
+  bool splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI);
+  static bool isCRLogical(MachineInstr &MI) {
+    unsigned Opc = MI.getOpcode();
+    return Opc == PPC::CRAND || Opc == PPC::CRNAND || Opc == PPC::CROR ||
+           Opc == PPC::CRXOR || Opc == PPC::CRNOR || Opc == PPC::CREQV ||
+           Opc == PPC::CRANDC || Opc == PPC::CRORC || Opc == PPC::CRSET ||
+           Opc == PPC::CRUNSET || Opc == PPC::CR6SET || Opc == PPC::CR6UNSET;
+  }
+  bool simplifyCode() {
+    bool Changed = false;
+    // Not using a range-based for loop here as the vector may grow while being
+    // operated on.
+    for (unsigned i = 0; i < AllCRLogicalOps.size(); i++)
+      Changed |= handleCROp(AllCRLogicalOps[i]);
+    return Changed;
+  }
+
+public:
+  PPCReduceCRLogicals() : MachineFunctionPass(ID) {
+    initializePPCReduceCRLogicalsPass(*PassRegistry::getPassRegistry());
+  }
+
+  MachineInstr *lookThroughCRCopy(unsigned Reg, unsigned &Subreg,
+                                  MachineInstr *&CpDef);
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
+
+    // If the subtarget doesn't use CR bits, there's nothing to do.
+    const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+    if (!STI.useCRBits())
+      return false;
+
+    initialize(MF);
+    collectCRLogicals();
+    return simplifyCode();
+  }
+  CRLogicalOpInfo createCRLogicalOpInfo(MachineInstr &MI);
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineBranchProbabilityInfo>();
+    AU.addRequired<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void PPCReduceCRLogicals::CRLogicalOpInfo::dump() {
+  dbgs() << "CRLogicalOpMI: ";
+  MI->dump();
+  dbgs() << "IsBinary: " << IsBinary << ", FeedsISEL: " << FeedsISEL;
+  dbgs() << ", FeedsBR: " << FeedsBR << ", FeedsLogical: ";
+  dbgs() << FeedsLogical << ", SingleUse: " << SingleUse;
+  dbgs() << ", DefsSingleUse: " << DefsSingleUse;
+  dbgs() << ", SubregDef1: " << SubregDef1 << ", SubregDef2: ";
+  dbgs() << SubregDef2 << ", ContainedInBlock: " << ContainedInBlock;
+  if (!IsNullary) {
+    dbgs() << "\nDefs:\n";
+    TrueDefs.first->dump();
+  }
+  if (IsBinary)
+    TrueDefs.second->dump();
+  dbgs() << "\n";
+  if (CopyDefs.first) {
+    dbgs() << "CopyDef1: ";
+    CopyDefs.first->dump();
+  }
+  if (CopyDefs.second) {
+    dbgs() << "CopyDef2: ";
+    CopyDefs.second->dump();
+  }
+}
+#endif
+
+PPCReduceCRLogicals::CRLogicalOpInfo
+PPCReduceCRLogicals::createCRLogicalOpInfo(MachineInstr &MIParam) {
+  CRLogicalOpInfo Ret;
+  Ret.MI = &MIParam;
+  // Get the defs.
+  if (isNullary(MIParam)) {
+    Ret.IsNullary = 1;
+    Ret.TrueDefs = std::make_pair(nullptr, nullptr);
+    Ret.CopyDefs = std::make_pair(nullptr, nullptr);
+  } else {
+    MachineInstr *Def1 = lookThroughCRCopy(MIParam.getOperand(1).getReg(),
+                                           Ret.SubregDef1, Ret.CopyDefs.first);
+    Ret.DefsSingleUse &=
+      MRI->hasOneNonDBGUse(Def1->getOperand(0).getReg());
+    Ret.DefsSingleUse &=
+      MRI->hasOneNonDBGUse(Ret.CopyDefs.first->getOperand(0).getReg());
+    assert(Def1 && "Must be able to find a definition of operand 1.");
+    if (isBinary(MIParam)) {
+      Ret.IsBinary = 1;
+      MachineInstr *Def2 = lookThroughCRCopy(MIParam.getOperand(2).getReg(),
+                                             Ret.SubregDef2,
+                                             Ret.CopyDefs.second);
+      Ret.DefsSingleUse &=
+        MRI->hasOneNonDBGUse(Def2->getOperand(0).getReg());
+      Ret.DefsSingleUse &=
+        MRI->hasOneNonDBGUse(Ret.CopyDefs.second->getOperand(0).getReg());
+      assert(Def2 && "Must be able to find a definition of operand 2.");
+      Ret.TrueDefs = std::make_pair(Def1, Def2);
+    } else {
+      Ret.TrueDefs = std::make_pair(Def1, nullptr);
+      Ret.CopyDefs.second = nullptr;
+    }
+  }
+
+  Ret.ContainedInBlock = 1;
+  // Get the uses.
+  for (MachineInstr &UseMI :
+       MRI->use_nodbg_instructions(MIParam.getOperand(0).getReg())) {
+    unsigned Opc = UseMI.getOpcode();
+    if (Opc == PPC::ISEL || Opc == PPC::ISEL8)
+      Ret.FeedsISEL = 1;
+    if (Opc == PPC::BC || Opc == PPC::BCn || Opc == PPC::BCLR ||
+        Opc == PPC::BCLRn)
+      Ret.FeedsBR = 1;
+    Ret.FeedsLogical = isCRLogical(UseMI);
+    if (UseMI.getParent() != MIParam.getParent())
+      Ret.ContainedInBlock = 0;
+  }
+  Ret.SingleUse = MRI->hasOneNonDBGUse(MIParam.getOperand(0).getReg()) ? 1 : 0;
+
+  // We now know whether all the uses of the CR logical are in the same block.
+  if (!Ret.IsNullary) {
+    Ret.ContainedInBlock &=
+      (MIParam.getParent() == Ret.TrueDefs.first->getParent());
+    if (Ret.IsBinary)
+      Ret.ContainedInBlock &=
+        (MIParam.getParent() == Ret.TrueDefs.second->getParent());
+  }
+  DEBUG(Ret.dump());
+  if (Ret.IsBinary && Ret.ContainedInBlock && Ret.SingleUse) {
+    NumContainedSingleUseBinOps++;
+    if (Ret.FeedsBR && Ret.DefsSingleUse)
+      NumToSplitBlocks++;
+  }
+  return Ret;
+}
+
+/// Looks through a COPY instruction to the actual definition of the CR-bit
+/// register and returns the instruction that defines it.
+/// FIXME: This currently handles what is by-far the most common case:
+/// an instruction that defines a CR field followed by a single copy of a bit
+/// from that field into a virtual register. If chains of copies need to be
+/// handled, this should have a loop until a non-copy instruction is found.
+MachineInstr *PPCReduceCRLogicals::lookThroughCRCopy(unsigned Reg,
+                                                     unsigned &Subreg,
+                                                     MachineInstr *&CpDef) {
+  Subreg = -1;
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return nullptr;
+  MachineInstr *Copy = MRI->getVRegDef(Reg);
+  CpDef = Copy;
+  if (!Copy->isCopy())
+    return Copy;
+  unsigned CopySrc = Copy->getOperand(1).getReg();
+  Subreg = Copy->getOperand(1).getSubReg();
+  if (!TargetRegisterInfo::isVirtualRegister(CopySrc)) {
+    const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+    // Set the Subreg.
+    if (CopySrc == PPC::CR0EQ || CopySrc == PPC::CR6EQ)
+      Subreg = PPC::sub_eq;
+    if (CopySrc == PPC::CR0LT || CopySrc == PPC::CR6LT)
+      Subreg = PPC::sub_lt;
+    if (CopySrc == PPC::CR0GT || CopySrc == PPC::CR6GT)
+      Subreg = PPC::sub_gt;
+    if (CopySrc == PPC::CR0UN || CopySrc == PPC::CR6UN)
+      Subreg = PPC::sub_un;
+    // Loop backwards and return the first MI that modifies the physical CR
+    // register.
+    MachineBasicBlock::iterator Me = Copy, B = Copy->getParent()->begin();
+    while (Me != B)
+      if ((--Me)->modifiesRegister(CopySrc, TRI))
+        return &*Me;
+    return nullptr;
+  }
+  return MRI->getVRegDef(CopySrc);
+}
+
+void PPCReduceCRLogicals::initialize(MachineFunction &MFParam) {
+  MF = &MFParam;
+  MRI = &MF->getRegInfo();
+  TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+
+  AllCRLogicalOps.clear();
+}
+
+/// Contains all the implemented transformations on CR logical operations.
+/// For example, a binary CR logical can be used to split a block on its
+/// inputs, a unary CR logical might be used to change the condition code on
+/// a comparison feeding it. A nullary CR logical might simply be removable
+/// if the user of the bit it [un]sets can be transformed.
+bool PPCReduceCRLogicals::handleCROp(CRLogicalOpInfo &CRI) {
+  // We can definitely split a block on the inputs to a binary CR operation
+  // whose defs and (single) use are within the same block.
+  bool Changed = false;
+  if (CRI.IsBinary && CRI.ContainedInBlock && CRI.SingleUse && CRI.FeedsBR &&
+      CRI.DefsSingleUse) {
+    Changed = splitBlockOnBinaryCROp(CRI);
+    if (Changed)
+      NumBlocksSplitOnBinaryCROp++;
+  }
+  return Changed;
+}
+
+/// Splits a block that contains a CR-logical operation that feeds a branch
+/// and whose operands are produced within the block.
+/// Example:
+///    %vr5 = CMPDI %vr2, 0; CRRC:%vr5 G8RC:%vr2
+///    %vr6 = COPY %vr5:sub_eq; CRBITRC:%vr6 CRRC:%vr5
+///    %vr7 = CMPDI %vr3, 0; CRRC:%vr7 G8RC:%vr3
+///    %vr8 = COPY %vr7:sub_eq; CRBITRC:%vr8 CRRC:%vr7
+///    %vr9 = CROR %vr6, %vr8; CRBITRC:%vr9,%vr6,%vr8
+///    BC %vr9, <BB#2>; CRBITRC:%vr9
+/// Becomes:
+///    %vr5 = CMPDI %vr2, 0; CRRC:%vr5 G8RC:%vr2
+///    %vr6 = COPY %vr5:sub_eq; CRBITRC:%vr6 CRRC:%vr5
+///    BC %vr6, <BB#2>; CRBITRC:%vr6
+///
+///    %vr7 = CMPDI %vr3, 0; CRRC:%vr7 G8RC:%vr3
+///    %vr8 = COPY %vr7:sub_eq; CRBITRC:%vr8 CRRC:%vr7
+///    BC %vr8, <BB#2>; CRBITRC:%vr8
+bool PPCReduceCRLogicals::splitBlockOnBinaryCROp(CRLogicalOpInfo &CRI) {
+  if (CRI.CopyDefs.first == CRI.CopyDefs.second) {
+    DEBUG(dbgs() << "Unable to split as the two operands are the same\n");
+    NumNotSplitIdenticalOperands++;
+    return false;
+  }
+  if (CRI.TrueDefs.first->isCopy() || CRI.TrueDefs.second->isCopy() ||
+      CRI.TrueDefs.first->isPHI() || CRI.TrueDefs.second->isPHI()) {
+    DEBUG(dbgs() << "Unable to split because one of the operands is a PHI or "
+                    "chain of copies.\n");
+    NumNotSplitChainCopies++;
+    return false;
+  }
+  // Note: keep in sync with computeBranchTargetAndInversion().
+  if (CRI.MI->getOpcode() != PPC::CROR &&
+      CRI.MI->getOpcode() != PPC::CRAND &&
+      CRI.MI->getOpcode() != PPC::CRNOR &&
+      CRI.MI->getOpcode() != PPC::CRNAND &&
+      CRI.MI->getOpcode() != PPC::CRORC &&
+      CRI.MI->getOpcode() != PPC::CRANDC) {
+    DEBUG(dbgs() << "Unable to split blocks on this opcode.\n");
+    NumNotSplitWrongOpcode++;
+    return false;
+  }
+  DEBUG(dbgs() << "Splitting the following CR op:\n"; CRI.dump());
+  MachineBasicBlock::iterator Def1It = CRI.TrueDefs.first;
+  MachineBasicBlock::iterator Def2It = CRI.TrueDefs.second;
+
+  bool UsingDef1 = false;
+  MachineInstr *SplitBefore = &*Def2It;
+  for (auto E = CRI.MI->getParent()->end(); Def2It != E; ++Def2It) {
+    if (Def1It == Def2It) { // Def2 comes before Def1.
+      SplitBefore = &*Def1It;
+      UsingDef1 = true;
+      break;
+    }
+  }
+
+  DEBUG(dbgs() << "We will split the following block:\n";);
+  DEBUG(CRI.MI->getParent()->dump());
+  DEBUG(dbgs() << "Before instruction:\n"; SplitBefore->dump());
+
+  // Get the branch instruction.
+  MachineInstr *Branch =
+    MRI->use_nodbg_begin(CRI.MI->getOperand(0).getReg())->getParent();
+
+  // We want the new block to have no code in it other than the definition
+  // of the input to the CR logical and the CR logical itself. So we move
+  // those to the bottom of the block (just before the branch). Then we
+  // will split before the CR logical.
+  MachineBasicBlock *MBB = SplitBefore->getParent();
+  auto FirstTerminator = MBB->getFirstTerminator();
+  MachineBasicBlock::iterator FirstInstrToMove =
+    UsingDef1 ? CRI.TrueDefs.first : CRI.TrueDefs.second;
+  MachineBasicBlock::iterator SecondInstrToMove =
+    UsingDef1 ? CRI.CopyDefs.first : CRI.CopyDefs.second;
+
+  // The instructions that need to be moved are not guaranteed to be
+  // contiguous. Move them individually.
+  // FIXME: If one of the operands is a chain of (single use) copies, they
+  // can all be moved and we can still split.
+  MBB->splice(FirstTerminator, MBB, FirstInstrToMove);
+  if (FirstInstrToMove != SecondInstrToMove)
+    MBB->splice(FirstTerminator, MBB, SecondInstrToMove);
+  MBB->splice(FirstTerminator, MBB, CRI.MI);
+
+  unsigned Opc = CRI.MI->getOpcode();
+  bool InvertOrigBranch, InvertNewBranch, TargetIsFallThrough;
+  computeBranchTargetAndInversion(Opc, Branch->getOpcode(), UsingDef1,
+                                  InvertNewBranch, InvertOrigBranch,
+                                  TargetIsFallThrough);
+  MachineInstr *SplitCond =
+    UsingDef1 ? CRI.CopyDefs.second : CRI.CopyDefs.first;
+  DEBUG(dbgs() << "We will " << (InvertNewBranch ? "invert" : "copy"));
+  DEBUG(dbgs() << " the original branch and the target is the " <<
+        (TargetIsFallThrough ? "fallthrough block\n" : "orig. target block\n"));
+  DEBUG(dbgs() << "Original branch instruction: "; Branch->dump());
+  BlockSplitInfo BSI { Branch, SplitBefore, SplitCond, InvertNewBranch,
+                       InvertOrigBranch, TargetIsFallThrough, MBPI, CRI.MI,
+                       UsingDef1 ? CRI.CopyDefs.first : CRI.CopyDefs.second };
+  bool Changed = splitMBB(BSI);
+  // If we've split on a CR logical that is fed by a CR logical,
+  // recompute the source CR logical as it may be usable for splitting.
+  if (Changed) {
+    bool Input1CRlogical =
+      CRI.TrueDefs.first && isCRLogical(*CRI.TrueDefs.first);
+    bool Input2CRlogical =
+      CRI.TrueDefs.second && isCRLogical(*CRI.TrueDefs.second);
+    if (Input1CRlogical)
+      AllCRLogicalOps.push_back(createCRLogicalOpInfo(*CRI.TrueDefs.first));
+    if (Input2CRlogical)
+      AllCRLogicalOps.push_back(createCRLogicalOpInfo(*CRI.TrueDefs.second));
+  }
+  return Changed;
+}
+
+void PPCReduceCRLogicals::collectCRLogicals() {
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB) {
+      if (isCRLogical(MI)) {
+        AllCRLogicalOps.push_back(createCRLogicalOpInfo(MI));
+        TotalCRLogicals++;
+        if (AllCRLogicalOps.back().IsNullary)
+          TotalNullaryCRLogicals++;
+        else if (AllCRLogicalOps.back().IsBinary)
+          TotalBinaryCRLogicals++;
+        else
+          TotalUnaryCRLogicals++;
+      }
+    }
+  }
+}
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(PPCReduceCRLogicals, DEBUG_TYPE,
+                      "PowerPC Reduce CR logical Operation", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(PPCReduceCRLogicals, DEBUG_TYPE,
+                    "PowerPC Reduce CR logical Operation", false, false)
+
+char PPCReduceCRLogicals::ID = 0;
+FunctionPass*
+llvm::createPPCReduceCRLogicalsPass() { return new PPCReduceCRLogicals(); }
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 9207165c46a6d..6b62a82ef7bf9 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -21,12 +21,15 @@
 #include "PPCTargetMachine.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -36,8 +39,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include <cstdlib>
@@ -49,6 +50,9 @@ using namespace llvm;
 #define GET_REGINFO_TARGET_DESC
 #include "PPCGenRegisterInfo.inc"
 
+STATISTIC(InflateGPRC, "Number of gprc inputs for getLargestLegalClass");
+STATISTIC(InflateGP8RC, "Number of g8rc inputs for getLargestLegalClass");
+
 static cl::opt<bool>
 EnableBasePointer("ppc-use-base-pointer", cl::Hidden, cl::init(true),
          cl::desc("Enable use of a base pointer for complex stack frames"));
 
@@ -57,6 +61,10 @@ static cl::opt<bool>
 AlwaysBasePointer("ppc-always-use-base-pointer",
                   cl::Hidden, cl::init(false),
          cl::desc("Force the use of a base pointer in every function"));
 
+static cl::opt<bool>
+EnableGPRToVecSpills("ppc-enable-gpr-to-vsr-spills", cl::Hidden, cl::init(false),
+             cl::desc("Enable spills from gpr to vsr rather than stack"));
+
 PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
   : PPCGenRegisterInfo(TM.isPPC64() ? PPC::LR8 : PPC::LR,
                        TM.isPPC64() ? 0 : 1,
@@ -82,6 +90,8 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
   // VSX
   ImmToIdxMap[PPC::DFLOADf32] = PPC::LXSSPX;
   ImmToIdxMap[PPC::DFLOADf64] = PPC::LXSDX;
+  ImmToIdxMap[PPC::SPILLTOVSR_LD] = PPC::SPILLTOVSR_LDX;
+  ImmToIdxMap[PPC::SPILLTOVSR_ST] = PPC::SPILLTOVSR_STX;
   ImmToIdxMap[PPC::DFSTOREf32] = PPC::STXSSPX;
   ImmToIdxMap[PPC::DFSTOREf64] = PPC::STXSDX;
   ImmToIdxMap[PPC::LXV] = PPC::LXVX;
@@ -113,7 +123,7 @@ PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
 const MCPhysReg*
 PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
-  if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) {
+  if (MF->getFunction().getCallingConv() == CallingConv::AnyReg) {
     if (Subtarget.hasVSX())
       return CSR_64_AllRegs_VSX_SaveList;
     if (Subtarget.hasAltivec())
@@ -151,7 +161,7 @@ PPCRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
     return nullptr;
   if (!TM.isPPC64())
     return nullptr;
-  if (MF->getFunction()->getCallingConv() != CallingConv::CXX_FAST_TLS)
+  if (MF->getFunction().getCallingConv() != CallingConv::CXX_FAST_TLS)
     return nullptr;
   if (!MF->getInfo<PPCFunctionInfo>()->isSplitCSR())
     return nullptr;
@@ -328,6 +338,18 @@ PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
   // With VSX, we can inflate various sub-register classes to the full VSX
   // register set.
 
+  // For Power9 we allow the user to enable GPR to vector spills.
+  // FIXME: Currently limited to spilling GP8RC. A follow on patch will add
+  // support to spill GPRC.
+  if (TM.isELFv2ABI()) {
+    if (Subtarget.hasP9Vector() && EnableGPRToVecSpills &&
+        RC == &PPC::G8RCRegClass) {
+      InflateGP8RC++;
+      return &PPC::SPILLTOVSRRCRegClass;
+    }
+    if (RC == &PPC::GPRCRegClass && EnableGPRToVecSpills)
+      InflateGPRC++;
+  }
   if (RC == &PPC::F8RCRegClass)
     return &PPC::VSFRCRegClass;
   else if (RC == &PPC::VRRCRegClass)
@@ -879,7 +901,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // Naked functions have stack size 0, although getStackSize may not reflect
   // that because we didn't call all the pieces that compute it for naked
   // functions.
-  if (!MF.getFunction()->hasFnAttribute(Attribute::Naked)) {
+  if (!MF.getFunction().hasFnAttribute(Attribute::Naked)) {
     if (!(hasBasePointer(MF) && FrameIndex < 0))
       Offset += MFI.getStackSize();
   }
@@ -911,11 +933,16 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     SReg = MF.getRegInfo().createVirtualRegister(RC);
 
   // Insert a set of rA with the full offset value before the ld, st, or add
-  BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SRegHi)
-    .addImm(Offset >> 16);
-  BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::ORI8 : PPC::ORI), SReg)
-    .addReg(SRegHi, RegState::Kill)
-    .addImm(Offset);
+  if (isInt<16>(Offset))
+    BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LI8 : PPC::LI), SReg)
+      .addImm(Offset);
+  else {
+    BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SRegHi)
+      .addImm(Offset >> 16);
+    BuildMI(MBB, II, dl, TII.get(is64Bit ?
PPC::ORI8 : PPC::ORI), SReg) + .addReg(SRegHi, RegState::Kill) + .addImm(Offset); + } // Convert into indexed form of the instruction: // diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td index 896cec7e4f6e8..f7807907bd640 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/lib/Target/PowerPC/PPCRegisterInfo.td @@ -305,6 +305,11 @@ def VFRC : RegisterClass<"PPC", [f64], 64, VF22, VF21, VF20)>; def VSFRC : RegisterClass<"PPC", [f64], 64, (add F8RC, VFRC)>; +// Allow spilling GPR's into caller-saved VSR's. +def SPILLTOVSRRC : RegisterClass<"PPC", [i64, f64], 64, (add G8RC, (sub VSFRC, + (sequence "VF%u", 31, 20), + (sequence "F%u", 31, 14)))>; + // Register class for single precision scalars in VSX registers def VSSRC : RegisterClass<"PPC", [f32], 32, (add VSFRC)>; diff --git a/lib/Target/PowerPC/PPCScheduleP9.td b/lib/Target/PowerPC/PPCScheduleP9.td index a01995a629c29..b24f4fc603a15 100644 --- a/lib/Target/PowerPC/PPCScheduleP9.td +++ b/lib/Target/PowerPC/PPCScheduleP9.td @@ -22,7 +22,9 @@ def P9Model : SchedMachineModel { // Try to make sure we have at least 10 dispatch groups in a loop. let LoopMicroOpBufferSize = 60; - let CompleteModel = 0; + let CompleteModel = 1; + + let UnsupportedFeatures = [HasQPX]; } @@ -68,6 +70,10 @@ let SchedModel = P9Model in { def LS : ProcResource<4>; def PM : ProcResource<2>; def DFU : ProcResource<1>; + def BR : ProcResource<1> { + let BufferSize = 16; + } + def CY : ProcResource<1>; def TestGroup : ProcResGroup<[ALU, DP]>; @@ -145,6 +151,10 @@ let SchedModel = P9Model in { let Latency = 6; } + def P9_DIV_12C : SchedWriteRes<[DIV]> { + let Latency = 12; + } + def P9_DIV_16C_8 : SchedWriteRes<[DIV]> { let ResourceCycles = [8]; let Latency = 16; @@ -190,6 +200,16 @@ let SchedModel = P9Model in { let Latency = 24; } + def P9_DPO_24C_8 : SchedWriteRes<[DPO]> { + let ResourceCycles = [8]; + let Latency = 24; + } + + def P9_DPE_24C_8 : SchedWriteRes<[DPE]> { + let ResourceCycles = [8]; + let Latency = 24; + } + def P9_DP_26C_5 : SchedWriteRes<[DP]> { let ResourceCycles = [5]; let Latency = 22; @@ -205,6 +225,16 @@ let SchedModel = P9Model in { let Latency = 33; } + def P9_DPE_33C_8 : SchedWriteRes<[DPE]> { + let ResourceCycles = [8]; + let Latency = 33; + } + + def P9_DPO_33C_8 : SchedWriteRes<[DPO]> { + let ResourceCycles = [8]; + let Latency = 33; + } + def P9_DP_36C_10 : SchedWriteRes<[DP]> { let ResourceCycles = [10]; let Latency = 36; @@ -248,31 +278,61 @@ let SchedModel = P9Model in { let Latency = 76; let ResourceCycles = [62]; } + + def P9_BR_2C : SchedWriteRes<[BR]> { + let Latency = 2; + } + + def P9_BR_5C : SchedWriteRes<[BR]> { + let Latency = 5; + } + + def P9_CY_6C : SchedWriteRes<[CY]> { + let Latency = 6; + } + // ***************** WriteSeq Definitions ***************** def P9_LoadAndALUOp_6C : WriteSequence<[P9_LS_4C, P9_ALU_2C]>; def P9_LoadAndALUOp_7C : WriteSequence<[P9_LS_5C, P9_ALU_2C]>; def P9_LoadAndPMOp_8C : WriteSequence<[P9_LS_5C, P9_PM_3C]>; + def P9_LoadAndLoadOp_8C : WriteSequence<[P9_LS_4C, P9_LS_4C]>; def P9_IntDivAndALUOp_26C_8 : WriteSequence<[P9_DIV_24C_8, P9_ALU_2C]>; def P9_IntDivAndALUOp_42C_8 : WriteSequence<[P9_DIV_40C_8, P9_ALU_2C]>; def P9_StoreAndALUOp_4C : WriteSequence<[P9_LS_1C, P9_ALU_3C]>; def P9_ALUOpAndALUOp_4C : WriteSequence<[P9_ALU_2C, P9_ALU_2C]>; + def P9_DPOpAndALUOp_9C : WriteSequence<[P9_DP_7C, P9_ALU_2C]>; + def P9_DPOpAndALUOp_24C_5 : WriteSequence<[P9_DP_22C_5, P9_ALU_2C]>; + def P9_DPOpAndALUOp_35C_8 : WriteSequence<[P9_DP_33C_8, P9_ALU_2C]>; // 
***************** Defining Itinerary Class Resources ***************** + // The following itineraries are fully covered by the InstRW definitions in + // P9InstrResources.td so aren't listed here. + // IIC_FPDivD, IIC_FPDivS, IIC_FPFused, IIC_IntDivD, IIC_LdStLFDU, + // IIC_LdStLFDUX + def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], - [IIC_IntSimple, IIC_IntGeneral]>; + [IIC_IntSimple, IIC_IntGeneral, IIC_IntRFID, + IIC_IntRotateD, IIC_IntRotateDI, IIC_IntTrapD, + IIC_SprRFI]>; + + def : ItinRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C], + [IIC_IntTrapW]>; def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], [IIC_IntISEL, IIC_IntRotate, IIC_IntShift]>; def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntCompare]>; + def : ItinRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, + DISP_1C, DISP_1C], [IIC_VecGeneral, IIC_FPCompare]>; + def : ItinRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], - [IIC_IntMulHW, IIC_IntMulHWU, IIC_IntMulLI]>; + [IIC_IntMulHW, IIC_IntMulHWU, IIC_IntMulLI, IIC_IntMulHD]>; def : ItinRW<[P9_LS_5C, IP_EXEC_1C, DISP_1C, DISP_1C], - [IIC_LdStLoad, IIC_LdStLD]>; + [IIC_LdStLoad, IIC_LdStLD, IIC_LdStLFD]>; def : ItinRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], @@ -300,12 +360,18 @@ let SchedModel = P9Model in { def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_LdStLWARX, IIC_LdStLDARX, IIC_LdStLMW]>; + def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C], + [IIC_LdStCOPY, IIC_SprABORT, IIC_LdStPASTE, IIC_LdStDCBF, + IIC_LdStICBI, IIC_LdStSync, IIC_SprISYNC, IIC_SprMSGSYNC, + IIC_SprSLBIA, IIC_SprSLBSYNC, IIC_SprTLBSYNC]>; + def : ItinRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], [IIC_LdStSTFD, IIC_LdStSTD, IIC_LdStStore]>; def : ItinRW<[P9_LS_1C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], - [IIC_LdStSTDU, IIC_LdStSTDUX]>; + [IIC_LdStSTDU, IIC_LdStSTDUX, IIC_LdStStoreUpd, IIC_SprSLBIEG, + IIC_SprTLBIA, IIC_SprTLBIE]>; def : ItinRW<[P9_StoreAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], @@ -315,20 +381,44 @@ let SchedModel = P9Model in { [IIC_BrCR, IIC_IntMTFSB0]>; def : ItinRW<[P9_ALUOpAndALUOp_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, - IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, - DISP_1C, DISP_1C, DISP_1C], [IIC_SprMFCR, IIC_SprMFCRF]>; + IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, + DISP_1C, DISP_1C, DISP_1C, DISP_1C], + [IIC_SprMFCR, IIC_SprMFCRF, IIC_BrMCR, IIC_BrMCRX, IIC_IntMFFS]>; + + def : ItinRW<[P9_BR_2C, DISP_1C], [IIC_BrB]>; + def : ItinRW<[P9_BR_5C, DISP_1C], [IIC_SprMFSPR]>; // This class should be broken down to instruction level, once some missing // info is obtained. def : ItinRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], [IIC_SprMTSPR]>; - def : ItinRW<[P9_DP_7C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C], [IIC_FPGeneral, IIC_FPAddSub]>; + def : ItinRW<[P9_LoadAndLoadOp_8C, IP_EXEC_1C, DISP_1C, DISP_1C], + [IIC_SprSLBIE, IIC_SprSLBMFEE, IIC_SprSLBMFEV, IIC_SprSLBMTE, + IIC_SprTLBIEL]>; + + // IIC_VecFP is added here although many instructions with that itinerary + // use very different resources. It would appear that instructions were + // given that itinerary rather carelessly over time. Specific instructions + // that use different resources are listed in various InstrRW classes. 
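  // For example, an instruction tagged IIC_VecFP that actually executes in
  // the permute pipeline would get an InstRW entry in P9InstrResources.td;
  // InstRW entries take precedence over the ItinRW mappings in this file for
  // the instructions they list.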
+  def : ItinRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+               [IIC_FPGeneral, IIC_FPAddSub, IIC_VecFP]>;
+
+  def : ItinRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C,
+                DISP_1C, DISP_1C], [IIC_VecFPCompare]>;
+
+  def : ItinRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
+               [IIC_VecPerm]>;
 
   def : ItinRW<[P9_DP_36C_10, IP_EXEC_1C], [IIC_FPSqrtD]>;
   def : ItinRW<[P9_DP_26C_5, P9_DP_26C_5, IP_EXEC_1C, IP_EXEC_1C],
                [IIC_FPSqrtS]>;
 
+  def : ItinRW<[P9_DIV_12C, IP_EXECE_1C, DISP_1C, DISP_1C],
+               [IIC_SprMFMSR, IIC_SprMFPMR, IIC_SprMFSR, IIC_SprMFTB,
+                IIC_SprMTMSR, IIC_SprMTMSRD, IIC_SprMTPMR, IIC_SprMTSR]>;
+
+  def : ItinRW<[], [IIC_SprSTOP]>;
+
   include "P9InstrResources.td"
 }
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 90d11f46a384d..c351b5c04a056 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -19,9 +19,9 @@
 #include "PPCInstrInfo.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCInstrItineraries.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include <string>
 
 #define GET_SUBTARGETINFO_HEADER
diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 5f8085f4626e2..49f2699ab082e 100644
--- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -25,7 +25,7 @@
 #include "PPCInstrBuilder.h"
 #include "PPCInstrInfo.h"
 #include "PPCTargetMachine.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/Support/Debug.h"
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index fe092cc3b858d..491f25ca2c64a 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFile.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DataLayout.h"
@@ -31,7 +32,6 @@
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/Scalar.h"
 #include <cassert>
 
@@ -40,6 +40,10 @@ using namespace llvm;
 
+static cl::opt<bool>
+    EnableBranchCoalescing("enable-ppc-branch-coalesce", cl::Hidden,
+        cl::desc("enable coalescing of duplicate branches for PPC"));
 static cl::
 opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
                           cl::desc("Disable CTR loops for PPC"));
@@ -84,6 +88,10 @@ EnableMachineCombinerPass("ppc-machine-combiner",
                           cl::desc("Enable the machine combiner pass"),
                           cl::init(true), cl::Hidden);
 
+static cl::opt<bool>
+    ReduceCRLogical("ppc-reduce-cr-logicals",
+        cl::desc("Expand eligible cr-logical binary ops to branches"),
+        cl::init(false), cl::Hidden);
 extern "C" void LLVMInitializePowerPCTarget() {
   // Register the targets
   RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target());
@@ -93,7 +101,9 @@ extern "C" void LLVMInitializePowerPCTarget() {
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializePPCBoolRetToIntPass(PR);
   initializePPCExpandISELPass(PR);
+  initializePPCPreEmitPeepholePass(PR);
   initializePPCTLSDynamicCallPass(PR);
+  initializePPCMIPeepholePass(PR);
 }
 
 /// Return the datalayout string of a subtarget.
@@ -208,6 +218,17 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT,
   return Reloc::Static;
 }
 
+static CodeModel::Model getEffectiveCodeModel(const Triple &TT,
+                                              Optional<CodeModel::Model> CM,
+                                              bool JIT) {
+  if (CM)
+    return *CM;
+  if (!TT.isOSDarwin() && !JIT &&
+      (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le))
+    return CodeModel::Medium;
+  return CodeModel::Small;
+}
+
 // The FeatureString here is a little subtle. We are modifying the feature
 // string with what are (currently) non-function specific overrides as it goes
 // into the LLVMTargetMachine constructor and then using the stored value in the
@@ -216,10 +237,12 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
                                    StringRef CPU, StringRef FS,
                                    const TargetOptions &Options,
                                    Optional<Reloc::Model> RM,
-                                   CodeModel::Model CM, CodeGenOpt::Level OL)
+                                   Optional<CodeModel::Model> CM,
+                                   CodeGenOpt::Level OL, bool JIT)
     : LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU,
                         computeFSAdditions(FS, OL, TT), Options,
-                        getEffectiveRelocModel(TT, RM), CM, OL),
+                        getEffectiveRelocModel(TT, RM),
+                        getEffectiveCodeModel(TT, CM, JIT), OL),
       TLOF(createTLOF(getTargetTriple())),
       TargetABI(computeTargetABI(TT, Options)) {
   initAsmInfo();
@@ -365,12 +388,19 @@ bool PPCPassConfig::addInstSelector() {
 }
 
 void PPCPassConfig::addMachineSSAOptimization() {
+  // PPCBranchCoalescingPass needs to be done before machine sinking
+  // since it merges empty blocks.
+  if (EnableBranchCoalescing && getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCBranchCoalescingPass());
   TargetPassConfig::addMachineSSAOptimization();
   // For little endian, remove where possible the vector swap instructions
   // introduced at code generation to normalize vector element order.
   if (TM->getTargetTriple().getArch() == Triple::ppc64le &&
       !DisableVSXSwapRemoval)
     addPass(createPPCVSXSwapRemovalPass());
+  // Reduce the number of cr-logical ops.
+  if (ReduceCRLogical && getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCReduceCRLogicalsPass());
   // Target-specific peephole cleanups performed after instruction
   // selection.
   if (!DisableMIPeephole) {
@@ -412,6 +442,7 @@ void PPCPassConfig::addPreSched2() {
 }
 
 void PPCPassConfig::addPreEmitPass() {
+  addPass(createPPCPreEmitPeepholePass());
   addPass(createPPCExpandISELPass());
 
   if (getOptLevel() != CodeGenOpt::None)
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index be705507b5347..102bf7ca59c26 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -35,14 +35,15 @@ private:
 public:
   PPCTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                    StringRef FS, const TargetOptions &Options,
-                   Optional<Reloc::Model> RM, CodeModel::Model CM,
-                   CodeGenOpt::Level OL);
+                   Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                   CodeGenOpt::Level OL, bool JIT);
 
   ~PPCTargetMachine() override;
 
   const PPCSubtarget *getSubtargetImpl(const Function &F) const override;
-  // The no argument getSubtargetImpl, while it exists on some targets, is
-  // deprecated and should not be used.
+  // DO NOT IMPLEMENT: There is no such thing as a valid default subtarget,
+  // subtargets are per-function entities based on the target-specific
+  // attributes of each function.
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index be705507b5347..102bf7ca59c26 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -35,14 +35,15 @@ private:
 public:
   PPCTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                    StringRef FS, const TargetOptions &Options,
-                   Optional<Reloc::Model> RM, CodeModel::Model CM,
-                   CodeGenOpt::Level OL);
+                   Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                   CodeGenOpt::Level OL, bool JIT);
 
   ~PPCTargetMachine() override;
 
   const PPCSubtarget *getSubtargetImpl(const Function &F) const override;
-  // The no argument getSubtargetImpl, while it exists on some targets, is
-  // deprecated and should not be used.
+  // DO NOT IMPLEMENT: There is no such thing as a valid default subtarget,
+  // subtargets are per-function entities based on the target-specific
+  // attributes of each function.
   const PPCSubtarget *getSubtargetImpl() const = delete;
 
   // Pass Pipeline Configuration
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.h b/lib/Target/PowerPC/PPCTargetObjectFile.h
index c8b9b2e9790b6..8343a90696d92 100644
--- a/lib/Target/PowerPC/PPCTargetObjectFile.h
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.h
@@ -10,8 +10,8 @@
 #ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
 #define LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
 
+#include "llvm/CodeGen/TargetLoweringObjectFile.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 6110706b01b90..aa4073f7ea025 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -10,10 +10,10 @@
 #include "PPCTargetTransformInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/CostTable.h"
+#include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/CostTable.h"
-#include "llvm/Target/TargetLowering.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "ppctti"
@@ -189,6 +189,17 @@ int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
   return PPCTTIImpl::getIntImmCost(Imm, Ty);
 }
 
+unsigned PPCTTIImpl::getUserCost(const User *U,
+                                 ArrayRef<const Value *> Operands) {
+  if (U->getType()->isVectorTy()) {
+    // Instructions that need to be split should cost more.
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, U->getType());
+    return LT.first * BaseT::getUserCost(U, Operands);
+  }
+
+  return BaseT::getUserCost(U, Operands);
+}
+
 void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   if (ST->getDarwinDirective() == PPC::DIR_A2) {
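Note on the getUserCost change above: LT.first, returned by getTypeLegalizationCost, is the number of legal registers the value's type splits into, so the reported user cost scales linearly with the split factor. A minimal sketch of just that arithmetic, with invented names (this is not LLVM API):

    // NumParts plays the role of LT.first; BaseCost is what
    // BaseT::getUserCost(U, Operands) would have returned.
    unsigned scaledVectorUserCost(unsigned BaseCost, unsigned NumParts) {
      // e.g. a <8 x i64> value on 128-bit VSX registers legalizes into
      // four parts, so the reported cost becomes 4 * BaseCost.
      return NumParts * BaseCost;
    }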
@@ -215,9 +226,17 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
   return LoopHasReductions;
 }
 
-bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
-  MaxLoadSize = 8;
-  return true;
+const PPCTTIImpl::TTI::MemCmpExpansionOptions *
+PPCTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
+  static const auto Options = []() {
+    TTI::MemCmpExpansionOptions Options;
+    Options.LoadSizes.push_back(8);
+    Options.LoadSizes.push_back(4);
+    Options.LoadSizes.push_back(2);
+    Options.LoadSizes.push_back(1);
+    return Options;
+  }();
+  return &Options;
 }
 
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 99ca6394d1bed..b42dae4a0254c 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -21,7 +21,7 @@
 #include "PPCTargetMachine.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
-#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/TargetLowering.h"
 
 namespace llvm {
 
@@ -51,6 +51,8 @@ public:
   int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                     Type *Ty);
 
+  unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
+
   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
@@ -61,7 +63,8 @@ public:
   /// @{
 
   bool enableAggressiveInterleaving(bool LoopHasReductions);
-  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
+  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
+      bool IsZeroCmp) const;
   bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector) const;
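Note on enableMemCmpExpansion above: the old expandMemCmp hook exposed a single MaxLoadSize of 8, while the new interface advertises the ordered load widths 8, 4, 2 and 1, letting the generic expansion cover odd sizes without a libcall. A self-contained sketch of greedily covering a length with those widths (our illustration, not the actual LLVM expansion code):

    #include <cstdio>

    int main() {
      const unsigned LoadSizes[] = {8, 4, 2, 1}; // mirrors Options.LoadSizes above
      unsigned N = 15;                           // bytes to memcmp
      for (unsigned Size : LoadSizes)
        while (N >= Size) {
          std::printf("compare a %u-byte chunk\n", Size);
          N -= Size;
        }
    }

For N = 15 this emits one 8-, 4-, 2- and 1-byte comparison each, which is the kind of load sequence the expansion can now form.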
diff --git a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index a57484e5abdf7..f15af790de8f5 100644
--- a/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -20,7 +20,7 @@
 #include "PPCTargetMachine.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -90,21 +90,21 @@ protected:
       // This pass is run after register coalescing, and so we're looking for
       // a situation like this:
       //   ...
-      //   %vreg5 = COPY %vreg9; VSLRC:%vreg5,%vreg9
-      //   %vreg5 = XSMADDADP %vreg5, %vreg17, %vreg16,
-      //                      %RM; VSLRC:%vreg5,%vreg17,%vreg16
+      //   %5 = COPY %9; VSLRC:%5,%9
+      //   %5 = XSMADDADP %5, %17, %16,
+      //                  implicit %rm; VSLRC:%5,%17,%16
       //   ...
-      //   %vreg9 = XSMADDADP %vreg9, %vreg17, %vreg19,
-      //                      %RM; VSLRC:%vreg9,%vreg17,%vreg19
+      //   %9 = XSMADDADP %9, %17, %19,
+      //                  implicit %rm; VSLRC:%9,%17,%19
       //   ...
       // Where we can eliminate the copy by changing from the A-type to the
       // M-type instruction. Specifically, for this example, this means:
-      //   %vreg5 = XSMADDADP %vreg5, %vreg17, %vreg16,
-      //                      %RM; VSLRC:%vreg5,%vreg17,%vreg16
-      // is replaced by:
-      //   %vreg16 = XSMADDMDP %vreg16, %vreg18, %vreg9,
-      //                       %RM; VSLRC:%vreg16,%vreg18,%vreg9
-      // and we remove: %vreg5 = COPY %vreg9; VSLRC:%vreg5,%vreg9
+      //   %5 = XSMADDADP %5, %17, %16,
+      //                  implicit %rm; VSLRC:%5,%17,%16
+      // is replaced by:
+      //   %16 = XSMADDMDP %16, %18, %9,
+      //                   implicit %rm; VSLRC:%16,%18,%9
+      // and we remove: %5 = COPY %9; VSLRC:%5,%9
 
       SlotIndex FMAIdx = LIS->getInstructionIndex(MI);
@@ -150,13 +150,13 @@
       // walking the MIs we may as well test liveness here.
       //
       // FIXME: There is a case that occurs in practice, like this:
-      //   %vreg9 = COPY %F1; VSSRC:%vreg9
-      //   ...
-      //   %vreg6 = COPY %vreg9; VSSRC:%vreg6,%vreg9
-      //   %vreg7 = COPY %vreg9; VSSRC:%vreg7,%vreg9
-      //   %vreg9 = XSMADDASP %vreg9, %vreg1, %vreg4; VSSRC:
-      //   %vreg6 = XSMADDASP %vreg6, %vreg1, %vreg2; VSSRC:
-      //   %vreg7 = XSMADDASP %vreg7, %vreg1, %vreg3; VSSRC:
+      //   %9 = COPY %f1; VSSRC:%9
+      //   ...
+      //   %6 = COPY %9; VSSRC:%6,%9
+      //   %7 = COPY %9; VSSRC:%7,%9
+      //   %9 = XSMADDASP %9, %1, %4; VSSRC:
+      //   %6 = XSMADDASP %6, %1, %2; VSSRC:
+      //   %7 = XSMADDASP %7, %1, %3; VSSRC:
       // which prevents an otherwise-profitable transformation.
       bool OtherUsers = false, KillsAddendSrc = false;
       for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
@@ -177,11 +177,11 @@
 
       // The transformation doesn't work well with things like:
-      //    %vreg5 = A-form-op %vreg5, %vreg11, %vreg5;
-      // unless vreg11 is also a kill, so skip when it is not,
+      //    %5 = A-form-op %5, %11, %5;
+      // unless %11 is also a kill, so skip when it is not,
       // and check operand 3 to see it is also a kill to handle the case:
-      //    %vreg5 = A-form-op %vreg5, %vreg5, %vreg11;
-      // where vreg5 and vreg11 are both kills. This case would be skipped
+      //    %5 = A-form-op %5, %5, %11;
+      // where %5 and %11 are both kills. This case would be skipped
       // otherwise.
 
       unsigned OldFMAReg = MI.getOperand(0).getReg();
@@ -343,7 +343,7 @@ protected:
 
 public:
   bool runOnMachineFunction(MachineFunction &MF) override {
-    if (skipFunction(*MF.getFunction()))
+    if (skipFunction(MF.getFunction()))
       return false;
 
     // If we don't have VSX then go ahead and return without doing
diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index 7d34efd4af3e0..8a5fb9fdaef11 100644
--- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -191,7 +191,7 @@ private:
 public:
   // Main entry point for this pass.
   bool runOnMachineFunction(MachineFunction &MF) override {
-    if (skipFunction(*MF.getFunction()))
+    if (skipFunction(MF.getFunction()))
       return false;
 
     // If we don't have VSX on the subtarget, don't do anything.
@@ -353,6 +353,8 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
       break;
     case PPC::LXSDX:
     case PPC::LXSSPX:
+    case PPC::XFLOADf64:
+    case PPC::XFLOADf32:
       // A load of a floating-point value into the high-order half of
       // a vector register is safe, provided that we introduce a swap
       // following the load, which will be done by the SUBREG_TO_REG
@@ -964,7 +966,7 @@ LLVM_DUMP_METHOD void PPCVSXSwapRemoval::dumpSwapVector() {
 
     dbgs() << format("%6d", ID);
     dbgs() << format("%6d", EC->getLeaderValue(ID));
-    dbgs() << format(" BB#%3d", MI->getParent()->getNumber());
+    dbgs() << format(" %bb.%3d", MI->getParent()->getNumber());
     dbgs() << format("  %14s  ", TII->getName(MI->getOpcode()).str().c_str());
 
     if (SwapVector[EntryIdx].IsLoad)
diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt
index bc09d5f8a7e8e..b4bf635dc2c75 100644
--- a/lib/Target/PowerPC/README.txt
+++ b/lib/Target/PowerPC/README.txt
@@ -256,7 +256,7 @@ _clamp0g:
         cmpwi cr0, r3, 0
         li r2, 0
         blt cr0, LBB1_2
-; BB#1:                                 ; %entry
+; %bb.1:                                ; %entry
         mr r2, r3
 LBB1_2:                                 ; %entry
         mr r3, r2
diff --git a/lib/Target/PowerPC/README_ALTIVEC.txt b/lib/Target/PowerPC/README_ALTIVEC.txt
index f70ebd82bd5c9..c38e019231611 100644
--- a/lib/Target/PowerPC/README_ALTIVEC.txt
+++ b/lib/Target/PowerPC/README_ALTIVEC.txt
@@ -233,7 +233,7 @@ declare <16 x i8> @llvm.ppc.altivec.crypto.vpmsumb(<16 x i8>, <16 x i8>) #1
 
 Produces the following code with -mtriple=powerpc64-unknown-linux-gnu:
 
-# BB#0:                                 # %entry
+# %bb.0:                                # %entry
         addis 3, 2, .LCPI0_0@toc@ha
         addis 4, 2, .LCPI0_1@toc@ha
         addi 3, 3, .LCPI0_0@toc@l
diff --git a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
index a637dd11f8105..979595264472f 100644
--- a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
+++ b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -27,11 +27,11 @@ Target &llvm::getThePPC64LETarget() {
 
 extern "C" void LLVMInitializePowerPCTargetInfo() {
   RegisterTarget<Triple::ppc, /*HasJIT=*/true> X(getThePPC32Target(), "ppc32",
-                                                 "PowerPC 32");
+                                                 "PowerPC 32", "PPC");
   RegisterTarget<Triple::ppc64, /*HasJIT=*/true> Y(getThePPC64Target(), "ppc64",
-                                                   "PowerPC 64");
+                                                   "PowerPC 64", "PPC");
   RegisterTarget<Triple::ppc64le, /*HasJIT=*/true> Z(
-      getThePPC64LETarget(), "ppc64le", "PowerPC 64 LE");
+      getThePPC64LETarget(), "ppc64le", "PowerPC 64 LE", "PPC");
 }
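Note: the extra string argument threaded through RegisterTarget above is a backend name ("PPC" for all three PowerPC entries). A hypothetical out-of-tree target would register the same way; every Foo name below is invented for illustration:

    // Sketch only: mirrors the new four-argument registration used above.
    extern "C" void LLVMInitializeFooTargetInfo() {
      RegisterTarget<Triple::UnknownArch> X(getTheFooTarget(), "foo",
                                            "Foo toy target", "Foo");
    }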
diff --git a/lib/Target/PowerPC/p9-instrs.txt b/lib/Target/PowerPC/p9-instrs.txt
deleted file mode 100644
index a70582aca3989..0000000000000
--- a/lib/Target/PowerPC/p9-instrs.txt
+++ /dev/null
@@ -1,442 +0,0 @@
-Content:
-========
-. Remaining Instructions (Total 56 Instructions, include 2 unknow instructions)
-. Done (Total 155 Instructions: 101 VSX, 54 Altivec)
-
-//------------------------------------------------------------------------------
-//. Remaining Instructions
-//------------------------------------------------------------------------------
-GCC reference: https://sourceware.org/ml/binutils/2015-11/msg00071.html
-
-// Add PC Immediate Shifted DX-form p69
-[PO RT d1 d0 XO d2] addpcis RT,D
- subpcis Rx,value = addpcis Rx,-value
-
-// 6.17.2 Decimal Integer Format Conversion Instructions
-
-// Decimal Convert From National VX-form p352
-[PO VRT EO VRB 1 PS XO] bcdcfn. VRT,VRB,PS
-
-// Decimal Convert From Zoned VX-form p353
-[PO VRT EO VRB 1 PS XO] bcdcfz. VRT,VRB,PS
-
-// Decimal Convert To National VX-form p354
-[PO VRT EO VRB 1 / XO] bcdctn. VRT,VRB
-
-// Decimal Convert To Zoned VX-form p355
-[PO VRT EO VRB 1 PS XO] bcdctz. VRT,VRB,PS
-
-// Decimal Convert From Signed Quadword VX-form p356
-[PO VRT EO VRB 1 PS XO] bcdcfsq. VRT,VRB,PS
-
-// Decimal Convert To Signed Quadword VX-form p356
-[PO VRT EO VRB 1 / XO] bcdctsq. VRT,VRB
-
-// 6.17.3 Decimal Integer Sign Manipulation Instructions
-
-// Decimal Copy Sign VX-form p358
-[PO VRT VRA VRB XO] bcdcpsgn. VRT,VRA,VRB
-
-// Decimal Set Sign VX-form p358
-[PO VRT EO VRB 1 PS XO] bcdsetsgn. VRT,VRB,PS
-
-// Decimal Shift VX-form p359
-[PO VRT VRA VRB 1 PS XO] bcds. VRT,VRA,VRB,PS
-
-// Decimal Unsigned Shift VX-form p360
-[PO VRT VRA VRB 1 / XO] bcdus. VRT,VRA,VRB
-
-// Decimal Shift and Round VX-form p361
-[PO VRT VRA VRB 1 PS XO] bcdsr. VRT,VRA,VRB,PS
-
-// 6.17.5 Decimal Integer Truncate Instructions
-
-// Decimal Truncate VX-form p362
-[PO VRT VRA VRB 1 PS XO] bcdtrunc. VRT,VRA,VRB,PS
-
-// Decimal Unsigned Truncate VX-form p363
-[PO VRT VRA VRB 1 / XO] bcdutrunc. VRT,VRA,VRB
-
-// 3.3.10.1 Character-Type Compare Instructions
-
-// Compare Ranged Byte X-form p87
-[PO BF / L RA RB XO /] cmprb BF,L,RA,RB
-
-// Compare Equal Byte X-form p88
-[PO BF // RA RB XO /] cmpeqb BF,RA,RB
-
-// 3.3.13 Fixed-Point Logical Instructions
-
-// Count Trailing Zeros Word X-form p95
-[PO RS RA /// XO Rc] cnttzw(.) RA,RS
-
-// 3.3.13.1 64-bit Fixed-Point Logical Instructions
-
-// Count Trailing Zeros Doubleword X-form p98
-[PO RS RA /// XO Rc] cnttzd(.) RA,RS
-
-// 4.4 Copy-Paste Facility
-
-// Copy X-form p858
-[PO /// L RA RB XO /] copy RA,RB,L
- copy_first = copy RA, RB, 1
-// CP_Abort p860
-[PO /// /// /// XO /] cp_abort
-
-// Paste p859
-[PO /// L RA RB XO Rc] paste(.) RA,RB,L
- paste_last = paste RA,RB,1
-
-// 3.3.9 Fixed-Point Arithmetic Instructions
-
-// Deliver A Random Number X-form p79
-[PO RT /// L /// XO /] darn RT,L
-
-// Multiply-Add High Doubleword VA-form p81
-[PO RT RA RB RC XO] maddhd RT,RA.RB,RC
-
-// Multiply-Add High Doubleword Unsigned VA-form p81
-[PO RT RA RB RC XO] maddhdu RT,RA.RB,RC
-
-// Multiply-Add Low Doubleword VA-form p81
-[PO RT RA RB RC XO] maddld RT,RA.RB,RC
-
-// Modulo Signed Word X-form p76
-[PO RT RA RB XO /] modsw RT,RA,RB
-
-// Modulo Unsigned Word X-form p76
-[PO RT RA RB XO /] moduw RT,RA,RB
-
-// Modulo Signed Doubleword X-form p84
-[PO RT RA RB XO /] modsd RT,RA,RB
-
-// Modulo Unsigned Doubleword X-form p84
-[PO RT RA RB XO /] modud RT,RA,RB
-
-
-// DFP Test Significance Immediate [Quad] X-form p204
-[PO BF / UIM FRB XO /] dtstsfi BF,UIM,FRB
-[PO BF / UIM FRBp XO /] dtstsfiq BF,UIM,FRBp
-
-// 3.3.14.2.1 64-bit Fixed-Point Shift Instructions
-
-// Extend-Sign Word and Shift Left Immediate XS-form p109
-[PO RS RA sh XO sh Rc] extswsli(.) RA,RS,SH
-
-// 4.5.1 Load Atomic
-
-// Load Word Atomic X-form p864
-[PO RT RA FC XO /] lwat RT,RA,FC
-
-// Load Doubleword Atomic X-form p864
-[PO RT RA FC XO /] ldat RT,RA,FC
-
-// 4.5.2 Store Atomic
-
-// Store Word Atomic X-form p866
-[PO RS RA FC XO /] stwat RS,RA,FC
-
-// Store Doubleword Atomic X-form p866
-[PO RS RA FC XO /] stdat RS,RA,FC
-
-// 3.3.2.1 64-bit Fixed-Point Load Instructions
-
-// Load Doubleword Monitored Indexed X-form p54
-[PO RT RA RB XO /] ldmx RT,RA,RB
-
-// 3.3.16 Move To/From Vector-Scalar Register Instructions
-
-// Move From VSR Lower Doubleword XX1-form p111
-[PO S RA /// XO SX] mfvsrld RA,XS
-
-// Move To VSR Double Doubleword XX1-form p114
-[PO T RA RB XO TX] mtvsrdd XT,RA,RB
-
-// Move To VSR Word & Splat XX1-form p115
-[PO T RA /// XO TX] mtvsrws XT,RA
-
-// Move to CR from XER Extended X-form p119
-[PO BF // /// /// XO /] mcrxrx BF
-
-// Set Boolean X-form p121
-[PO RT BFA // /// XO /] setb RT,BFA
-
-// Message Synchronize X-form p1126
-[PO /// /// /// XO /] msgsync
-
-// SLB Invalidate Entry Global X-form p1026
-[PO RS /// RB XO /] slbieg RS,RB
-
-// SLB Synchronize X-form p1031
-[PO /// /// /// XO /] slbsync
-
-// 3.3.2.1 Power-Saving Mode Instruction
-
-// stop XL-form p957
-[PO /// /// /// XO /] stop
-
-// 4.6.4 Wait Instruction
-// Wait X-form p880
-[PO /// WC /// /// XO /] wait
-
-// Unknow Instructions:
-urfid -- gcc's implementation:
-  {"urfid", XL(19,306), 0xffffffff, POWER9, PPCNONE, {0}},
-  (4c 00 02 64|64 02 00 4c) urfid
-
-rmieg -- gcc's implementation:
-  {"rmieg", X(31,882), XRTRA_MASK, POWER9, PPCNONE, {RB}},
-  (7c 00 f6 e4|e4 f6 00 7c) rmieg r30
-
-//------------------------------------------------------------------------------
-//. Done:
-//------------------------------------------------------------------------------
-
-//======================================
-"vsx instructions"
-
-//--------------------------------------
-"7.6.1.2.1 VSX Scalar Move Instructions"
-// VSX Scalar Quad-Precision Move Instructions
-
-// VSX Scalar Copy Sign Quad-Precision X-form p.553
-[PO VRT VRA VRB XO /] xscpsgnqp
-
-// VSX Scalar Absolute Quad-Precision X-form 531
-// VSX Scalar Negate Quad-Precision X-form 627
-// VSX Scalar Negative Absolute Quad-Precision X-form 626
-[PO VRT XO VRB XO /] xsabsqp xsnegqp xsnabsqp
-
-//--------------------------------------
-"7.6.1.3 VSX Floating-Point Arithmetic Instructions"
-
-// VSX Scalar Quad-Precision Elementary Arithmetic
-
-// VSX Scalar Add Quad-Precision [using round to Odd] X-form 539
-// VSX Scalar Divide Quad-Precision [using round to Odd] X-form 584
-// VSX Scalar Multiply Quad-Precision [using round to Odd] X-form 622
-[PO VRT VRA VRB XO RO] xsaddqp xsaddqpo xsdivqp xsdivqpo xsmulqp xsmulqpo
-
-// VSX Scalar Square Root Quad-Precision [using round to Odd] X-form 662
-// VSX Scalar Subtract Quad-Precision [using round to Odd] X-form 667
-  xssubqp xssubqpo
-
-[PO VRT XO VRB XO RO] xssqrtqp xssqrtqpo
-
-// VSX Scalar Quad-Precision Multiply-Add Arithmetic Instructions
-
-// VSX Scalar Multiply-Add Quad-Precision [using round to Odd] X-form 596
-// VSX Scalar Multiply-Subtract Quad-Precision [using round to Odd] X-form 617
-// VSX Scalar Negative Multiply-Add Quad-Precision [using round to Odd] X-form 636
-// VSX Scalar Negative Multiply-Subtract Quad-Precision [using round to Odd]
-// X-form 645
-[PO VRT VRA VRB XO RO] xsmaddqp xsmaddqpo xsmsubqp xsmsubqpo
-                       xsnmaddqp xsnmaddqpo xsnmsubqp xsnmsubqpo
-
-22
-//--------------------------------------
-"7.6.1.4 VSX Floating-Point Compare Instructions"
-
-// VSX Scalar Quad-Precision Compare Instructions
-
-// VSX Scalar Compare Ordered Quad-Precision X-form 549
-// VSX Scalar Compare Unordered Quad-Precision X-form 552
-[PO BF // VRA VRB XO /] xscmpoqp xscmpuqp
-
-"7.6.1.8 VSX Scalar Floating-Point Support Instructions"
-// VSX Scalar Compare Exponents Quad-Precision X-form p. 541 542
-[PO BF // A B XO AX BX /] xscmpexpdp
-[PO BF // VRA VRB XO /] xscmpexpqp
-
-// VSX Scalar Compare DP, XX3-form, p.543 544 545
-// VSX Scalar Compare Equal Double-Precision,
-[PO T A B XO AX BX TX] xscmpeqdp xscmpgedp xscmpgtdp xscmpnedp
-
-// VSX Vector Compare Not Equal Double-Precision XX3-form 691
-[PO T A B Rc XO AX BX TX] xvcmpnedp xvcmpnedp. xvcmpnesp xvcmpnesp.
-
-//--------------------------------------
-"7.6.1.5 VSX FP-FP Conversion Instructions"
-// VSX Scalar Quad-Precision Floating-Point Conversion Instructions
-
-// VSX Scalar round & Convert Quad-Precision format to Double-Precision format
-// [using round to Odd] X-form 567
-[PO VRT XO VRB XO /] xscvqpdp xscvqpdpo (actually [PO VRT XO VRB XO RO])
-[PO VRT XO VRB XO /] xscvdpqp
-
-// VSX Scalar Quad-Precision Convert to Integer Instructions
-
-// VSX Scalar truncate & Convert Quad-Precision format to Signed Doubleword format
-// 568 570 572 574
-[PO VRT XO VRB XO /] xscvqpsdz xscvqpswz xscvqpudz xscvqpuwz
-576 = 580 xscvsdqp xscvudqp
-
-"7.6.1.7 VSX Round to Floating-Point Integer Instructions"
-// VSX Scalar round & Convert Double-Precision format to Half-Precision format
-// XX2-form 554 566
-[PO T XO B XO BX TX] xscvdphp xscvhpdp
-
-// VSX Vector Convert Half-Precision format to Single-Precision format
-// XX2-form 703 705
-[PO T XO B XO BX TX] xvcvhpsp xvcvsphp
-
-// VSX Scalar Round to Quad-Precision Integer [with Inexact] Z23-form 654
-[PO VRT /// R VRB RMC XO EX] xsrqpi xsrqpix
-
-// VSX Scalar Round Quad-Precision to Double-Extended Precision Z23-form 656
-[PO VRT /// R VRB RMC XO /] xsrqpxp
-def XSRQPXP : Z23Form_1<63, 37,
-                        (outs vrrc:$vT), (ins u5imm:$R, vrrc:$vB, u2imm:$RMC),
-                        "xsrqpxp $vT, $R, $vB, $RMC"), IIC_VecFP, []>;
-
-27~28
-//--------------------------------------
-// VSX Scalar Insert Exponent Double-Precision X-form 588
-// VSX Scalar Insert Exponent Quad-Precision X-form 589
-[PO VT rA rB XO /] xsiexpdp
-[PO VRT VRA VRB XO /] xsiexpqp
-
-// VSX Vector Insert Exponent Double-Precision XX3-form 722
-[PO T A B XO AX BX TX] xviexpdp xviexpsp
-
-// VSX Vector Extract Unsigned Word XX2-form 788
-// VSX Vector Insert Word XX2-form
-[PO T / UIM B XO BX TX] xxextractuw xxinsertw
-
-// VSX Scalar Extract Exponent Double-Precision XX2-form 676
-[PO BF DCMX B XO BX /]
-[PO T XO B XO BX /] xsxexpdp xsxsigdp
-// X-form
-[PO VRT XO VRB XO /] xsxexpqp xsxsigqp
-
-// VSX Vector Extract Exponent Double-Precision XX2-form 784
-[PO T XO B XO BX TX] xvxexpdp xvxexpsp
-
-// VSX Vector Extract Significand Double-Precision XX2-form 785
-[PO T XO B XO BX TX] xvxsigdp xvxsigsp
-
-//--------------------------------------
-// VSX Scalar Test Data Class Double-Precision XX2-form p673
-// VSX Scalar Test Data Class Quad-Precision X-form 674
-// VSX Scalar Test Data Class Single-Precision XX2-form 675
-[PO BF DCMX B XO BX /] xststdcdp xststdcsp
-[PO BF DCMX VRB XO /] xststdcqp
-
-// VSX Vector Test Data Class Double-Precision XX2-form 782 783
-[PO T dx B XO dc XO dm BX TX] xvtstdcdp xvtstdcsp
-
-//--------------------------------------
-// VSX Scalar Maximum Type-C Double-Precision XX3-form 601 ~ 609
-[PO T A B XO AX BX TX] xsmaxcdp xsmaxjdp xsmincdp xsminjdp
-
-//--------------------------------------
-// VSX Vector Byte-Reverse Doubleword XX2-form 786 787
-[PO T XO B XO BX TX] xxbrd xxbrh xxbrq xxbrw
-
-// VSX Vector Permute XX3-form 794
-[PO T A B XO AX BX TX] xxperm xxpermr
-
-// VSX Vector Splat Immediate Byte 796 x-form
-[PO T EO IMM8 XO TX] xxspltib <= sign or unsigned?
-
-30
-//--------------------------------------
-// Load VSX Vector DQ-form 511
-[PO T RA DQ TX XO] lxv
-
-// Store VSX Vector DQ-form 526
-[PO S RA DQ SX XO] stxv
-
-// Load VSX Scalar Doubleword DS-form 499
-// Load VSX Scalar Single DS-form 504
-[PO VRT RA DS XO] lxsd lxssp
-
-// Store VSX Scalar Doubleword DS-form 517
-// Store VSX Scalar Single DS-form 520
-[PO VRT RA DS XO] stxsd stxssp
-
-
-// Load VSX Vector Indexed X-form 511
-// Load VSX Scalar as Integer Byte & Zero Indexed X-form 501
-// Load VSX Vector Byte*16 Indexed X-form 506
-// Load VSX Vector with Length X-form 508
-// Load VSX Vector Left-justified with Length X-form 510
-// Load VSX Vector Halfword*8 Indexed X-form 514
-// Load VSX Vector Word & Splat Indexed X-form 516
-[PO T RA RB XO TX] lxvx lxsibzx lxsihzx lxvb16x lxvl lxvll lxvh8x lxvwsx
-
-// Store VSX Scalar as Integer Byte Indexed X-form 518
-// Store VSX Scalar as Integer Halfword Indexed X-form 518
-// Store VSX Vector Byte*16 Indexed X-form 522
-// Store VSX Vector Halfword*8 Indexed X-form 524
-// Store VSX Vector with Length X-form 526
-// Store VSX Vector Left-justified with Length X-form 528
-// Store VSX Vector Indexed X-form 529
-[PO S RA RB XO SX] stxsibx stxsihx stxvb16x stxvh8x stxvl stxvll stxvx
-
-21
-
-//--------------------------------------
-". vector instructions"
-
-[1] PowerISA-v3.0 p.933 - Table 1, and Chapter 6. Vector Facility (altivec)
-[2] https://sourceware.org/ml/binutils/2015-11/msg00071.html
-
-//--------------------------------------
-New patch:
-// vector bit, p.367, 6.16 Vector Bit Permute Instruction
-[PO VRT VRA VRB XO] vbpermd, (existing: vbpermq)
-
-// vector permute, p.280
-[PO VRT VRA VRB VRC XO] vpermr
-
-// vector rotate left, p.341
-[PO VRT VRA VRB XO] vrlwnm vrlwmi vrldnm vrldmi
-
-// vector shift, p.285
-[PO VRT VRA VRB XO] vslv vsrv
-
-// vector multiply-by-10, p.375
-[PO VRT VRA /// XO] vmul10cuq vmul10uq
-[PO VRT VRA VRB XO] vmul10ecuq vmul10euq
-
-12
-//--------------------------------------
-http://reviews.llvm.org/D15887 + ext + neg + prty
- vbpermd
-// vector count leading/trailing zero
-. new vx-form: p.31, 1.6.14 VX-FORM
-[PO RT EO VRB XO] vclzlsbb vctzlsbb (p.363)
-
-// Vector Count Trailing Zeros Instructions, 362
-[PO VRT EO VRB XO] vctzb vctzh vctzw vctzd (v16i8 v8i16 v4i32 v2i64)
-
-// vector extend sign (p.314)
-[PO VRT EO VRB XO] vextsb2w vextsh2w vextsb2d vextsh2d vextsw2d
-
-// vector negate, p.313
-[PO VRT EO VRB XO] vnegd vnegw
-
-// vector parity, p.335
-[PO VRT EO VRB XO] vprtybd vprtybq vprtybw
-
-16
-//--------------------------------------
-// vector compare, p.330
-[PO VRT VRA VRB RC XO] vcmpneb vcmpneb. vcmpneh vcmpneh. vcmpnew vcmpnew.
-                       vcmpnezb vcmpnezb. vcmpnezh vcmpnezh. vcmpnezw vcmpnezw.
-12
-//--------------------------------------
-http://reviews.llvm.org/D15917 + insert
-// vector extract (p.287) ref: vspltb (v2.07, p.227)
-// vector insert, p.288
-[PO VRT / UIM VRB XO] vinsertb vinsertd vinserth vinsertw
-
-// Vector Extract Unsigned
-[PO VRT / UIM VRB XO] vextractub vextractuh vextractuw vextractd
-
-// p.364: Vector Extract Unsigned Left/Right-Indexed
-[PO RT RA VRB XO] vextublx vextubrx vextuhlx vextuhrx vextuwlx vextuwrx
-
-14
--
cgit v1.3