1 files changed, 292 insertions, 143 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index c8f1daf26de9..05d2dd000162 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -26,15 +26,40 @@ using namespace llvm;
 namespace {
 
 class SIShrinkInstructions : public MachineFunctionPass {
+  MachineRegisterInfo *MRI;
+  const GCNSubtarget *ST;
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+
 public:
   static char ID;
 
-  void shrinkMIMG(MachineInstr &MI);
-
 public:
   SIShrinkInstructions() : MachineFunctionPass(ID) {
   }
 
+  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
+  bool isKImmOperand(const MachineOperand &Src) const;
+  bool isKUImmOperand(const MachineOperand &Src) const;
+  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
+  bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
+  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
+  void shrinkScalarCompare(MachineInstr &MI) const;
+  void shrinkMIMG(MachineInstr &MI) const;
+  void shrinkMadFma(MachineInstr &MI) const;
+  bool shrinkScalarLogicOp(MachineInstr &MI) const;
+  bool tryReplaceDeadSDST(MachineInstr &MI) const;
+  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
+                     Register Reg, unsigned SubReg) const;
+  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
+                    unsigned SubReg) const;
+  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
+                       unsigned SubReg) const;
+  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
+                                                   unsigned I) const;
+  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
+  MachineInstr *matchSwap(MachineInstr &MovT) const;
+
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   StringRef getPassName() const override { return "SI Shrink Instructions"; }
@@ -59,8 +84,8 @@ FunctionPass *llvm::createSIShrinkInstructionsPass() {
 /// This function checks \p MI for operands defined by a move immediate
 /// instruction and then folds the literal constant into the instruction if it
 /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions.
-static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
-                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
+bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
+                                          bool TryToCommute) const {
   assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
 
   int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
@@ -69,8 +94,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
   MachineOperand &Src0 = MI.getOperand(Src0Idx);
   if (Src0.isReg()) {
     Register Reg = Src0.getReg();
-    if (Reg.isVirtual() && MRI.hasOneUse(Reg)) {
-      MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+    if (Reg.isVirtual()) {
+      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
       if (Def && Def->isMoveImmediate()) {
         MachineOperand &MovSrc = Def->getOperand(1);
         bool ConstantFolded = false;
@@ -91,8 +116,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
         }
 
         if (ConstantFolded) {
-          assert(MRI.use_empty(Reg));
-          Def->eraseFromParent();
+          if (MRI->use_nodbg_empty(Reg))
+            Def->eraseFromParent();
           ++NumLiteralConstantsFolded;
           return true;
         }
@@ -103,7 +128,7 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
   // We have failed to fold src0, so commute the instruction and try again.
   if (TryToCommute && MI.isCommutable()) {
     if (TII->commuteInstruction(MI)) {
-      if (foldImmediates(MI, TII, MRI, false))
+      if (foldImmediates(MI, false))
         return true;
 
       // Commute back.
@@ -114,21 +139,20 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
   return false;
 }
 
-static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
   return isInt<16>(Src.getImm()) &&
     !TII->isInlineConstant(*Src.getParent(),
                            Src.getParent()->getOperandNo(&Src));
 }
 
-static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
   return isUInt<16>(Src.getImm()) &&
     !TII->isInlineConstant(*Src.getParent(),
                            Src.getParent()->getOperandNo(&Src));
 }
 
-static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
-                                 const MachineOperand &Src,
-                                 bool &IsUnsigned) {
+bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
+                                                bool &IsUnsigned) const {
   if (isInt<16>(Src.getImm())) {
     IsUnsigned = false;
     return !TII->isInlineConstant(Src);
@@ -144,9 +168,8 @@ static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
 
 /// \returns true if the constant in \p Src should be replaced with a bitreverse
 /// of an inline immediate.
-static bool isReverseInlineImm(const SIInstrInfo *TII,
-                               const MachineOperand &Src,
-                               int32_t &ReverseImm) {
+bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
+                                              int32_t &ReverseImm) const {
   if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
     return false;
 
@@ -156,8 +179,9 @@ static bool isReverseInlineImm(const SIInstrInfo *TII,
 
 /// Copy implicit register operands from specified instruction to this
 /// instruction that are not part of the instruction definition.
-static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
-                                 const MachineInstr &MI) {
+void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
+                                                MachineInstr &MI) const {
+  MachineFunction &MF = *MI.getMF();
   for (unsigned i = MI.getDesc().getNumOperands() +
          MI.getDesc().getNumImplicitUses() +
          MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
@@ -168,7 +192,7 @@ static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
   }
 }
 
-static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
+void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
   // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
   // get constants on the RHS.
   if (!MI.getOperand(0).isReg())
@@ -191,7 +215,7 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
   // and initially selected to the unsigned versions.
   if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
     bool HasUImm;
-    if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
+    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
       if (!HasUImm) {
         SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
           AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
@@ -205,22 +229,30 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
 
   const MCInstrDesc &NewDesc = TII->get(SOPKOpc);
 
-  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
-      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
+  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
+      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
     MI.setDesc(NewDesc);
   }
 }
 
 // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
-void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
+void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
-  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
+  if (!Info)
     return;
 
-  MachineFunction *MF = MI.getParent()->getParent();
-  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  uint8_t NewEncoding;
+  switch (Info->MIMGEncoding) {
+  case AMDGPU::MIMGEncGfx10NSA:
+    NewEncoding = AMDGPU::MIMGEncGfx10Default;
+    break;
+  case AMDGPU::MIMGEncGfx11NSA:
+    NewEncoding = AMDGPU::MIMGEncGfx11Default;
+    break;
+  default:
+    return;
+  }
+
   int VAddr0Idx =
       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
   unsigned NewAddrDwords = Info->VAddrDwords;
@@ -246,16 +278,23 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
   }
 
   unsigned VgprBase = 0;
+  unsigned NextVgpr = 0;
   bool IsUndef = true;
   bool IsKill = NewAddrDwords == Info->VAddrDwords;
-  for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
-    const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
-    unsigned Vgpr = TRI.getHWRegIndex(Op.getReg());
+  for (unsigned Idx = 0; Idx < Info->VAddrOperands; ++Idx) {
+    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
+    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
+    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
+    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");
 
-    if (i == 0) {
+    if (Idx == 0) {
       VgprBase = Vgpr;
-    } else if (VgprBase + i != Vgpr)
+      NextVgpr = Vgpr + Dwords;
+    } else if (Vgpr == NextVgpr) {
+      NextVgpr = Vgpr + Dwords;
+    } else {
       return;
+    }
 
     if (!Op.isUndef())
       IsUndef = false;
@@ -288,21 +327,108 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
     }
   }
 
-  unsigned NewOpcode =
-      AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
-                            Info->VDataDwords, NewAddrDwords);
+  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
+                                             Info->VDataDwords, NewAddrDwords);
   MI.setDesc(TII->get(NewOpcode));
   MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
   MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
   MI.getOperand(VAddr0Idx).setIsKill(IsKill);
 
-  for (unsigned i = 1; i < Info->VAddrDwords; ++i)
-    MI.RemoveOperand(VAddr0Idx + 1);
+  for (int i = 1; i < Info->VAddrOperands; ++i)
+    MI.removeOperand(VAddr0Idx + 1);
 
   if (ToUntie >= 0) {
     MI.tieOperands(
         AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
-        ToUntie - (Info->VAddrDwords - 1));
+        ToUntie - (Info->VAddrOperands - 1));
+  }
+}
+
+// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
+void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
+  if (!ST->hasVOP3Literal())
+    return;
+  // The rewrite below omits all modifier operands, so none may be set.
+  if (TII->hasAnyModifiersSet(MI))
+    return;
+
+  const unsigned Opcode = MI.getOpcode();
+  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;
+  // Assigned on every path that sets NewOpcode; true = exchange src0/src1.
+  bool Swap;
+
+  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
+  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
+    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
+      Swap = false;
+    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
+      Swap = true;
+    else
+      return;
+
+    switch (Opcode) {
+    default:
+      llvm_unreachable("Unexpected mad/fma opcode!");
+    case AMDGPU::V_MAD_F32_e64:
+      NewOpcode = AMDGPU::V_MADAK_F32;
+      break;
+    case AMDGPU::V_FMA_F32_e64:
+      NewOpcode = AMDGPU::V_FMAAK_F32;
+      break;
+    case AMDGPU::V_MAD_F16_e64:
+      NewOpcode = AMDGPU::V_MADAK_F16;
+      break;
+    case AMDGPU::V_FMA_F16_e64:
+      NewOpcode = AMDGPU::V_FMAAK_F16;
+      break;
+    }
+  }
+
+  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
+  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
+    if (Src1.isImm() && !TII->isInlineConstant(Src1))
+      Swap = false;
+    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
+      Swap = true;
+    else
+      return;
+
+    switch (Opcode) {
+    default:
+      llvm_unreachable("Unexpected mad/fma opcode!");
+    case AMDGPU::V_MAD_F32_e64:
+      NewOpcode = AMDGPU::V_MADMK_F32;
+      break;
+    case AMDGPU::V_FMA_F32_e64:
+      NewOpcode = AMDGPU::V_FMAMK_F32;
+      break;
+    case AMDGPU::V_MAD_F16_e64:
+      NewOpcode = AMDGPU::V_MADMK_F16;
+      break;
+    case AMDGPU::V_FMA_F16_e64:
+      NewOpcode = AMDGPU::V_FMAMK_F16;
+      break;
+    }
+  }
+  // Neither pattern matched: leave MI untouched.
+  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
+    return;
+
+  if (Swap) {
+    // Swap Src0 and Src1 by building a new instruction.
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
+            MI.getOperand(0).getReg())
+        .add(Src1)
+        .add(Src0)
+        .add(Src2)
+        .setMIFlags(MI.getFlags());
+    MI.eraseFromParent();
+  } else {
+    TII->removeModOperands(MI);
+    MI.setDesc(TII->get(NewOpcode));
   }
 }
 
@@ -311,10 +437,7 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
 /// If the inverse of the immediate is legal, use ANDN2, ORN2 or
 /// XNOR (as a ^ b == ~(a ^ ~b)).
 /// \returns true if the caller should continue the machine function iterator
-static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
-                                MachineRegisterInfo &MRI,
-                                const SIInstrInfo *TII,
-                                MachineInstr &MI) {
+bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
   unsigned Opc = MI.getOpcode();
   const MachineOperand *Dest = &MI.getOperand(0);
   MachineOperand *Src0 = &MI.getOperand(1);
@@ -323,7 +446,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
   MachineOperand *SrcImm = Src1;
 
   if (!SrcImm->isImm() ||
-      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm()))
+      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
     return false;
 
   uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
@@ -333,7 +456,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
     if (isPowerOf2_32(~Imm)) {
       NewImm = countTrailingOnes(Imm);
       Opc = AMDGPU::S_BITSET0_B32;
-    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
       NewImm = ~Imm;
       Opc = AMDGPU::S_ANDN2_B32;
     }
@@ -341,12 +464,12 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
     if (isPowerOf2_32(Imm)) {
       NewImm = countTrailingZeros(Imm);
       Opc = AMDGPU::S_BITSET1_B32;
-    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
       NewImm = ~Imm;
       Opc = AMDGPU::S_ORN2_B32;
     }
   } else if (Opc == AMDGPU::S_XOR_B32) {
-    if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
       NewImm = ~Imm;
       Opc = AMDGPU::S_XNOR_B32;
     }
@@ -354,16 +477,10 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
     llvm_unreachable("unexpected opcode");
   }
 
-  if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
-      SrcImm == Src0) {
-    if (!TII->commuteInstruction(MI, false, 1, 2))
-      NewImm = 0;
-  }
-
   if (NewImm != 0) {
     if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
-      MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
-      MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
+      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
       return true;
     }
 
@@ -390,19 +507,19 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
 
 // This is the same as MachineInstr::readsRegister/modifiesRegister except
 // it takes subregs into account.
-static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
-                          Register Reg, unsigned SubReg,
-                          const SIRegisterInfo &TRI) {
+bool SIShrinkInstructions::instAccessReg(
+    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
+    unsigned SubReg) const {
   for (const MachineOperand &MO : R) {
     if (!MO.isReg())
       continue;
 
     if (Reg.isPhysical() && MO.getReg().isPhysical()) {
-      if (TRI.regsOverlap(Reg, MO.getReg()))
+      if (TRI->regsOverlap(Reg, MO.getReg()))
         return true;
     } else if (MO.getReg() == Reg && Reg.isVirtual()) {
-      LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
-                            TRI.getSubRegIndexLaneMask(MO.getSubReg());
+      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
+                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
       if (Overlap.any())
         return true;
     }
@@ -410,33 +527,31 @@ static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
   return false;
 }
 
-static bool instReadsReg(const MachineInstr *MI,
-                         unsigned Reg, unsigned SubReg,
-                         const SIRegisterInfo &TRI) {
-  return instAccessReg(MI->uses(), Reg, SubReg, TRI);
+bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
+                                        unsigned SubReg) const {
+  return instAccessReg(MI->uses(), Reg, SubReg);
 }
 
-static bool instModifiesReg(const MachineInstr *MI,
-                            unsigned Reg, unsigned SubReg,
-                            const SIRegisterInfo &TRI) {
-  return instAccessReg(MI->defs(), Reg, SubReg, TRI);
+bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
+                                           unsigned SubReg) const {
+  return instAccessReg(MI->defs(), Reg, SubReg);
 }
 
-static TargetInstrInfo::RegSubRegPair
-getSubRegForIndex(Register Reg, unsigned Sub, unsigned I,
-                  const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
-  if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
+TargetInstrInfo::RegSubRegPair
+SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
+                                        unsigned I) const {
+  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
     if (Reg.isPhysical()) {
-      Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
+      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
     } else {
-      Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub));
+      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
     }
   }
   return TargetInstrInfo::RegSubRegPair(Reg, Sub);
 }
 
-static void dropInstructionKeepingImpDefs(MachineInstr &MI,
-                                          const SIInstrInfo *TII) {
+void SIShrinkInstructions::dropInstructionKeepingImpDefs(
+    MachineInstr &MI) const {
   for (unsigned i = MI.getDesc().getNumOperands() +
          MI.getDesc().getNumImplicitUses() +
          MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
@@ -464,14 +579,13 @@ static void dropInstructionKeepingImpDefs(MachineInstr &MI,
 // Returns next valid instruction pointer if was able to create v_swap_b32.
 //
 // This shall not be done too early not to prevent possible folding which may
-// remove matched moves, and this should prefereably be done before RA to
+// remove matched moves, and this should preferably be done before RA to
 // release saved registers and also possibly after RA which can insert copies
 // too.
 //
-// This is really just a generic peephole that is not a canocical shrinking,
+// This is really just a generic peephole that is not a canonical shrinking,
 // although requirements match the pass placement and it reduces code size too.
-static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
-                               const SIInstrInfo *TII) {
+MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
   assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
          MovT.getOpcode() == AMDGPU::COPY);
 
@@ -486,8 +600,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
 
   unsigned Size = TII->getOpSize(MovT, 0) / 4;
 
-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
-  if (!TRI.isVGPR(MRI, X))
+  if (!TRI->isVGPR(*MRI, X))
     return nullptr;
 
   if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0))
@@ -501,7 +614,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
        Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {
 
     MachineInstr *MovY = &*Iter;
-    KilledT = MovY->killsRegister(T, &TRI);
+    KilledT = MovY->killsRegister(T, TRI);
 
     if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
          MovY->getOpcode() != AMDGPU::COPY) ||
@@ -514,21 +627,20 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
     Register Y = MovY->getOperand(0).getReg();
     unsigned Ysub = MovY->getOperand(0).getSubReg();
 
-    if (!TRI.isVGPR(MRI, Y))
+    if (!TRI->isVGPR(*MRI, Y))
       continue;
 
     MachineInstr *MovX = nullptr;
     for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
          I != IY; ++I) {
-      if (instReadsReg(&*I, X, Xsub, TRI)    ||
-          instModifiesReg(&*I, Y, Ysub, TRI) ||
-          instModifiesReg(&*I, T, Tsub, TRI) ||
-          (MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
+      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
+          instModifiesReg(&*I, T, Tsub) ||
+          (MovX && instModifiesReg(&*I, X, Xsub))) {
         MovX = nullptr;
         break;
       }
-      if (!instReadsReg(&*I, Y, Ysub, TRI)) {
-        if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) {
+      if (!instReadsReg(&*I, Y, Ysub)) {
+        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
           MovX = nullptr;
           break;
         }
@@ -559,8 +671,8 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
 
     for (unsigned I = 0; I < Size; ++I) {
       TargetInstrInfo::RegSubRegPair X1, Y1;
-      X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI);
-      Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI);
+      X1 = getSubRegForIndex(X, Xsub, I);
+      Y1 = getSubRegForIndex(Y, Ysub, I);
       MachineBasicBlock &MBB = *MovT.getParent();
       auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                          TII->get(AMDGPU::V_SWAP_B32))
@@ -570,23 +682,23 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
         .addReg(X1.Reg, 0, X1.SubReg).getInstr();
       if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
         // Drop implicit EXEC.
-        MIB->RemoveOperand(MIB->getNumExplicitOperands());
+        MIB->removeOperand(MIB->getNumExplicitOperands());
         MIB->copyImplicitOps(*MBB.getParent(), *MovX);
       }
     }
     MovX->eraseFromParent();
-    dropInstructionKeepingImpDefs(*MovY, TII);
+    dropInstructionKeepingImpDefs(*MovY);
     MachineInstr *Next = &*std::next(MovT.getIterator());
 
-    if (T.isVirtual() && MRI.use_nodbg_empty(T)) {
-      dropInstructionKeepingImpDefs(MovT, TII);
+    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
+      dropInstructionKeepingImpDefs(MovT);
     } else {
       Xop.setIsKill(false);
       for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I ) {
         unsigned OpNo = MovT.getNumExplicitOperands() + I;
         const MachineOperand &Op = MovT.getOperand(OpNo);
-        if (Op.isKill() && TRI.regsOverlap(X, Op.getReg()))
-          MovT.RemoveOperand(OpNo);
+        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
+          MovT.removeOperand(OpNo);
       }
     }
 
@@ -596,14 +708,32 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
   return nullptr;
 }
 
+// On GFX10.3+, replace a dead virtual sdst with NULL; \returns true if changed.
+bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
+  if (!ST->hasGFX10_3Insts())
+    return false;
+  // MI must have an sdst operand whose virtual register has no non-debug uses.
+  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+  if (!Op)
+    return false;
+  Register SDstReg = Op->getReg();
+  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
+    return false;
+  // Wave32 uses SGPR_NULL; any other wave size uses SGPR_NULL64.
+  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
+  return true;
+}
+
 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
 
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
+  MRI = &MF.getRegInfo();
+  ST = &MF.getSubtarget<GCNSubtarget>();
+  TII = ST->getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+
+  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
 
   std::vector<unsigned> I1Defs;
 
@@ -628,7 +758,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         MachineOperand &Src = MI.getOperand(1);
         if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
           int32_t ReverseImm;
-          if (isReverseInlineImm(TII, Src, ReverseImm)) {
+          if (isReverseInlineImm(Src, ReverseImm)) {
             MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
             Src.setImm(ReverseImm);
             continue;
@@ -636,19 +766,15 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         }
       }
 
-      if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
-                           MI.getOpcode() == AMDGPU::COPY)) {
-        if (auto *NextMI = matchSwap(MI, MRI, TII)) {
+      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+                            MI.getOpcode() == AMDGPU::COPY)) {
+        if (auto *NextMI = matchSwap(MI)) {
           Next = NextMI->getIterator();
           continue;
         }
       }
 
-      // FIXME: We also need to consider movs of constant operands since
-      // immediate operands are not folded if they have more than one use, and
-      // the operand folding pass is unaware if the immediate will be free since
-      // it won't know if the src == dest constraint will end up being
-      // satisfied.
+      // Try to use S_ADDK_I32 and S_MULK_I32.
       if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
           MI.getOpcode() == AMDGPU::S_MUL_I32) {
         const MachineOperand *Dest = &MI.getOperand(0);
@@ -664,13 +790,13 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         // we have a vector add of a constant, we usually don't get the correct
         // allocation due to the subregister usage.
         if (Dest->getReg().isVirtual() && Src0->isReg()) {
-          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
-          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
+          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
+          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
           continue;
         }
 
         if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
-          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
+          if (Src1->isImm() && isKImmOperand(*Src1)) {
             unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
               AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
 
@@ -682,7 +808,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
 
       // Try to use s_cmpk_*
       if (MI.isCompare() && TII->isSOPC(MI)) {
-        shrinkScalarCompare(TII, MI);
+        shrinkScalarCompare(MI);
         continue;
       }
 
@@ -693,9 +819,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
 
         if (Src.isImm() && Dst.getReg().isPhysical()) {
           int32_t ReverseImm;
-          if (isKImmOperand(TII, Src))
+          if (isKImmOperand(Src))
             MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
-          else if (isReverseInlineImm(TII, Src, ReverseImm)) {
+          else if (isReverseInlineImm(Src, ReverseImm)) {
             MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
             Src.setImm(ReverseImm);
           }
@@ -708,47 +834,70 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
           MI.getOpcode() == AMDGPU::S_OR_B32 ||
           MI.getOpcode() == AMDGPU::S_XOR_B32) {
-        if (shrinkScalarLogicOp(ST, MRI, TII, MI))
+        if (shrinkScalarLogicOp(MI))
           continue;
       }
 
       if (TII->isMIMG(MI.getOpcode()) &&
-          ST.getGeneration() >= AMDGPUSubtarget::GFX10 &&
+          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
           MF.getProperties().hasProperty(
               MachineFunctionProperties::Property::NoVRegs)) {
         shrinkMIMG(MI);
         continue;
       }
 
-      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
+      if (!TII->isVOP3(MI))
+        continue;
+
+      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
+          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
+          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
+          MI.getOpcode() == AMDGPU::V_FMA_F16_e64) {
+        shrinkMadFma(MI);
         continue;
+      }
+
+      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
+        // If there is no chance we will shrink it and use VCC as sdst to get
+        // a 32-bit form, try to replace a dead sdst with NULL.
+        tryReplaceDeadSDST(MI);
+        continue;
+      }
 
-      if (!TII->canShrink(MI, MRI)) {
+      if (!TII->canShrink(MI, *MRI)) {
         // Try commuting the instruction and see if that enables us to shrink
         // it.
         if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
-            !TII->canShrink(MI, MRI))
+            !TII->canShrink(MI, *MRI)) {
+          tryReplaceDeadSDST(MI);
           continue;
+        }
       }
 
       int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
 
       if (TII->isVOPC(Op32)) {
-        Register DstReg = MI.getOperand(0).getReg();
-        if (DstReg.isVirtual()) {
-          // VOPC instructions can only write to the VCC register. We can't
-          // force them to use VCC here, because this is only one register and
-          // cannot deal with sequences which would require multiple copies of
-          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
-          //
-          // So, instead of forcing the instruction to write to VCC, we provide
-          // a hint to the register allocator to use VCC and then we will run
-          // this pass again after RA and shrink it if it outputs to VCC.
-          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
-          continue;
+        MachineOperand &Op0 = MI.getOperand(0);
+        if (Op0.isReg()) {
+          // Exclude VOPCX instructions as these don't explicitly write a
+          // dst.
+          Register DstReg = Op0.getReg();
+          if (DstReg.isVirtual()) {
+            // VOPC instructions can only write to the VCC register. We can't
+            // force them to use VCC here, because this is only one register and
+            // cannot deal with sequences which would require multiple copies of
+            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
+            //
+            // So, instead of forcing the instruction to write to VCC, we
+            // provide a hint to the register allocator to use VCC and then we
+            // will run this pass again after RA and shrink it if it outputs to
+            // VCC.
+            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
+            continue;
+          }
+          if (DstReg != VCCReg)
+            continue;
         }
-        if (DstReg != VCCReg)
-          continue;
       }
 
       if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
@@ -760,7 +909,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
           continue;
         Register SReg = Src2->getReg();
         if (SReg.isVirtual()) {
-          MRI.setRegAllocationHint(SReg, 0, VCCReg);
+          MRI->setRegAllocationHint(SReg, 0, VCCReg);
           continue;
         }
         if (SReg != VCCReg)
@@ -776,7 +925,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
 
         if (SDst->getReg() != VCCReg) {
           if (SDst->getReg().isVirtual())
-            MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg);
+            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
           Next = true;
         }
 
@@ -786,7 +935,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
                                                           AMDGPU::OpName::src2);
         if (Src2 && Src2->getReg() != VCCReg) {
           if (Src2->getReg().isVirtual())
-            MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg);
+            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
           Next = true;
         }
 
@@ -801,14 +950,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       ++NumInstructionsShrunk;
 
       // Copy extra operands not present in the instruction definition.
-      copyExtraImplicitOps(*Inst32, MF, MI);
+      copyExtraImplicitOps(*Inst32, MI);
 
       // Copy deadness from the old explicit vcc def to the new implicit def.
       if (SDst && SDst->isDead())
         Inst32->findRegisterDefOperand(VCCReg)->setIsDead();
 
       MI.eraseFromParent();
-      foldImmediates(*Inst32, TII, MRI);
+      foldImmediates(*Inst32);
 
       LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
     }