Diffstat (limited to 'lib/Target/AMDGPU/SIShrinkInstructions.cpp')
-rw-r--r-- | lib/Target/AMDGPU/SIShrinkInstructions.cpp | 152
1 file changed, 114 insertions, 38 deletions
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 4f0913fe62f23..6cba55300a8cd 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -31,10 +31,6 @@ STATISTIC(NumInstructionsShrunk,
 STATISTIC(NumLiteralConstantsFolded,
           "Number of literal constants folded into 32-bit instructions.");
 
-namespace llvm {
-  void initializeSIShrinkInstructionsPass(PassRegistry&);
-}
-
 using namespace llvm;
 
 namespace {
@@ -61,10 +57,8 @@ public:
 
 } // End anonymous namespace.
 
-INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE,
-                      "SI Lower il Copies", false, false)
-INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE,
-                    "SI Lower il Copies", false, false)
+INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
+                "SI Shrink Instructions", false, false)
 
 char SIShrinkInstructions::ID = 0;
 
@@ -125,10 +119,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
   if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
     return false;
 
-  if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
-    return false;
-
-  return true;
+  return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
 }
 
 /// \brief This function checks \p MI for operands defined by a move immediate
@@ -181,31 +172,37 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
   }
 
   // We have failed to fold src0, so commute the instruction and try again.
-  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI))
+  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
     foldImmediates(MI, TII, MRI, false);
 
 }
 
 // Copy MachineOperand with all flags except setting it as implicit.
-static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) {
-  assert(!Orig.isImplicit());
-  return MachineOperand::CreateReg(Orig.getReg(),
-                                   Orig.isDef(),
-                                   true,
-                                   Orig.isKill(),
-                                   Orig.isDead(),
-                                   Orig.isUndef(),
-                                   Orig.isEarlyClobber(),
-                                   Orig.getSubReg(),
-                                   Orig.isDebug(),
-                                   Orig.isInternalRead());
+static void copyFlagsToImplicitVCC(MachineInstr &MI,
+                                   const MachineOperand &Orig) {
+
+  for (MachineOperand &Use : MI.implicit_operands()) {
+    if (Use.getReg() == AMDGPU::VCC) {
+      Use.setIsUndef(Orig.isUndef());
+      Use.setIsKill(Orig.isKill());
+      return;
+    }
+  }
+}
+
+static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+  return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
 }
 
 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
   std::vector<unsigned> I1Defs;
 
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -217,14 +214,94 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       Next = std::next(I);
       MachineInstr &MI = *I;
 
+      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
+        // If this has a literal constant source that is the same as the
+        // reversed bits of an inline immediate, replace with a bitreverse of
+        // that constant. This saves 4 bytes in the common case of materializing
+        // sign bits.
+
+        // Test if we are after regalloc. We only want to do this after any
+        // optimizations happen because this will confuse them.
+        // XXX - not exactly a check for post-regalloc run.
+        MachineOperand &Src = MI.getOperand(1);
+        if (Src.isImm() &&
+            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
+          int64_t Imm = Src.getImm();
+          if (isInt<32>(Imm) && !TII->isInlineConstant(Src, 4)) {
+            int32_t ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Imm));
+            if (ReverseImm >= -16 && ReverseImm <= 64) {
+              MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
+              Src.setImm(ReverseImm);
+              continue;
+            }
+          }
+        }
+      }
+
+      // Combine adjacent s_nops to use the immediate operand encoding how long
+      // to wait.
+      //
+      // s_nop N
+      // s_nop M
+      //  =>
+      // s_nop (N + M)
+      if (MI.getOpcode() == AMDGPU::S_NOP &&
+          Next != MBB.end() &&
+          (*Next).getOpcode() == AMDGPU::S_NOP) {
+
+        MachineInstr &NextMI = *Next;
+        // The instruction encodes the amount to wait with an offset of 1,
+        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
+        // after adding.
+        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
+        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;
+
+        // Make sure we don't overflow the bounds.
+        if (Nop0 + Nop1 <= 8) {
+          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
+          MI.eraseFromParent();
+        }
+
+        continue;
+      }
+
+      // FIXME: We also need to consider movs of constant operands since
+      // immediate operands are not folded if they have more than one use, and
+      // the operand folding pass is unaware if the immediate will be free since
+      // it won't know if the src == dest constraint will end up being
+      // satisfied.
+      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
+          MI.getOpcode() == AMDGPU::S_MUL_I32) {
+        const MachineOperand &Dest = MI.getOperand(0);
+        const MachineOperand &Src0 = MI.getOperand(1);
+        const MachineOperand &Src1 = MI.getOperand(2);
+
+        // FIXME: This could work better if hints worked with subregisters. If
+        // we have a vector add of a constant, we usually don't get the correct
+        // allocation due to the subregister usage.
+        if (TargetRegisterInfo::isVirtualRegister(Dest.getReg()) &&
+            Src0.isReg()) {
+          MRI.setRegAllocationHint(Dest.getReg(), 0, Src0.getReg());
+          continue;
+        }
+
+        if (Src0.isReg() && Src0.getReg() == Dest.getReg()) {
+          if (Src1.isImm() && isKImmOperand(TII, Src1)) {
+            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
+              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
+
+            MI.setDesc(TII->get(Opc));
+            MI.tieOperands(0, 1);
+          }
+        }
+      }
+
       // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
       if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
         const MachineOperand &Src = MI.getOperand(1);
 
-        if (Src.isImm()) {
-          if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4))
-            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
-        }
+        if (Src.isImm() && isKImmOperand(TII, Src))
+          MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
 
         continue;
       }
@@ -235,7 +312,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       if (!canShrink(MI, TII, TRI, MRI)) {
         // Try commuting the instruction and see if that enables us to shrink
        // it.
-        if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
+        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
             !canShrink(MI, TII, TRI, MRI))
           continue;
       }
@@ -287,9 +364,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
 
-      // Add the dst operand if the 32-bit encoding also has an explicit $dst.
+      // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
       // For VOPC instructions, this is replaced by an implicit def of vcc.
-      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst);
+      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
       if (Op32DstIdx != -1) {
         // dst
         Inst32.addOperand(MI.getOperand(0));
@@ -314,10 +391,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
           Inst32.addOperand(*Src2);
         } else {
           // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
-          // replaced with an implicit read of vcc.
-          assert(Src2->getReg() == AMDGPU::VCC &&
-                 "Unexpected missing register operand");
-          Inst32.addOperand(copyRegOperandAsImplicit(*Src2));
+          // replaced with an implicit read of vcc. This was already added
+          // during the initial BuildMI, so find it to preserve the flags.
+          copyFlagsToImplicitVCC(*Inst32, *Src2);
         }
       }
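As a quick sanity check on the arithmetic behind two of the new transforms, the short standalone C++ sketch below (not part of the commit; reverseBits32 is a hypothetical local stand-in for llvm::reverseBits<int32_t>) verifies that the sign-bit literal 0x80000000 bit-reverses to 1, which falls inside the inline-constant range [-16, 64] tested by the V_MOV_B32 case, and that two adjacent s_nops waiting 3 and 4 cycles merge into a single s_nop 6 within the 8-cycle bound.

#include <cassert>
#include <cstdint>

// Hypothetical local stand-in for llvm::reverseBits<int32_t>.
static int32_t reverseBits32(uint32_t V) {
  uint32_t R = 0;
  for (int I = 0; I < 32; ++I)
    R |= ((V >> I) & 1u) << (31 - I);
  return static_cast<int32_t>(R);
}

int main() {
  // v_mov_b32 of 0x80000000 needs a 32-bit literal, but its bit-reversed
  // value is 1, an inline constant, so it can be emitted as v_bfrev_b32
  // and save 4 bytes of encoding.
  int32_t Rev = reverseBits32(0x80000000u);
  assert(Rev == 1 && Rev >= -16 && Rev <= 64);

  // s_nop K waits K + 1 cycles: s_nop 2 and s_nop 3 wait 3 + 4 = 7 cycles,
  // which is within the 8-cycle bound, so they merge into one s_nop 6.
  int Nop0 = 2 + 1, Nop1 = 3 + 1;
  assert(Nop0 + Nop1 <= 8);
  assert(Nop0 + Nop1 - 1 == 6);
  return 0;
}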