Diffstat (limited to 'lib/Target/AMDGPU/SIShrinkInstructions.cpp')
-rw-r--r--  lib/Target/AMDGPU/SIShrinkInstructions.cpp  | 140
1 file changed, 125 insertions, 15 deletions
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 6ad7dd0e3a7c..7ee178149c7a 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -1,9 +1,8 @@
//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
@@ -39,6 +38,8 @@ class SIShrinkInstructions : public MachineFunctionPass {
public:
static char ID;
+ void shrinkMIMG(MachineInstr &MI);
+
public:
SIShrinkInstructions() : MachineFunctionPass(ID) {
}
@@ -94,6 +95,10 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
Src0.setSubReg(0);
Src0.ChangeToFrameIndex(MovSrc.getIndex());
ConstantFolded = true;
+ } else if (MovSrc.isGlobal()) {
+ Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
+ MovSrc.getTargetFlags());
+ ConstantFolded = true;
}
if (ConstantFolded) {
@@ -212,6 +217,96 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
}
}
+// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
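+// The GFX10 NSA (non-sequential address) form uses extra encoding dwords to
+// name each address VGPR individually; when the addresses already occupy
+// consecutive registers, the shorter contiguous encoding can be used instead.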
+void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
+ if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
+ return;
+
+ MachineFunction *MF = MI.getParent()->getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ int VAddr0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
+ unsigned NewAddrDwords = Info->VAddrDwords;
+ const TargetRegisterClass *RC;
+
+ if (Info->VAddrDwords == 2) {
+ RC = &AMDGPU::VReg_64RegClass;
+ } else if (Info->VAddrDwords == 3) {
+ RC = &AMDGPU::VReg_96RegClass;
+ } else if (Info->VAddrDwords == 4) {
+ RC = &AMDGPU::VReg_128RegClass;
+ } else if (Info->VAddrDwords <= 8) {
+ RC = &AMDGPU::VReg_256RegClass;
+ NewAddrDwords = 8;
+ } else {
+ RC = &AMDGPU::VReg_512RegClass;
+ NewAddrDwords = 16;
+ }
+
+ unsigned VgprBase = 0;
+ bool IsUndef = true;
+ bool IsKill = NewAddrDwords == Info->VAddrDwords;
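+  // The contiguous encoding takes a single register tuple, so every address
+  // operand must already live in consecutive VGPRs starting at vaddr0.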
+ for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
+ const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
+ unsigned Vgpr = TRI.getHWRegIndex(Op.getReg());
+
+ if (i == 0) {
+ VgprBase = Vgpr;
+ } else if (VgprBase + i != Vgpr)
+ return;
+
+ if (!Op.isUndef())
+ IsUndef = false;
+ if (!Op.isKill())
+ IsKill = false;
+ }
+
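+  // The widened register tuple must not extend past the last VGPR (v255).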
+ if (VgprBase + NewAddrDwords > 256)
+ return;
+
+  // Further check for an implicit tied operand - one may be present if TFE
+  // or LWE is enabled.
+ int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
+ int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
+ unsigned TFEVal = MI.getOperand(TFEIdx).getImm();
+ unsigned LWEVal = MI.getOperand(LWEIdx).getImm();
+ int ToUntie = -1;
+ if (TFEVal || LWEVal) {
+ // TFE/LWE is enabled so we need to deal with an implicit tied operand
+ for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
+ if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
+ MI.getOperand(i).isImplicit()) {
+ // This is the tied operand
+ assert(
+ ToUntie == -1 &&
+ "found more than one tied implicit operand when expecting only 1");
+ ToUntie = i;
+ MI.untieRegOperand(ToUntie);
+ }
+ }
+ }
+
+ unsigned NewOpcode =
+ AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
+ Info->VDataDwords, NewAddrDwords);
+ MI.setDesc(TII->get(NewOpcode));
+ MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
+ MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
+ MI.getOperand(VAddr0Idx).setIsKill(IsKill);
+
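+  // Drop the now-redundant vaddr operands; each removal shifts the remaining
+  // operands down, so the operand at VAddr0Idx + 1 is erased repeatedly.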
+ for (unsigned i = 1; i < Info->VAddrDwords; ++i)
+ MI.RemoveOperand(VAddr0Idx + 1);
+
+ if (ToUntie >= 0) {
+ MI.tieOperands(
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
+ ToUntie - (Info->VAddrDwords - 1));
+ }
+}
+
/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
@@ -277,7 +372,9 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
if (Opc == AMDGPU::S_BITSET0_B32 ||
Opc == AMDGPU::S_BITSET1_B32) {
Src0->ChangeToImmediate(NewImm);
- MI.RemoveOperand(2);
+ // Remove the immediate and add the tied input.
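+      // S_BITSET{0,1} only updates a single bit, so the previous value of the
+      // destination is also an input and has to be tied to the result.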
+ MI.getOperand(2).ChangeToRegister(Dest->getReg(), false);
+ MI.tieOperands(0, 2);
} else {
SrcImm->setImm(NewImm);
}
@@ -458,6 +555,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
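+  // In wave32 mode the condition/carry register is the 32-bit VCC_LO rather
+  // than the full 64-bit VCC register pair.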
+ unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
std::vector<unsigned> I1Defs;
@@ -596,6 +694,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
}
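+      // NSA shrinking rewrites physical register operands, so only attempt it
+      // after register allocation (NoVRegs), and only on GFX10+ where the NSA
+      // encoding exists.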
+ if (TII->isMIMG(MI.getOpcode()) &&
+ ST.getGeneration() >= AMDGPUSubtarget::GFX10 &&
+ MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::NoVRegs)) {
+ shrinkMIMG(MI);
+ continue;
+ }
+
if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
continue;
@@ -625,10 +731,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// So, instead of forcing the instruction to write to VCC, we provide
// a hint to the register allocator to use VCC and then we will run
// this pass again after RA and shrink it if it outputs to VCC.
- MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
+ MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
continue;
}
- if (DstReg != AMDGPU::VCC)
+ if (DstReg != VCCReg)
continue;
}
@@ -641,10 +747,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
unsigned SReg = Src2->getReg();
if (TargetRegisterInfo::isVirtualRegister(SReg)) {
- MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
+ MRI.setRegAllocationHint(SReg, 0, VCCReg);
continue;
}
- if (SReg != AMDGPU::VCC)
+ if (SReg != VCCReg)
continue;
}
@@ -657,20 +763,24 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
AMDGPU::OpName::src2);
if (SDst) {
- if (SDst->getReg() != AMDGPU::VCC) {
+ bool Next = false;
+
+ if (SDst->getReg() != VCCReg) {
if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
- MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
- continue;
+ MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg);
+ Next = true;
}
// All of the instructions with carry outs also have an SGPR input in
// src2.
- if (Src2 && Src2->getReg() != AMDGPU::VCC) {
+ if (Src2 && Src2->getReg() != VCCReg) {
if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
- MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);
+ MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg);
+ Next = true;
+ }
+ if (Next)
continue;
- }
}
// We can shrink this instruction