Diffstat (limited to 'lib/Target/AMDGPU/SIFoldOperands.cpp')
-rw-r--r--  lib/Target/AMDGPU/SIFoldOperands.cpp  164
1 file changed, 148 insertions, 16 deletions
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 338cabcb906bc..f4e8669583699 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -35,13 +35,16 @@ struct FoldCandidate {
     uint64_t ImmToFold;
     int FrameIndexToFold;
   };
+  int ShrinkOpcode;
   unsigned char UseOpNo;
   MachineOperand::MachineOperandType Kind;
   bool Commuted;
 
   FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
-                bool Commuted_ = false) :
-    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
+                bool Commuted_ = false,
+                int ShrinkOp = -1) :
+    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
+    Kind(FoldOp->getType()),
     Commuted(Commuted_) {
     if (FoldOp->isImm()) {
       ImmToFold = FoldOp->getImm();
@@ -68,6 +71,14 @@ struct FoldCandidate {
   bool isCommuted() const {
     return Commuted;
   }
+
+  bool needsShrink() const {
+    return ShrinkOpcode != -1;
+  }
+
+  int getShrinkOpcode() const {
+    return ShrinkOpcode;
+  }
 };
 
 class SIFoldOperands : public MachineFunctionPass {
@@ -154,6 +165,7 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
 }
 
 static bool updateOperand(FoldCandidate &Fold,
+                          const SIInstrInfo &TII,
                           const TargetRegisterInfo &TRI) {
   MachineInstr *MI = Fold.UseMI;
   MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
@@ -189,10 +201,49 @@ static bool updateOperand(FoldCandidate &Fold,
         Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
       }
     }
+
+    if (Fold.needsShrink()) {
+      MachineBasicBlock *MBB = MI->getParent();
+      auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
+      if (Liveness != MachineBasicBlock::LQR_Dead)
+        return false;
+
+      MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+      int Op32 = Fold.getShrinkOpcode();
+      MachineOperand &Dst0 = MI->getOperand(0);
+      MachineOperand &Dst1 = MI->getOperand(1);
+      assert(Dst0.isDef() && Dst1.isDef());
+
+      bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
+
+      const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
+      unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
+      const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());
+      unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);
+
+      MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
+
+      if (HaveNonDbgCarryUse) {
+        BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
+          .addReg(AMDGPU::VCC, RegState::Kill);
+      }
+
+      // Keep the old instruction around to avoid breaking iterators, but
+      // replace the outputs with dummy registers.
+      Dst0.setReg(NewReg0);
+      Dst1.setReg(NewReg1);
+
+      if (Fold.isCommuted())
+        TII.commuteInstruction(*Inst32, false);
+      return true;
+    }
+
     Old.ChangeToImmediate(Fold.ImmToFold);
     return true;
   }
 
+  assert(!Fold.needsShrink() && "not handled");
+
   if (Fold.isFI()) {
     Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
     return true;
@@ -261,6 +312,8 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
     if (isUseMIInFoldList(FoldList, MI))
       return false;
 
+    unsigned CommuteOpNo = OpNo;
+
     // Operand is not legal, so try to commute the instruction to
     // see if this makes it possible to fold.
     unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
@@ -269,11 +322,12 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
 
     if (CanCommute) {
       if (CommuteIdx0 == OpNo)
-        OpNo = CommuteIdx1;
+        CommuteOpNo = CommuteIdx1;
       else if (CommuteIdx1 == OpNo)
-        OpNo = CommuteIdx0;
+        CommuteOpNo = CommuteIdx0;
     }
 
+
     // One of operands might be an Imm operand, and OpNo may refer to it after
     // the call of commuteInstruction() below. Such situations are avoided
     // here explicitly as OpNo must be a register operand to be a candidate
@@ -286,12 +340,34 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
         !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
       return false;
 
-    if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
+      if ((Opc == AMDGPU::V_ADD_I32_e64 ||
+           Opc == AMDGPU::V_SUB_I32_e64 ||
+           Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
+          OpToFold->isImm()) {
+        MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+        // Verify the other operand is a VGPR, otherwise we would violate the
+        // constant bus restriction.
+        unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
+        MachineOperand &OtherOp = MI->getOperand(OtherIdx);
+        if (!OtherOp.isReg() ||
+            !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
+          return false;
+
+        assert(MI->getOperand(1).isDef());
+
+        int Op32 =  AMDGPU::getVOPe32(Opc);
+        FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
+                                         Op32));
+        return true;
+      }
+
       TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
       return false;
     }
 
-    FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true));
+    FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
     return true;
   }
 
@@ -362,8 +438,6 @@ void SIFoldOperands::foldOperand(
 
   bool FoldingImm = OpToFold.isImm();
 
-  // In order to fold immediates into copies, we need to change the
-  // copy to a MOV.
   if (FoldingImm && UseMI->isCopy()) {
     unsigned DestReg = UseMI->getOperand(0).getReg();
     const TargetRegisterClass *DestRC
@@ -371,6 +445,31 @@ void SIFoldOperands::foldOperand(
       MRI->getRegClass(DestReg) :
       TRI->getPhysRegClass(DestReg);
 
+    unsigned SrcReg  = UseMI->getOperand(1).getReg();
+    if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
+      TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+      const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
+      if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
+        MachineRegisterInfo::use_iterator NextUse;
+        SmallVector<FoldCandidate, 4> CopyUses;
+        for (MachineRegisterInfo::use_iterator
+          Use = MRI->use_begin(DestReg), E = MRI->use_end();
+          Use != E; Use = NextUse) {
+          NextUse = std::next(Use);
+          FoldCandidate FC = FoldCandidate(Use->getParent(),
+           Use.getOperandNo(), &UseMI->getOperand(1));
+          CopyUses.push_back(FC);
+       }
+        for (auto & F : CopyUses) {
+          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
+           FoldList, CopiesToReplace);
+        }
+      }
+    }
+
+    // In order to fold immediates into copies, we need to change the
+    // copy to a MOV.
+
     unsigned MovOp = TII->getMovOpcode(DestRC);
     if (MovOp == AMDGPU::COPY)
       return;
@@ -378,6 +477,20 @@ void SIFoldOperands::foldOperand(
     UseMI->setDesc(TII->get(MovOp));
     CopiesToReplace.push_back(UseMI);
   } else {
+    if (UseMI->isCopy() && OpToFold.isReg() &&
+        TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
+        TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(1).getReg()) &&
+        TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+        TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) &&
+        !UseMI->getOperand(1).getSubReg()) {
+      UseMI->getOperand(1).setReg(OpToFold.getReg());
+      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
+      UseMI->getOperand(1).setIsKill(false);
+      CopiesToReplace.push_back(UseMI);
+      OpToFold.setIsKill(false);
+      return;
+    }
+
     const MCInstrDesc &UseDesc = UseMI->getDesc();
 
     // Don't fold into target independent nodes.  Target independent opcodes
@@ -550,6 +663,19 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
   if (!Src0->isImm() && !Src1->isImm())
     return false;
 
+  if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
+    if (Src0->isImm() && Src0->getImm() == 0) {
+      // v_lshl_or_b32 0, X, Y -> copy Y
+      // v_lshl_or_b32 0, X, K -> v_mov_b32 K
+      bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
+      MI->RemoveOperand(Src1Idx);
+      MI->RemoveOperand(Src0Idx);
+
+      MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32));
+      return true;
+    }
+  }
+
   // and k0, k1 -> v_mov_b32 (k0 & k1)
   // or k0, k1 -> v_mov_b32 (k0 | k1)
   // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
@@ -728,13 +854,17 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
     }
   } else {
     // Folding register.
+    SmallVector <MachineRegisterInfo::use_iterator, 4> UsesToProcess;
     for (MachineRegisterInfo::use_iterator
            Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
          Use != E; ++Use) {
-      MachineInstr *UseMI = Use->getParent();
+      UsesToProcess.push_back(Use);
+    }
+    for (auto U : UsesToProcess) {
+      MachineInstr *UseMI = U->getParent();
 
-      foldOperand(OpToFold, UseMI, Use.getOperandNo(),
-                  FoldList, CopiesToReplace);
+      foldOperand(OpToFold, UseMI, U.getOperandNo(),
+        FoldList, CopiesToReplace);
     }
   }
 
@@ -744,7 +874,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
     Copy->addImplicitDefUseOperands(*MF);
 
   for (FoldCandidate &Fold : FoldList) {
-    if (updateOperand(Fold, *TRI)) {
+    if (updateOperand(Fold, *TII, *TRI)) {
      // Clear kill flags.
       if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
@@ -981,9 +1111,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
   // correctly handle signed zeros.
   //
-  // TODO: Check nsz on instructions when fast math flags are preserved to MI
-  // level.
-  bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
+  bool IsIEEEMode = ST->enableIEEEBit(MF);
+  bool HasNSZ = MFI->hasNoSignedZerosFPMath();
 
   for (MachineBasicBlock *MBB : depth_first(&MF)) {
     MachineBasicBlock::iterator I, Next;
@@ -994,7 +1123,10 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
       tryFoldInst(TII, &MI);
 
       if (!TII->isFoldableCopy(MI)) {
-        if (IsIEEEMode || !tryFoldOMod(MI))
+        // TODO: Omod might be OK if there is NSZ only on the source
+        // instruction, and not the omod multiply.
+        if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
+            !tryFoldOMod(MI))
          tryFoldClamp(MI);
         continue;
       }
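Note: the sketch below is illustrative and not part of the commit. It shows the kind of rewrite the new needsShrink()/getShrinkOpcode() path enables: an immediate that is not legal for the VOP3 (e64) form of V_ADD_I32 (for example a literal such as 0x1234, which pre-GFX10 VOP3 encodings cannot hold) can still be folded by shrinking to the VOP2 (e32) form, which accepts a literal in src0 and writes its carry-out implicitly to VCC. Per the checks in tryAddToFoldList()/updateOperand(), this only happens when VCC is dead at that point and the other source is a VGPR. Register numbers here are made up.

    ; before: the literal sits in a separate move, the add uses the VOP3 encoding
    v_mov_b32_e32 v1, 0x1234
    v_add_i32_e64 v2, s[0:1], v1, v0
    ; after folding + shrinking: the literal is folded into the VOP2 encoding
    v_add_i32_e32 v2, vcc, 0x1234, v0

If the carry-out previously written to s[0:1] has non-debug uses, updateOperand() also inserts a COPY from VCC so the value is preserved.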
