diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 435 |
1 files changed, 292 insertions, 143 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index c8f1daf26de9..05d2dd000162 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -26,15 +26,40 @@ using namespace llvm; namespace { class SIShrinkInstructions : public MachineFunctionPass { + MachineRegisterInfo *MRI; + const GCNSubtarget *ST; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + public: static char ID; - void shrinkMIMG(MachineInstr &MI); - public: SIShrinkInstructions() : MachineFunctionPass(ID) { } + bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const; + bool isKImmOperand(const MachineOperand &Src) const; + bool isKUImmOperand(const MachineOperand &Src) const; + bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const; + bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const; + void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const; + void shrinkScalarCompare(MachineInstr &MI) const; + void shrinkMIMG(MachineInstr &MI) const; + void shrinkMadFma(MachineInstr &MI) const; + bool shrinkScalarLogicOp(MachineInstr &MI) const; + bool tryReplaceDeadSDST(MachineInstr &MI) const; + bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, + Register Reg, unsigned SubReg) const; + bool instReadsReg(const MachineInstr *MI, unsigned Reg, + unsigned SubReg) const; + bool instModifiesReg(const MachineInstr *MI, unsigned Reg, + unsigned SubReg) const; + TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub, + unsigned I) const; + void dropInstructionKeepingImpDefs(MachineInstr &MI) const; + MachineInstr *matchSwap(MachineInstr &MovT) const; + bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { return "SI Shrink Instructions"; } @@ -59,8 +84,8 @@ FunctionPass *llvm::createSIShrinkInstructionsPass() { /// This function checks \p MI for operands defined by a move immediate /// instruction and then folds the literal constant into the instruction if it /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions. -static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, - MachineRegisterInfo &MRI, bool TryToCommute = true) { +bool SIShrinkInstructions::foldImmediates(MachineInstr &MI, + bool TryToCommute) const { assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); @@ -69,8 +94,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, MachineOperand &Src0 = MI.getOperand(Src0Idx); if (Src0.isReg()) { Register Reg = Src0.getReg(); - if (Reg.isVirtual() && MRI.hasOneUse(Reg)) { - MachineInstr *Def = MRI.getUniqueVRegDef(Reg); + if (Reg.isVirtual()) { + MachineInstr *Def = MRI->getUniqueVRegDef(Reg); if (Def && Def->isMoveImmediate()) { MachineOperand &MovSrc = Def->getOperand(1); bool ConstantFolded = false; @@ -91,8 +116,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, } if (ConstantFolded) { - assert(MRI.use_empty(Reg)); - Def->eraseFromParent(); + if (MRI->use_nodbg_empty(Reg)) + Def->eraseFromParent(); ++NumLiteralConstantsFolded; return true; } @@ -103,7 +128,7 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, // We have failed to fold src0, so commute the instruction and try again. if (TryToCommute && MI.isCommutable()) { if (TII->commuteInstruction(MI)) { - if (foldImmediates(MI, TII, MRI, false)) + if (foldImmediates(MI, false)) return true; // Commute back. @@ -114,21 +139,20 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, return false; } -static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { +bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const { return isInt<16>(Src.getImm()) && !TII->isInlineConstant(*Src.getParent(), Src.getParent()->getOperandNo(&Src)); } -static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { +bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const { return isUInt<16>(Src.getImm()) && !TII->isInlineConstant(*Src.getParent(), Src.getParent()->getOperandNo(&Src)); } -static bool isKImmOrKUImmOperand(const SIInstrInfo *TII, - const MachineOperand &Src, - bool &IsUnsigned) { +bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src, + bool &IsUnsigned) const { if (isInt<16>(Src.getImm())) { IsUnsigned = false; return !TII->isInlineConstant(Src); @@ -144,9 +168,8 @@ static bool isKImmOrKUImmOperand(const SIInstrInfo *TII, /// \returns true if the constant in \p Src should be replaced with a bitreverse /// of an inline immediate. -static bool isReverseInlineImm(const SIInstrInfo *TII, - const MachineOperand &Src, - int32_t &ReverseImm) { +bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src, + int32_t &ReverseImm) const { if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src)) return false; @@ -156,8 +179,9 @@ static bool isReverseInlineImm(const SIInstrInfo *TII, /// Copy implicit register operands from specified instruction to this /// instruction that are not part of the instruction definition. -static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF, - const MachineInstr &MI) { +void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI, + MachineInstr &MI) const { + MachineFunction &MF = *MI.getMF(); for (unsigned i = MI.getDesc().getNumOperands() + MI.getDesc().getNumImplicitUses() + MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands(); @@ -168,7 +192,7 @@ static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF, } } -static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { +void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to // get constants on the RHS. if (!MI.getOperand(0).isReg()) @@ -191,7 +215,7 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { // and initially selected to the unsigned versions. if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) { bool HasUImm; - if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) { + if (isKImmOrKUImmOperand(Src1, HasUImm)) { if (!HasUImm) { SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ? AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32; @@ -205,22 +229,30 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { const MCInstrDesc &NewDesc = TII->get(SOPKOpc); - if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) || - (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) { + if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) || + (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) { MI.setDesc(NewDesc); } } // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. -void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { +void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); - if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) + if (!Info) return; - MachineFunction *MF = MI.getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); + uint8_t NewEncoding; + switch (Info->MIMGEncoding) { + case AMDGPU::MIMGEncGfx10NSA: + NewEncoding = AMDGPU::MIMGEncGfx10Default; + break; + case AMDGPU::MIMGEncGfx11NSA: + NewEncoding = AMDGPU::MIMGEncGfx11Default; + break; + default: + return; + } + int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); unsigned NewAddrDwords = Info->VAddrDwords; @@ -246,16 +278,23 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { } unsigned VgprBase = 0; + unsigned NextVgpr = 0; bool IsUndef = true; bool IsKill = NewAddrDwords == Info->VAddrDwords; - for (unsigned i = 0; i < Info->VAddrDwords; ++i) { - const MachineOperand &Op = MI.getOperand(VAddr0Idx + i); - unsigned Vgpr = TRI.getHWRegIndex(Op.getReg()); + for (unsigned Idx = 0; Idx < Info->VAddrOperands; ++Idx) { + const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx); + unsigned Vgpr = TRI->getHWRegIndex(Op.getReg()); + unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32; + assert(Dwords > 0 && "Un-implemented for less than 32 bit regs"); - if (i == 0) { + if (Idx == 0) { VgprBase = Vgpr; - } else if (VgprBase + i != Vgpr) + NextVgpr = Vgpr + Dwords; + } else if (Vgpr == NextVgpr) { + NextVgpr = Vgpr + Dwords; + } else { return; + } if (!Op.isUndef()) IsUndef = false; @@ -288,21 +327,108 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { } } - unsigned NewOpcode = - AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default, - Info->VDataDwords, NewAddrDwords); + unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding, + Info->VDataDwords, NewAddrDwords); MI.setDesc(TII->get(NewOpcode)); MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase)); MI.getOperand(VAddr0Idx).setIsUndef(IsUndef); MI.getOperand(VAddr0Idx).setIsKill(IsKill); - for (unsigned i = 1; i < Info->VAddrDwords; ++i) - MI.RemoveOperand(VAddr0Idx + 1); + for (int i = 1; i < Info->VAddrOperands; ++i) + MI.removeOperand(VAddr0Idx + 1); if (ToUntie >= 0) { MI.tieOperands( AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata), - ToUntie - (Info->VAddrDwords - 1)); + ToUntie - (Info->VAddrOperands - 1)); + } +} + +// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK. +void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { + if (!ST->hasVOP3Literal()) + return; + + if (TII->hasAnyModifiersSet(MI)) + return; + + const unsigned Opcode = MI.getOpcode(); + MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0); + MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1); + MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2); + unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END; + + bool Swap; + + // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form. + if (Src2.isImm() && !TII->isInlineConstant(Src2)) { + if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg())) + Swap = false; + else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg())) + Swap = true; + else + return; + + switch (Opcode) { + default: + llvm_unreachable("Unexpected mad/fma opcode!"); + case AMDGPU::V_MAD_F32_e64: + NewOpcode = AMDGPU::V_MADAK_F32; + break; + case AMDGPU::V_FMA_F32_e64: + NewOpcode = AMDGPU::V_FMAAK_F32; + break; + case AMDGPU::V_MAD_F16_e64: + NewOpcode = AMDGPU::V_MADAK_F16; + break; + case AMDGPU::V_FMA_F16_e64: + NewOpcode = AMDGPU::V_FMAAK_F16; + break; + } + } + + // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form. + if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) { + if (Src1.isImm() && !TII->isInlineConstant(Src1)) + Swap = false; + else if (Src0.isImm() && !TII->isInlineConstant(Src0)) + Swap = true; + else + return; + + switch (Opcode) { + default: + llvm_unreachable("Unexpected mad/fma opcode!"); + case AMDGPU::V_MAD_F32_e64: + NewOpcode = AMDGPU::V_MADMK_F32; + break; + case AMDGPU::V_FMA_F32_e64: + NewOpcode = AMDGPU::V_FMAMK_F32; + break; + case AMDGPU::V_MAD_F16_e64: + NewOpcode = AMDGPU::V_MADMK_F16; + break; + case AMDGPU::V_FMA_F16_e64: + NewOpcode = AMDGPU::V_FMAMK_F16; + break; + } + } + + if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) + return; + + if (Swap) { + // Swap Src0 and Src1 by building a new instruction. + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode), + MI.getOperand(0).getReg()) + .add(Src1) + .add(Src0) + .add(Src2) + .setMIFlags(MI.getFlags()); + MI.eraseFromParent(); + } else { + TII->removeModOperands(MI); + MI.setDesc(TII->get(NewOpcode)); } } @@ -311,10 +437,7 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { /// If the inverse of the immediate is legal, use ANDN2, ORN2 or /// XNOR (as a ^ b == ~(a ^ ~b)). /// \returns true if the caller should continue the machine function iterator -static bool shrinkScalarLogicOp(const GCNSubtarget &ST, - MachineRegisterInfo &MRI, - const SIInstrInfo *TII, - MachineInstr &MI) { +bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); const MachineOperand *Dest = &MI.getOperand(0); MachineOperand *Src0 = &MI.getOperand(1); @@ -323,7 +446,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, MachineOperand *SrcImm = Src1; if (!SrcImm->isImm() || - AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) + AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm())) return false; uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm()); @@ -333,7 +456,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, if (isPowerOf2_32(~Imm)) { NewImm = countTrailingOnes(Imm); Opc = AMDGPU::S_BITSET0_B32; - } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) { NewImm = ~Imm; Opc = AMDGPU::S_ANDN2_B32; } @@ -341,12 +464,12 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, if (isPowerOf2_32(Imm)) { NewImm = countTrailingZeros(Imm); Opc = AMDGPU::S_BITSET1_B32; - } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) { NewImm = ~Imm; Opc = AMDGPU::S_ORN2_B32; } } else if (Opc == AMDGPU::S_XOR_B32) { - if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) { NewImm = ~Imm; Opc = AMDGPU::S_XNOR_B32; } @@ -354,16 +477,10 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, llvm_unreachable("unexpected opcode"); } - if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) && - SrcImm == Src0) { - if (!TII->commuteInstruction(MI, false, 1, 2)) - NewImm = 0; - } - if (NewImm != 0) { if (Dest->getReg().isVirtual() && SrcReg->isReg()) { - MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); - MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); + MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); + MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); return true; } @@ -390,19 +507,19 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, // This is the same as MachineInstr::readsRegister/modifiesRegister except // it takes subregs into account. -static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, - Register Reg, unsigned SubReg, - const SIRegisterInfo &TRI) { +bool SIShrinkInstructions::instAccessReg( + iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg, + unsigned SubReg) const { for (const MachineOperand &MO : R) { if (!MO.isReg()) continue; if (Reg.isPhysical() && MO.getReg().isPhysical()) { - if (TRI.regsOverlap(Reg, MO.getReg())) + if (TRI->regsOverlap(Reg, MO.getReg())) return true; } else if (MO.getReg() == Reg && Reg.isVirtual()) { - LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) & - TRI.getSubRegIndexLaneMask(MO.getSubReg()); + LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) & + TRI->getSubRegIndexLaneMask(MO.getSubReg()); if (Overlap.any()) return true; } @@ -410,33 +527,31 @@ static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, return false; } -static bool instReadsReg(const MachineInstr *MI, - unsigned Reg, unsigned SubReg, - const SIRegisterInfo &TRI) { - return instAccessReg(MI->uses(), Reg, SubReg, TRI); +bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg, + unsigned SubReg) const { + return instAccessReg(MI->uses(), Reg, SubReg); } -static bool instModifiesReg(const MachineInstr *MI, - unsigned Reg, unsigned SubReg, - const SIRegisterInfo &TRI) { - return instAccessReg(MI->defs(), Reg, SubReg, TRI); +bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg, + unsigned SubReg) const { + return instAccessReg(MI->defs(), Reg, SubReg); } -static TargetInstrInfo::RegSubRegPair -getSubRegForIndex(Register Reg, unsigned Sub, unsigned I, - const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) { - if (TRI.getRegSizeInBits(Reg, MRI) != 32) { +TargetInstrInfo::RegSubRegPair +SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub, + unsigned I) const { + if (TRI->getRegSizeInBits(Reg, *MRI) != 32) { if (Reg.isPhysical()) { - Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I)); + Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I)); } else { - Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub)); + Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub)); } } return TargetInstrInfo::RegSubRegPair(Reg, Sub); } -static void dropInstructionKeepingImpDefs(MachineInstr &MI, - const SIInstrInfo *TII) { +void SIShrinkInstructions::dropInstructionKeepingImpDefs( + MachineInstr &MI) const { for (unsigned i = MI.getDesc().getNumOperands() + MI.getDesc().getNumImplicitUses() + MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands(); @@ -464,14 +579,13 @@ static void dropInstructionKeepingImpDefs(MachineInstr &MI, // Returns next valid instruction pointer if was able to create v_swap_b32. // // This shall not be done too early not to prevent possible folding which may -// remove matched moves, and this should prefereably be done before RA to +// remove matched moves, and this should preferably be done before RA to // release saved registers and also possibly after RA which can insert copies // too. // -// This is really just a generic peephole that is not a canocical shrinking, +// This is really just a generic peephole that is not a canonical shrinking, // although requirements match the pass placement and it reduces code size too. -static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, - const SIInstrInfo *TII) { +MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 || MovT.getOpcode() == AMDGPU::COPY); @@ -486,8 +600,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, unsigned Size = TII->getOpSize(MovT, 0) / 4; - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - if (!TRI.isVGPR(MRI, X)) + if (!TRI->isVGPR(*MRI, X)) return nullptr; if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0)) @@ -501,7 +614,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) { MachineInstr *MovY = &*Iter; - KilledT = MovY->killsRegister(T, &TRI); + KilledT = MovY->killsRegister(T, TRI); if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 && MovY->getOpcode() != AMDGPU::COPY) || @@ -514,21 +627,20 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, Register Y = MovY->getOperand(0).getReg(); unsigned Ysub = MovY->getOperand(0).getSubReg(); - if (!TRI.isVGPR(MRI, Y)) + if (!TRI->isVGPR(*MRI, Y)) continue; MachineInstr *MovX = nullptr; for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator()); I != IY; ++I) { - if (instReadsReg(&*I, X, Xsub, TRI) || - instModifiesReg(&*I, Y, Ysub, TRI) || - instModifiesReg(&*I, T, Tsub, TRI) || - (MovX && instModifiesReg(&*I, X, Xsub, TRI))) { + if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) || + instModifiesReg(&*I, T, Tsub) || + (MovX && instModifiesReg(&*I, X, Xsub))) { MovX = nullptr; break; } - if (!instReadsReg(&*I, Y, Ysub, TRI)) { - if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) { + if (!instReadsReg(&*I, Y, Ysub)) { + if (!MovX && instModifiesReg(&*I, X, Xsub)) { MovX = nullptr; break; } @@ -559,8 +671,8 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, for (unsigned I = 0; I < Size; ++I) { TargetInstrInfo::RegSubRegPair X1, Y1; - X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI); - Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI); + X1 = getSubRegForIndex(X, Xsub, I); + Y1 = getSubRegForIndex(Y, Ysub, I); MachineBasicBlock &MBB = *MovT.getParent(); auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(), TII->get(AMDGPU::V_SWAP_B32)) @@ -570,23 +682,23 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, .addReg(X1.Reg, 0, X1.SubReg).getInstr(); if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { // Drop implicit EXEC. - MIB->RemoveOperand(MIB->getNumExplicitOperands()); + MIB->removeOperand(MIB->getNumExplicitOperands()); MIB->copyImplicitOps(*MBB.getParent(), *MovX); } } MovX->eraseFromParent(); - dropInstructionKeepingImpDefs(*MovY, TII); + dropInstructionKeepingImpDefs(*MovY); MachineInstr *Next = &*std::next(MovT.getIterator()); - if (T.isVirtual() && MRI.use_nodbg_empty(T)) { - dropInstructionKeepingImpDefs(MovT, TII); + if (T.isVirtual() && MRI->use_nodbg_empty(T)) { + dropInstructionKeepingImpDefs(MovT); } else { Xop.setIsKill(false); for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I ) { unsigned OpNo = MovT.getNumExplicitOperands() + I; const MachineOperand &Op = MovT.getOperand(OpNo); - if (Op.isKill() && TRI.regsOverlap(X, Op.getReg())) - MovT.RemoveOperand(OpNo); + if (Op.isKill() && TRI->regsOverlap(X, Op.getReg())) + MovT.removeOperand(OpNo); } } @@ -596,14 +708,32 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, return nullptr; } +// If an instruction has dead sdst replace it with NULL register on gfx1030+ +bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const { + if (!ST->hasGFX10_3Insts()) + return false; + + MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + if (!Op) + return false; + Register SDstReg = Op->getReg(); + if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg)) + return false; + + Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64); + return true; +} + bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - MachineRegisterInfo &MRI = MF.getRegInfo(); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; + MRI = &MF.getRegInfo(); + ST = &MF.getSubtarget<GCNSubtarget>(); + TII = ST->getInstrInfo(); + TRI = &TII->getRegisterInfo(); + + unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; std::vector<unsigned> I1Defs; @@ -628,7 +758,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineOperand &Src = MI.getOperand(1); if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) { int32_t ReverseImm; - if (isReverseInlineImm(TII, Src, ReverseImm)) { + if (isReverseInlineImm(Src, ReverseImm)) { MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); Src.setImm(ReverseImm); continue; @@ -636,19 +766,15 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { } } - if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || - MI.getOpcode() == AMDGPU::COPY)) { - if (auto *NextMI = matchSwap(MI, MRI, TII)) { + if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || + MI.getOpcode() == AMDGPU::COPY)) { + if (auto *NextMI = matchSwap(MI)) { Next = NextMI->getIterator(); continue; } } - // FIXME: We also need to consider movs of constant operands since - // immediate operands are not folded if they have more than one use, and - // the operand folding pass is unaware if the immediate will be free since - // it won't know if the src == dest constraint will end up being - // satisfied. + // Try to use S_ADDK_I32 and S_MULK_I32. if (MI.getOpcode() == AMDGPU::S_ADD_I32 || MI.getOpcode() == AMDGPU::S_MUL_I32) { const MachineOperand *Dest = &MI.getOperand(0); @@ -664,13 +790,13 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // we have a vector add of a constant, we usually don't get the correct // allocation due to the subregister usage. if (Dest->getReg().isVirtual() && Src0->isReg()) { - MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg()); - MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); + MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg()); + MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); continue; } if (Src0->isReg() && Src0->getReg() == Dest->getReg()) { - if (Src1->isImm() && isKImmOperand(TII, *Src1)) { + if (Src1->isImm() && isKImmOperand(*Src1)) { unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; @@ -682,7 +808,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // Try to use s_cmpk_* if (MI.isCompare() && TII->isSOPC(MI)) { - shrinkScalarCompare(TII, MI); + shrinkScalarCompare(MI); continue; } @@ -693,9 +819,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (Src.isImm() && Dst.getReg().isPhysical()) { int32_t ReverseImm; - if (isKImmOperand(TII, Src)) + if (isKImmOperand(Src)) MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); - else if (isReverseInlineImm(TII, Src, ReverseImm)) { + else if (isReverseInlineImm(Src, ReverseImm)) { MI.setDesc(TII->get(AMDGPU::S_BREV_B32)); Src.setImm(ReverseImm); } @@ -708,47 +834,70 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (MI.getOpcode() == AMDGPU::S_AND_B32 || MI.getOpcode() == AMDGPU::S_OR_B32 || MI.getOpcode() == AMDGPU::S_XOR_B32) { - if (shrinkScalarLogicOp(ST, MRI, TII, MI)) + if (shrinkScalarLogicOp(MI)) continue; } if (TII->isMIMG(MI.getOpcode()) && - ST.getGeneration() >= AMDGPUSubtarget::GFX10 && + ST->getGeneration() >= AMDGPUSubtarget::GFX10 && MF.getProperties().hasProperty( MachineFunctionProperties::Property::NoVRegs)) { shrinkMIMG(MI); continue; } - if (!TII->hasVALU32BitEncoding(MI.getOpcode())) + if (!TII->isVOP3(MI)) + continue; + + if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 || + MI.getOpcode() == AMDGPU::V_FMA_F32_e64 || + MI.getOpcode() == AMDGPU::V_MAD_F16_e64 || + MI.getOpcode() == AMDGPU::V_FMA_F16_e64) { + shrinkMadFma(MI); continue; + } + + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) { + // If there is no chance we will shrink it and use VCC as sdst to get + // a 32 bit form try to replace dead sdst with NULL. + tryReplaceDeadSDST(MI); + continue; + } - if (!TII->canShrink(MI, MRI)) { + if (!TII->canShrink(MI, *MRI)) { // Try commuting the instruction and see if that enables us to shrink // it. if (!MI.isCommutable() || !TII->commuteInstruction(MI) || - !TII->canShrink(MI, MRI)) + !TII->canShrink(MI, *MRI)) { + tryReplaceDeadSDST(MI); continue; + } } int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); if (TII->isVOPC(Op32)) { - Register DstReg = MI.getOperand(0).getReg(); - if (DstReg.isVirtual()) { - // VOPC instructions can only write to the VCC register. We can't - // force them to use VCC here, because this is only one register and - // cannot deal with sequences which would require multiple copies of - // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) - // - // So, instead of forcing the instruction to write to VCC, we provide - // a hint to the register allocator to use VCC and then we will run - // this pass again after RA and shrink it if it outputs to VCC. - MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg); - continue; + MachineOperand &Op0 = MI.getOperand(0); + if (Op0.isReg()) { + // Exclude VOPCX instructions as these don't explicitly write a + // dst. + Register DstReg = Op0.getReg(); + if (DstReg.isVirtual()) { + // VOPC instructions can only write to the VCC register. We can't + // force them to use VCC here, because this is only one register and + // cannot deal with sequences which would require multiple copies of + // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) + // + // So, instead of forcing the instruction to write to VCC, we + // provide a hint to the register allocator to use VCC and then we + // will run this pass again after RA and shrink it if it outputs to + // VCC. + MRI->setRegAllocationHint(DstReg, 0, VCCReg); + continue; + } + if (DstReg != VCCReg) + continue; } - if (DstReg != VCCReg) - continue; } if (Op32 == AMDGPU::V_CNDMASK_B32_e32) { @@ -760,7 +909,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; Register SReg = Src2->getReg(); if (SReg.isVirtual()) { - MRI.setRegAllocationHint(SReg, 0, VCCReg); + MRI->setRegAllocationHint(SReg, 0, VCCReg); continue; } if (SReg != VCCReg) @@ -776,7 +925,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (SDst->getReg() != VCCReg) { if (SDst->getReg().isVirtual()) - MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg); + MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg); Next = true; } @@ -786,7 +935,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { AMDGPU::OpName::src2); if (Src2 && Src2->getReg() != VCCReg) { if (Src2->getReg().isVirtual()) - MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg); + MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg); Next = true; } @@ -801,14 +950,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { ++NumInstructionsShrunk; // Copy extra operands not present in the instruction definition. - copyExtraImplicitOps(*Inst32, MF, MI); + copyExtraImplicitOps(*Inst32, MI); // Copy deadness from the old explicit vcc def to the new implicit def. if (SDst && SDst->isDead()) Inst32->findRegisterDefOperand(VCCReg)->setIsDead(); MI.eraseFromParent(); - foldImmediates(*Inst32, TII, MRI); + foldImmediates(*Inst32); LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); } |
