diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 436 |
1 files changed, 352 insertions, 84 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 747f9fe2f8ae..d24c7da964ce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -21,6 +21,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -71,6 +72,13 @@ void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB, InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI); } +// Return the wave level SGPR base address if this is a wave address. +static Register getWaveAddress(const MachineInstr *Def) { + return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS + ? Def->getOperand(1).getReg() + : Register(); +} + bool AMDGPUInstructionSelector::isVCC(Register Reg, const MachineRegisterInfo &MRI) const { // The verifier is oblivious to s1 being a valid value for wavesize registers. @@ -158,11 +166,15 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { // TODO: Skip masking high bits if def is known boolean. + bool IsSGPR = TRI.isSGPRClass(SrcRC); unsigned AndOpc = - TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32; - BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg) + IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32; + auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg) .addImm(1) .addReg(SrcReg); + if (IsSGPR) + And.setOperandDead(3); // Dead scc + BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg) .addImm(0) .addReg(MaskedReg); @@ -322,7 +334,8 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { MachineInstr *Add = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) .add(I.getOperand(1)) - .add(I.getOperand(2)); + .add(I.getOperand(2)) + .setOperandDead(3); // Dead scc I.eraseFromParent(); return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); } @@ -369,7 +382,8 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { .add(Lo2); BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) .add(Hi1) - .add(Hi2); + .add(Hi2) + .setOperandDead(3); // Dead scc } else { const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); Register CarryReg = MRI->createVirtualRegister(CarryRC); @@ -436,14 +450,18 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg) + auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg) .add(I.getOperand(2)) .add(I.getOperand(3)); - BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) - .addReg(AMDGPU::SCC); - if (!MRI->getRegClassOrNull(Dst1Reg)) - MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); + if (MRI->use_nodbg_empty(Dst1Reg)) { + CarryInst.setOperandDead(3); // Dead scc + } else { + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) + .addReg(AMDGPU::SCC); + if (!MRI->getRegClassOrNull(Dst1Reg)) + MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); + } if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) || !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) || @@ -740,7 +758,8 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const { // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) .addReg(ShiftSrc0) - .addImm(16); + .addImm(16) + .setOperandDead(3); // Dead scc MI.eraseFromParent(); return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); @@ -1001,7 +1020,7 @@ bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const { } bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { - unsigned IntrinsicID = I.getIntrinsicID(); + unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); switch (IntrinsicID) { case Intrinsic::amdgcn_if_break: { MachineBasicBlock *BB = I.getParent(); @@ -1192,36 +1211,104 @@ int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, } } - if (Size != 32) - return -1; + if (Size == 32) { + switch (P) { + case CmpInst::ICMP_NE: + return AMDGPU::S_CMP_LG_U32; + case CmpInst::ICMP_EQ: + return AMDGPU::S_CMP_EQ_U32; + case CmpInst::ICMP_SGT: + return AMDGPU::S_CMP_GT_I32; + case CmpInst::ICMP_SGE: + return AMDGPU::S_CMP_GE_I32; + case CmpInst::ICMP_SLT: + return AMDGPU::S_CMP_LT_I32; + case CmpInst::ICMP_SLE: + return AMDGPU::S_CMP_LE_I32; + case CmpInst::ICMP_UGT: + return AMDGPU::S_CMP_GT_U32; + case CmpInst::ICMP_UGE: + return AMDGPU::S_CMP_GE_U32; + case CmpInst::ICMP_ULT: + return AMDGPU::S_CMP_LT_U32; + case CmpInst::ICMP_ULE: + return AMDGPU::S_CMP_LE_U32; + case CmpInst::FCMP_OEQ: + return AMDGPU::S_CMP_EQ_F32; + case CmpInst::FCMP_OGT: + return AMDGPU::S_CMP_GT_F32; + case CmpInst::FCMP_OGE: + return AMDGPU::S_CMP_GE_F32; + case CmpInst::FCMP_OLT: + return AMDGPU::S_CMP_LT_F32; + case CmpInst::FCMP_OLE: + return AMDGPU::S_CMP_LE_F32; + case CmpInst::FCMP_ONE: + return AMDGPU::S_CMP_LG_F32; + case CmpInst::FCMP_ORD: + return AMDGPU::S_CMP_O_F32; + case CmpInst::FCMP_UNO: + return AMDGPU::S_CMP_U_F32; + case CmpInst::FCMP_UEQ: + return AMDGPU::S_CMP_NLG_F32; + case CmpInst::FCMP_UGT: + return AMDGPU::S_CMP_NLE_F32; + case CmpInst::FCMP_UGE: + return AMDGPU::S_CMP_NLT_F32; + case CmpInst::FCMP_ULT: + return AMDGPU::S_CMP_NGE_F32; + case CmpInst::FCMP_ULE: + return AMDGPU::S_CMP_NGT_F32; + case CmpInst::FCMP_UNE: + return AMDGPU::S_CMP_NEQ_F32; + default: + llvm_unreachable("Unknown condition code!"); + } + } - switch (P) { - case CmpInst::ICMP_NE: - return AMDGPU::S_CMP_LG_U32; - case CmpInst::ICMP_EQ: - return AMDGPU::S_CMP_EQ_U32; - case CmpInst::ICMP_SGT: - return AMDGPU::S_CMP_GT_I32; - case CmpInst::ICMP_SGE: - return AMDGPU::S_CMP_GE_I32; - case CmpInst::ICMP_SLT: - return AMDGPU::S_CMP_LT_I32; - case CmpInst::ICMP_SLE: - return AMDGPU::S_CMP_LE_I32; - case CmpInst::ICMP_UGT: - return AMDGPU::S_CMP_GT_U32; - case CmpInst::ICMP_UGE: - return AMDGPU::S_CMP_GE_U32; - case CmpInst::ICMP_ULT: - return AMDGPU::S_CMP_LT_U32; - case CmpInst::ICMP_ULE: - return AMDGPU::S_CMP_LE_U32; - default: - llvm_unreachable("Unknown condition code!"); + if (Size == 16) { + if (!STI.hasSALUFloatInsts()) + return -1; + + switch (P) { + case CmpInst::FCMP_OEQ: + return AMDGPU::S_CMP_EQ_F16; + case CmpInst::FCMP_OGT: + return AMDGPU::S_CMP_GT_F16; + case CmpInst::FCMP_OGE: + return AMDGPU::S_CMP_GE_F16; + case CmpInst::FCMP_OLT: + return AMDGPU::S_CMP_LT_F16; + case CmpInst::FCMP_OLE: + return AMDGPU::S_CMP_LE_F16; + case CmpInst::FCMP_ONE: + return AMDGPU::S_CMP_LG_F16; + case CmpInst::FCMP_ORD: + return AMDGPU::S_CMP_O_F16; + case CmpInst::FCMP_UNO: + return AMDGPU::S_CMP_U_F16; + case CmpInst::FCMP_UEQ: + return AMDGPU::S_CMP_NLG_F16; + case CmpInst::FCMP_UGT: + return AMDGPU::S_CMP_NLE_F16; + case CmpInst::FCMP_UGE: + return AMDGPU::S_CMP_NLT_F16; + case CmpInst::FCMP_ULT: + return AMDGPU::S_CMP_NGE_F16; + case CmpInst::FCMP_ULE: + return AMDGPU::S_CMP_NGT_F16; + case CmpInst::FCMP_UNE: + return AMDGPU::S_CMP_NEQ_F16; + default: + llvm_unreachable("Unknown condition code!"); + } } + + return -1; } -bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { +bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); @@ -1247,6 +1334,9 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { return Ret; } + if (I.getOpcode() == AMDGPU::G_FCMP) + return false; + int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget); if (Opcode == -1) return false; @@ -1569,8 +1659,8 @@ static unsigned gwsIntrinToOpcode(unsigned IntrID) { bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const { - if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && - !STI.hasGWSSemaReleaseAll()) + if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && + !STI.hasGWSSemaReleaseAll())) return false; // intrinsic ID, vsrc, offset @@ -1629,7 +1719,8 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) .addReg(BaseOffset) - .addImm(16); + .addImm(16) + .setOperandDead(3); // Dead scc BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) .addReg(M0Base); @@ -1690,7 +1781,7 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, } bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const { - if (TM.getOptLevel() > CodeGenOpt::None) { + if (TM.getOptLevel() > CodeGenOptLevel::None) { unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second; if (WGSize <= STI.getWavefrontSize()) { MachineBasicBlock *MBB = MI.getParent(); @@ -2008,7 +2099,7 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic( bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( MachineInstr &I) const { - unsigned IntrinsicID = I.getIntrinsicID(); + unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); switch (IntrinsicID) { case Intrinsic::amdgcn_end_cf: return selectEndCfIntrinsic(I); @@ -2194,7 +2285,8 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { } else { BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) .addReg(HiReg) - .addImm(16); + .addImm(16) + .setOperandDead(3); // Dead scc } unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; @@ -2203,12 +2295,17 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) .addImm(0xffff); - BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) + auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) .addReg(LoReg) .addReg(ImmReg); - BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) + auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) .addReg(TmpReg0) .addReg(TmpReg1); + + if (!IsVALU) { + And.setOperandDead(3); // Dead scc + Or.setOperandDead(3); // Dead scc + } } I.eraseFromParent(); @@ -2353,7 +2450,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { if (Signed) { BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg) .addReg(SrcReg, 0, SubReg) - .addImm(31); + .addImm(31) + .setOperandDead(3); // Dead scc } else { BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) .addImm(0); @@ -2397,7 +2495,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { if (!Signed && shouldUseAndMask(SrcSize, Mask)) { BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) .addReg(SrcReg) - .addImm(Mask); + .addImm(Mask) + .setOperandDead(3); // Dead scc } else { BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) .addReg(SrcReg) @@ -2411,16 +2510,54 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { return false; } +static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, + Register &Out) { + Register LShlSrc; + if (mi_match(In, MRI, + m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) { + Out = LShlSrc; + return true; + } + return false; +} + +bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const { + if (!Subtarget->hasSALUFloatInsts()) + return false; + + Register Dst = I.getOperand(0).getReg(); + const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); + if (DstRB->getID() != AMDGPU::SGPRRegBankID) + return false; + + Register Src = I.getOperand(1).getReg(); + + if (MRI->getType(Dst) == LLT::scalar(32) && + MRI->getType(Src) == LLT::scalar(16)) { + if (isExtractHiElt(*MRI, Src, Src)) { + MachineBasicBlock *BB = I.getParent(); + BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst) + .addUse(Src); + I.eraseFromParent(); + return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI); + } + } + + return false; +} + bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineOperand &ImmOp = I.getOperand(1); Register DstReg = I.getOperand(0).getReg(); unsigned Size = MRI->getType(DstReg).getSizeInBits(); + bool IsFP = false; // The AMDGPU backend only supports Imm operands and not CImm or FPImm. if (ImmOp.isFPImm()) { const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); ImmOp.ChangeToImmediate(Imm.getZExtValue()); + IsFP = true; } else if (ImmOp.isCImm()) { ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); } else { @@ -2433,6 +2570,12 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { unsigned Opcode; if (DstRB->getID() == AMDGPU::VCCRegBankID) { Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + } else if (Size == 64 && + AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) { + Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO; + I.setDesc(TII.get(Opcode)); + I.addImplicitDefUseOperands(*MF); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } else { Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; @@ -2531,7 +2674,8 @@ bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32; BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg) .addReg(HiReg) - .addReg(ConstReg); + .addReg(ConstReg) + .setOperandDead(3); // Dead scc BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) .addReg(LoReg) .addImm(AMDGPU::sub0) @@ -2572,7 +2716,8 @@ bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const { // TODO: Should this used S_BITSET0_*? BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg) .addReg(HiReg) - .addReg(ConstReg); + .addReg(ConstReg) + .setOperandDead(3); // Dead scc BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) .addReg(LoReg) .addImm(AMDGPU::sub0) @@ -2689,8 +2834,8 @@ static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) { return isVCmpResult(MI.getOperand(1).getReg(), MRI) && isVCmpResult(MI.getOperand(2).getReg(), MRI); - if (Opcode == TargetOpcode::G_INTRINSIC) - return MI.getIntrinsicID() == Intrinsic::amdgcn_class; + if (auto *GI = dyn_cast<GIntrinsic>(&MI)) + return GI->is(Intrinsic::amdgcn_class); return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP; } @@ -2730,7 +2875,8 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC()); BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg) .addReg(CondReg) - .addReg(Exec); + .addReg(Exec) + .setOperandDead(3); // Dead scc CondReg = TmpReg; } @@ -2793,7 +2939,8 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { !CanCopyLow32 && !CanCopyHi32) { auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg) .addReg(SrcReg) - .addReg(MaskReg); + .addReg(MaskReg) + .setOperandDead(3); // Dead scc I.eraseFromParent(); return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } @@ -2816,9 +2963,12 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { assert(MaskTy.getSizeInBits() == 32 && "ptrmask should have been narrowed during legalize"); - BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) + auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) .addReg(SrcReg) .addReg(MaskReg); + + if (!IsVGPR) + NewOp.setOperandDead(3); // Dead scc I.eraseFromParent(); return true; } @@ -3252,7 +3402,7 @@ bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { unsigned Opc; - switch (MI.getIntrinsicID()) { + switch (cast<GIntrinsic>(MI).getIntrinsicID()) { case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64; break; @@ -3324,7 +3474,8 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { } else { BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) .addReg(SrcReg) - .addImm(Subtarget->getWavefrontSizeLog2()); + .addImm(Subtarget->getWavefrontSizeLog2()) + .setOperandDead(3); // Dead scc } const TargetRegisterClass &RC = @@ -3336,6 +3487,33 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { return true; } +bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const { + Register SrcReg = MI.getOperand(0).getReg(); + if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) + return false; + + MachineInstr *DefMI = MRI->getVRegDef(SrcReg); + Register SP = + Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore(); + Register WaveAddr = getWaveAddress(DefMI); + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + if (!WaveAddr) { + WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr) + .addReg(SrcReg) + .addImm(Subtarget->getWavefrontSizeLog2()) + .setOperandDead(3); // Dead scc + } + + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP) + .addReg(WaveAddr); + + MI.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::select(MachineInstr &I) { if (I.isPHI()) return selectPHI(I); @@ -3402,11 +3580,14 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_INSERT: return selectG_INSERT(I); case TargetOpcode::G_INTRINSIC: + case TargetOpcode::G_INTRINSIC_CONVERGENT: return selectG_INTRINSIC(I); case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: return selectG_INTRINSIC_W_SIDE_EFFECTS(I); case TargetOpcode::G_ICMP: - if (selectG_ICMP(I)) + case TargetOpcode::G_FCMP: + if (selectG_ICMP_or_FCMP(I)) return true; return selectImpl(I, *CoverageInfo); case TargetOpcode::G_LOAD: @@ -3443,6 +3624,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { selectImpl(I, *CoverageInfo)) return true; return selectG_SZA_EXT(I); + case TargetOpcode::G_FPEXT: + if (selectG_FPEXT(I)) + return true; + return selectImpl(I, *CoverageInfo); case TargetOpcode::G_BRCOND: return selectG_BRCOND(I); case TargetOpcode::G_GLOBAL_VALUE: @@ -3457,8 +3642,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { - const AMDGPU::ImageDimIntrinsicInfo *Intr - = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); + const AMDGPU::ImageDimIntrinsicInfo *Intr = + AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I)); assert(Intr && "not an image intrinsic with image pseudo"); return selectImageIntrinsic(I, Intr); } @@ -3472,6 +3657,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { return true; case AMDGPU::G_AMDGPU_WAVE_ADDRESS: return selectWaveAddress(I); + case AMDGPU::G_STACKRESTORE: + return selectStackRestore(I); default: return selectImpl(I, *CoverageInfo); } @@ -3916,7 +4103,9 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root, int64_t ConstOffset; std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Root.getReg(), *MRI); - if (ConstOffset == 0 || !isFlatScratchBaseLegal(PtrBase, FlatVariant)) + + if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch && + !isFlatScratchBaseLegal(Root.getReg()))) return Default; unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); @@ -4079,7 +4268,7 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { // possible. std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); - if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) && + if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) && TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch)) { Addr = PtrBase; @@ -4113,7 +4302,8 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr) .addFrameIndex(FI) - .addReg(RHSDef->Reg); + .addReg(RHSDef->Reg) + .setOperandDead(3); // Dead scc } } @@ -4155,6 +4345,7 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { // possible. std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + Register OrigAddr = Addr; if (ConstOffset != 0 && TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) { Addr = PtrBase; @@ -4172,8 +4363,13 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { Register LHS = AddrDef->MI->getOperand(1).getReg(); auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); - if (!isFlatScratchBaseLegal(LHS) || !isFlatScratchBaseLegal(RHS)) - return std::nullopt; + if (OrigAddr != Addr) { + if (!isFlatScratchBaseLegalSVImm(OrigAddr)) + return std::nullopt; + } else { + if (!isFlatScratchBaseLegalSV(OrigAddr)) + return std::nullopt; + } if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) return std::nullopt; @@ -4306,14 +4502,78 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0, return KB->signBitIsZero(Base); } -bool AMDGPUInstructionSelector::isFlatScratchBaseLegal( - Register Base, uint64_t FlatVariant) const { - if (FlatVariant != SIInstrFlags::FlatScratch) +// Return whether the operation has NoUnsignedWrap property. +static bool isNoUnsignedWrap(MachineInstr *Addr) { + return Addr->getOpcode() == TargetOpcode::G_OR || + (Addr->getOpcode() == TargetOpcode::G_PTR_ADD && + Addr->getFlag(MachineInstr::NoUWrap)); +} + +// Check that the base address of flat scratch load/store in the form of `base + +// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware +// requirement). We always treat the first operand as the base address here. +bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const { + MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); + + if (isNoUnsignedWrap(AddrMI)) return true; - // When value in 32-bit Base can be negative calculate scratch offset using - // 32-bit add instruction, otherwise use Base(unsigned) + offset. - return KB->signBitIsZero(Base); + Register LHS = AddrMI->getOperand(1).getReg(); + Register RHS = AddrMI->getOperand(2).getReg(); + + if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) { + std::optional<ValueAndVReg> RhsValReg = + getIConstantVRegValWithLookThrough(RHS, *MRI); + // If the immediate offset is negative and within certain range, the base + // address cannot also be negative. If the base is also negative, the sum + // would be either negative or much larger than the valid range of scratch + // memory a thread can access. + if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 && + RhsValReg->Value.getSExtValue() > -0x40000000) + return true; + } + + return KB->signBitIsZero(LHS); +} + +// Check address value in SGPR/VGPR are legal for flat scratch in the form +// of: SGPR + VGPR. +bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const { + MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); + + if (isNoUnsignedWrap(AddrMI)) + return true; + + Register LHS = AddrMI->getOperand(1).getReg(); + Register RHS = AddrMI->getOperand(2).getReg(); + return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS); +} + +// Check address value in SGPR/VGPR are legal for flat scratch in the form +// of: SGPR + VGPR + Imm. +bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm( + Register Addr) const { + MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); + Register Base = AddrMI->getOperand(1).getReg(); + std::optional<DefinitionAndSourceRegister> BaseDef = + getDefSrcRegIgnoringCopies(Base, *MRI); + std::optional<ValueAndVReg> RHSOffset = + getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI); + assert(RHSOffset); + + // If the immediate offset is negative and within certain range, the base + // address cannot also be negative. If the base is also negative, the sum + // would be either negative or much larger than the valid range of scratch + // memory a thread can access. + if (isNoUnsignedWrap(BaseDef->MI) && + (isNoUnsignedWrap(AddrMI) || + (RHSOffset->Value.getSExtValue() < 0 && + RHSOffset->Value.getSExtValue() > -0x40000000))) + return true; + + Register LHS = BaseDef->MI->getOperand(1).getReg(); + Register RHS = BaseDef->MI->getOperand(2).getReg(); + return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS); } bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, @@ -4332,21 +4592,18 @@ bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits; } -// Return the wave level SGPR base address if this is a wave address. -static Register getWaveAddress(const MachineInstr *Def) { - return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS - ? Def->getOperand(1).getReg() - : Register(); -} - InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectMUBUFScratchOffset( MachineOperand &Root) const { Register Reg = Root.getReg(); const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); - const MachineInstr *Def = MRI->getVRegDef(Reg); - if (Register WaveBase = getWaveAddress(Def)) { + std::optional<DefinitionAndSourceRegister> Def = + getDefSrcRegIgnoringCopies(Reg, *MRI); + assert(Def && "this shouldn't be an optional result"); + Reg = Def->Reg; + + if (Register WaveBase = getWaveAddress(Def->MI)) { return {{ [=](MachineInstrBuilder &MIB) { // rsrc MIB.addReg(Info->getScratchRSrcReg()); @@ -4362,10 +4619,12 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset( // FIXME: Copy check is a hack Register BasePtr; - if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) { + if (mi_match(Reg, *MRI, + m_GPtrAdd(m_Reg(BasePtr), + m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) { if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset)) return {}; - const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr); + MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI); Register WaveBase = getWaveAddress(BasePtrDef); if (!WaveBase) return {}; @@ -4818,8 +5077,8 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const { // an immediate offset. Register SOffset; unsigned Offset; - std::tie(SOffset, Offset) = - AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg(), KB); + std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset( + *MRI, Root.getReg(), KB, /*CheckNUW*/ true); if (!SOffset) return std::nullopt; @@ -5057,7 +5316,16 @@ void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB, void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { - MIB.addFrameIndex((MI.getOperand(1).getIndex())); + MIB.addFrameIndex(MI.getOperand(1).getIndex()); +} + +void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF(); + int ExpVal = APF.getExactLog2Abs(); + assert(ExpVal != INT_MIN); + MIB.addImm(ExpVal); } bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { |
