Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp  770
1 file changed, 572 insertions(+), 198 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index b7d0f0580cda..3f242fdb6d8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -23,6 +23,7 @@
 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -80,8 +81,11 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
   if (RC) {
     const LLT Ty = MRI.getType(Reg);
-    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
-           Ty.isValid() && Ty.getSizeInBits() == 1;
+    if (!Ty.isValid() || Ty.getSizeInBits() != 1)
+      return false;
+    // G_TRUNC s1 result is never vcc.
+    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
+           RC->hasSuperClassEq(TRI.getBoolRC());
   }

   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
@@ -91,7 +95,7 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                         unsigned NewOpc) const {
   MI.setDesc(TII.get(NewOpc));
-  MI.RemoveOperand(1); // Remove intrinsic ID.
+  MI.removeOperand(1); // Remove intrinsic ID.
   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

   MachineOperand &Dst = MI.getOperand(0);
@@ -216,7 +220,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
     }

     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
-    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
+    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
     if (!DefRC) {
       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
       return false;
@@ -454,6 +458,24 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
   return true;
 }

+bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
+    MachineInstr &I) const {
+  MachineBasicBlock *BB = I.getParent();
+  MachineFunction *MF = BB->getParent();
+  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
+
+  unsigned Opc;
+  if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11)
+    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
+                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
+  else
+    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
+  I.setDesc(TII.get(Opc));
+  I.addOperand(*MF, MachineOperand::CreateImm(0));
+  I.addImplicitDefUseOperands(*MF);
+  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
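
For reference, V_MAD_U64_U32/V_MAD_I64_I32 compute a full 32x32-bit multiply plus a 64-bit addend. A minimal semantic sketch of the unsigned form (illustrative, not part of the patch):

    #include <cstdint>

    // Reference semantics of V_MAD_U64_U32: 64-bit product of two 32-bit
    // operands, plus a 64-bit addend. The signed form sign-extends instead.
    static uint64_t madU64U32(uint32_t A, uint32_t B, uint64_t C) {
      return (uint64_t)A * (uint64_t)B + C;
    }
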
 // TODO: We should probably legalize these to only using 32-bit results.
 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
   MachineBasicBlock *BB = I.getParent();
@@ -481,7 +503,7 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
   const TargetRegisterClass *SrcRC =
-      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
+      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
   if (!SrcRC)
     return false;

   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
@@ -514,7 +536,7 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
   const unsigned DstSize = DstTy.getSizeInBits();
   const TargetRegisterClass *DstRC =
-      TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
+      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
   if (!DstRC)
     return false;
@@ -556,7 +578,7 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
   const TargetRegisterClass *SrcRC =
-      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
+      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
     return false;
@@ -630,7 +652,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
   if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
     MI.setDesc(TII.get(AMDGPU::COPY));
-    MI.RemoveOperand(2);
+    MI.removeOperand(2);
     return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
            RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
   }
@@ -643,6 +665,8 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
   //
   // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
   //  => (S_PACK_HH_B32_B16 $src0, $src1)
+  // (build_vector_trunc (lshr_oneuse SReg_32:$src0, 16), $src1)
+  //  => (S_PACK_HL_B32_B16 $src0, $src1)
   // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
   //  => (S_PACK_LH_B32_B16 $src0, $src1)
   // (build_vector_trunc $src0, $src1)
@@ -662,14 +686,20 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
   } else if (Shift1) {
     Opc = AMDGPU::S_PACK_LH_B32_B16;
     MI.getOperand(2).setReg(ShiftSrc1);
-  } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
-    // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
-    auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
-                   .addReg(ShiftSrc0)
-                   .addImm(16);
+  } else if (Shift0) {
+    if (ConstSrc1 && ConstSrc1->Value == 0) {
+      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
+      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
+                     .addReg(ShiftSrc0)
+                     .addImm(16);

-    MI.eraseFromParent();
-    return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+      MI.eraseFromParent();
+      return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+    }
+    if (STI.hasSPackHL()) {
+      Opc = AMDGPU::S_PACK_HL_B32_B16;
+      MI.getOperand(1).setReg(ShiftSrc0);
+    }
   }

   MI.setDesc(TII.get(Opc));
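
The S_PACK_{LL,LH,HL,HH}_B32_B16 opcodes matched above each take one 16-bit half from each source. A reference sketch of the semantics (illustrative):

    #include <cstdint>

    // S_PACK_xy_B32_B16: x selects the half of src0 that becomes the low 16
    // bits of the result, y the half of src1 that becomes the high 16 bits
    // (L = low half, H = high half).
    static uint32_t sPack(uint32_t Src0, uint32_t Src1, bool Hi0, bool Hi1) {
      uint16_t Lo = Hi0 ? uint16_t(Src0 >> 16) : uint16_t(Src0);
      uint16_t Hi = Hi1 ? uint16_t(Src1 >> 16) : uint16_t(Src1);
      return uint32_t(Lo) | (uint32_t(Hi) << 16);
    }
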
@@ -722,16 +752,16 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
   const TargetRegisterClass *DstRC =
-      TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
+      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
   if (!DstRC)
     return false;

   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
   const TargetRegisterClass *Src0RC =
-      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
+      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
   const TargetRegisterClass *Src1RC =
-      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
+      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

   // Deal with weird cases where the class only partially supports the subreg
   // index.
@@ -970,6 +1000,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
     return selectGroupStaticSize(I);
   case Intrinsic::returnaddress:
     return selectReturnAddress(I);
+  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
+  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
+  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
+  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
+  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
+  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+    return selectSMFMACIntrin(I);
   default:
     return selectImpl(I, *CoverageInfo);
   }
@@ -1142,7 +1179,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
   Optional<ValueAndVReg> Arg =
       getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);

-  if (Arg.hasValue()) {
+  if (Arg) {
     const int64_t Value = Arg.getValue().Value.getSExtValue();
     if (Value == 0) {
       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
@@ -1164,8 +1201,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
   Register DstReg = I.getOperand(0).getReg();
   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
-  const TargetRegisterClass *DstRC =
-      TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
+  const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
     return false;
@@ -1300,12 +1336,14 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
   unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
   unsigned Offset0 = OrderedCountIndex << 2;
-  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
-                     (Instruction << 4);
+  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);

   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
     Offset1 |= (CountDw - 1) << 6;

+  if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
+    Offset1 |= ShaderType << 2;
+
   unsigned Offset = Offset0 | (Offset1 << 8);

   Register M0Val = MI.getOperand(2).getReg();
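
The ds_ordered_count offset word assembled above packs several fields into offset0/offset1. A standalone restatement under the same generation checks (hypothetical helper, mirrors the hunk):

    // offset1 bits: [0] wave-release, [1] wave-done, [3:2] shader type
    // (pre-GFX11 only), [5:4] instruction, [7:6] dword count - 1 (GFX10+).
    static unsigned packOrderedOffset(unsigned OrderedCountIndex,
                                      bool WaveRelease, bool WaveDone,
                                      unsigned ShaderType, unsigned Instruction,
                                      unsigned CountDw, bool IsGFX10Plus,
                                      bool IsGFX11Plus) {
      unsigned Offset0 = OrderedCountIndex << 2;
      unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
      if (IsGFX10Plus)
        Offset1 |= (CountDw - 1) << 6;
      if (!IsGFX11Plus)
        Offset1 |= ShaderType << 2;
      return Offset0 | (Offset1 << 8);
    }
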
@@ -1424,23 +1462,7 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
   if (HasVSrc) {
     Register VSrc = MI.getOperand(1).getReg();
-
-    if (STI.needsAlignedVGPRs()) {
-      // Add implicit aligned super-reg to force alignment on the data operand.
-      Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-      BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
-      Register NewVR =
-          MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
-      BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR)
-          .addReg(VSrc, 0, MI.getOperand(1).getSubReg())
-          .addImm(AMDGPU::sub0)
-          .addReg(Undef)
-          .addImm(AMDGPU::sub1);
-      MIB.addReg(NewVR, 0, AMDGPU::sub0);
-      MIB.addReg(NewVR, RegState::Implicit);
-    } else {
-      MIB.addReg(VSrc);
-    }
+    MIB.addReg(VSrc);

     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
       return false;
@@ -1449,6 +1471,8 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
   MIB.addImm(ImmOffset)
      .cloneMemRefs(MI);

+  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
+
   MI.eraseFromParent();
   return true;
 }
@@ -1523,6 +1547,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
   unsigned IntrOpcode = Intr->BaseOpcode;
   const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
+  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);

   const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
@@ -1627,7 +1652,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   }

   // The legalizer preprocessed the intrinsic arguments. If we aren't using
-  // NSA, these should have beeen packed into a single value in the first
+  // NSA, these should have been packed into a single value in the first
   // address register
   const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
@@ -1639,13 +1664,29 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
     ++NumVDataDwords;

   int Opcode = -1;
-  if (IsGFX10Plus) {
+  if (IsGFX11Plus) {
+    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
+                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
+                                          : AMDGPU::MIMGEncGfx11Default,
+                                   NumVDataDwords, NumVAddrDwords);
+  } else if (IsGFX10Plus) {
     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                           : AMDGPU::MIMGEncGfx10Default,
                                    NumVDataDwords, NumVAddrDwords);
   } else {
-    if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    if (Subtarget->hasGFX90AInsts()) {
+      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
+                                     NumVDataDwords, NumVAddrDwords);
+      if (Opcode == -1) {
+        LLVM_DEBUG(
+            dbgs()
+            << "requested image instruction is not supported on this GPU\n");
+        return false;
+      }
+    }
+    if (Opcode == -1 &&
+        STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                      NumVDataDwords, NumVAddrDwords);
     if (Opcode == -1)
@@ -1703,7 +1744,13 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   if (IsGFX10Plus)
     MIB.addImm(IsA16 ? -1 : 0);

-  MIB.addImm(TFE); // tfe
+  if (!Subtarget->hasGFX90AInsts()) {
+    MIB.addImm(TFE); // tfe
+  } else if (TFE) {
+    LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
+    return false;
+  }
+
   MIB.addImm(LWE); // lwe
   if (!IsGFX10Plus)
     MIB.addImm(DimInfo->DA ? -1 : 0);
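
The NSA decision above reads as a small predicate; combining it with the feature check (sketch):

    // NSA (non-sequential address) encoding applies only when every address
    // component occupies its own 32-bit register; otherwise the legalizer is
    // expected to have packed the components into one contiguous register.
    static bool useNSA(unsigned NumVAddrRegs, unsigned NumVAddrDwords,
                       bool HasNSAEncoding) {
      return HasNSAEncoding && NumVAddrRegs != 1 &&
             NumVAddrDwords == NumVAddrRegs;
    }
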
@@ -1743,7 +1790,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   }

   MI.eraseFromParent();
-  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
+  return true;
 }

 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
@@ -1770,10 +1819,22 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     return selectSBarrier(I);
   case Intrinsic::amdgcn_global_atomic_fadd:
     return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
-  default: {
-    return selectImpl(I, *CoverageInfo);
-  }
+  case Intrinsic::amdgcn_raw_buffer_load_lds:
+  case Intrinsic::amdgcn_struct_buffer_load_lds:
+    return selectBufferLoadLds(I);
+  case Intrinsic::amdgcn_global_load_lds:
+    return selectGlobalLoadLds(I);
+  case Intrinsic::amdgcn_exp_compr:
+    if (!STI.hasCompressedExport()) {
+      Function &F = I.getMF()->getFunction();
+      DiagnosticInfoUnsupported NoFpRet(
+          F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
+      F.getContext().diagnose(NoFpRet);
+      return false;
+    }
+    break;
   }
+  return selectImpl(I, *CoverageInfo);
 }

 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
@@ -1872,10 +1933,10 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
   unsigned DstSize = DstTy.getSizeInBits();
   unsigned SrcSize = SrcTy.getSizeInBits();

-  const TargetRegisterClass *SrcRC
-    = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
-  const TargetRegisterClass *DstRC
-    = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
+  const TargetRegisterClass *SrcRC =
+      TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
+  const TargetRegisterClass *DstRC =
+      TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
   if (!SrcRC || !DstRC)
     return false;
@@ -2014,10 +2075,10 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
     return selectCOPY(I);

   const TargetRegisterClass *SrcRC =
-      TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
+      TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
   const TargetRegisterClass *DstRC =
-      TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
+      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);

   Register UndefReg = MRI->createVirtualRegister(SrcRC);
   BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
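
The exp_compr rejection above follows a recurring shape: emit DiagnosticInfoUnsupported on the function's LLVM context, then fail selection. Condensed into a sketch (hypothetical helper; assumes the usual LLVM headers):

    // Reject an instruction with a frontend-visible error instead of
    // silently failing to select.
    static bool failUnsupported(llvm::MachineInstr &MI, const char *Msg) {
      const llvm::Function &F = MI.getMF()->getFunction();
      llvm::DiagnosticInfoUnsupported Diag(F, Msg, MI.getDebugLoc(),
                                           llvm::DS_Error);
      F.getContext().diagnose(Diag);
      return false;
    }
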
@@ -2384,65 +2445,6 @@ bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
   return selectImpl(I, *CoverageInfo);
 }

-// TODO: No rtn optimization.
-bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
-    MachineInstr &MI) const {
-  Register PtrReg = MI.getOperand(1).getReg();
-  const LLT PtrTy = MRI->getType(PtrReg);
-  if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
-      STI.useFlatForGlobal())
-    return selectImpl(MI, *CoverageInfo);
-
-  Register DstReg = MI.getOperand(0).getReg();
-  const LLT Ty = MRI->getType(DstReg);
-  const bool Is64 = Ty.getSizeInBits() == 64;
-  const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
-  Register TmpReg = MRI->createVirtualRegister(
-      Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
-
-  const DebugLoc &DL = MI.getDebugLoc();
-  MachineBasicBlock *BB = MI.getParent();
-
-  Register VAddr, RSrcReg, SOffset;
-  int64_t Offset = 0;
-
-  unsigned Opcode;
-  if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
-    Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
-                    AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
-  } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
-                                   RSrcReg, SOffset, Offset)) {
-    Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
-                    AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
-  } else
-    return selectImpl(MI, *CoverageInfo);
-
-  auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
-                 .addReg(MI.getOperand(2).getReg());
-
-  if (VAddr)
-    MIB.addReg(VAddr);
-
-  MIB.addReg(RSrcReg);
-  if (SOffset)
-    MIB.addReg(SOffset);
-  else
-    MIB.addImm(0);
-
-  MIB.addImm(Offset);
-  MIB.addImm(AMDGPU::CPol::GLC);
-  MIB.cloneMemRefs(MI);
-
-  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
-      .addReg(TmpReg, RegState::Kill, SubReg);
-
-  MI.eraseFromParent();
-
-  MRI->setRegClass(
-      DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
-  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
-}
-
 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
   if (Reg.isPhysical())
     return false;
@@ -2551,7 +2553,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {

   // Try to avoid emitting a bit operation when we only need to touch half of
   // the 64-bit pointer.
-  APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
+  APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zext(64);
   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
@@ -2571,12 +2573,10 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
   const TargetRegisterClass &RegRC
     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;

-  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
-                                                                  *MRI);
-  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
-                                                                  *MRI);
+  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
+  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
   const TargetRegisterClass *MaskRC =
-      TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
+      TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);

   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
@@ -2689,10 +2689,10 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
     return false;

-  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
-                                                                  *MRI);
-  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
-                                                                  *MRI);
+  const TargetRegisterClass *SrcRC =
+      TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
+  const TargetRegisterClass *DstRC =
+      TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
   if (!SrcRC || !DstRC)
     return false;
   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
@@ -2771,10 +2771,10 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
     return false;

-  const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
-                                                                  *MRI);
-  const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
-                                                                  *MRI);
+  const TargetRegisterClass *VecRC =
+      TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
+  const TargetRegisterClass *ValRC =
+      TRI.getRegClassForTypeOnBank(ValTy, *ValRB);

   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
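
The half-pointer shortcut in selectG_PTRMASK depends on known-bits analysis of the mask: if one 32-bit half of the mask is known all-ones, that half of the pointer passes through an AND unchanged. The test, as a sketch:

    #include <cstdint>

    // An AND with the mask is a no-op on a 32-bit half whose mask bits are
    // all known ones.
    static bool halfIsNoop(uint64_t KnownOnes, bool HighHalf) {
      uint32_t Half =
          HighHalf ? uint32_t(KnownOnes >> 32) : uint32_t(KnownOnes);
      return Half == 0xffffffffu;
    }
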
@@ -2867,7 +2867,6 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
     return false;

   assert(ShufMask.size() == 2);
-  assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");

   MachineBasicBlock *MBB = MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
@@ -2924,17 +2923,28 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
     }
   } else if (Mask[0] == 0 && Mask[1] == 0) {
     if (IsVALU) {
-      // Write low half of the register into the high half.
-      MachineInstr *MovSDWA =
-          BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
-              .addImm(0)                             // $src0_modifiers
-              .addReg(SrcVec)                        // $src0
-              .addImm(0)                             // $clamp
-              .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
-              .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
-              .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
-              .addReg(SrcVec, RegState::Implicit);
-      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+      if (STI.hasSDWA()) {
+        // Write low half of the register into the high half.
+        MachineInstr *MovSDWA =
+            BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+                .addImm(0)                             // $src0_modifiers
+                .addReg(SrcVec)                        // $src0
+                .addImm(0)                             // $clamp
+                .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
+                .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+                .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
+                .addReg(SrcVec, RegState::Implicit);
+        MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+      } else {
+        Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
+            .addImm(0xFFFF)
+            .addReg(SrcVec);
+        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), DstReg)
+            .addReg(TmpReg)
+            .addImm(16)
+            .addReg(TmpReg);
+      }
     } else {
       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
           .addReg(SrcVec)
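
On subtargets without SDWA, the fallback above builds the same <0,0> splat from an AND and a V_LSHL_OR_B32. Reference semantics (sketch):

    #include <cstdint>

    // result = (lo << 16) | lo: both 16-bit lanes receive element 0.
    static uint32_t splatLo16(uint32_t Src) {
      uint32_t Lo = Src & 0xffff; // V_AND_B32 0xFFFF, src
      return (Lo << 16) | Lo;     // V_LSHL_OR_B32 tmp, 16, tmp
    }
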
@@ -2942,17 +2952,28 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
     }
   } else if (Mask[0] == 1 && Mask[1] == 1) {
     if (IsVALU) {
-      // Write high half of the register into the low half.
-      MachineInstr *MovSDWA =
-          BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
-              .addImm(0)                             // $src0_modifiers
-              .addReg(SrcVec)                        // $src0
-              .addImm(0)                             // $clamp
-              .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
-              .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
-              .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
-              .addReg(SrcVec, RegState::Implicit);
-      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+      if (STI.hasSDWA()) {
+        // Write high half of the register into the low half.
+        MachineInstr *MovSDWA =
+            BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+                .addImm(0)                             // $src0_modifiers
+                .addReg(SrcVec)                        // $src0
+                .addImm(0)                             // $clamp
+                .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
+                .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+                .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
+                .addReg(SrcVec, RegState::Implicit);
+        MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+      } else {
+        Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+            .addImm(16)
+            .addReg(SrcVec);
+        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), DstReg)
+            .addReg(TmpReg)
+            .addImm(16)
+            .addReg(TmpReg);
+      }
     } else {
       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
           .addReg(SrcVec)
@@ -2965,13 +2986,19 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
           .addReg(SrcVec)
           .addImm(16);
     } else {
-      Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
-          .addReg(SrcVec)
-          .addImm(16);
-      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
-          .addReg(TmpReg)
-          .addReg(SrcVec);
+      if (STI.hasSPackHL()) {
+        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HL_B32_B16), DstReg)
+            .addReg(SrcVec)
+            .addReg(SrcVec);
+      } else {
+        Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
+            .addReg(SrcVec)
+            .addImm(16);
+        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
+            .addReg(TmpReg)
+            .addReg(SrcVec);
+      }
     }
   } else
     llvm_unreachable("all shuffle masks should be handled");
@@ -2982,13 +3009,15 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(

 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
     MachineInstr &MI) const {
-  if (STI.hasGFX90AInsts())
+  const Register DefReg = MI.getOperand(0).getReg();
+  LLT DefTy = MRI->getType(DefReg);
+  if (AMDGPU::hasAtomicFaddRtnForTy(STI, DefTy))
     return selectImpl(MI, *CoverageInfo);

   MachineBasicBlock *MBB = MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();

-  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
+  if (!MRI->use_nodbg_empty(DefReg)) {
     Function &F = MBB->getParent()->getFunction();
     DiagnosticInfoUnsupported
         NoFpRet(F, "return versions of fp atomics not supported",
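
All four handled shuffle masks reduce to picking one 16-bit half per output lane; a compact reference model to check the selections against (sketch):

    #include <cstdint>

    // <2 x s16> shuffle of a single 32-bit source: lane i of the result is
    // element Mask[i] (0 = low half, 1 = high half).
    static uint32_t shuffleV2S16(uint32_t Src, int Mask0, int Mask1) {
      auto Elt = [Src](int I) -> uint32_t {
        return I ? (Src >> 16) & 0xffff : Src & 0xffff;
      };
      return Elt(Mask0) | (Elt(Mask1) << 16);
    }
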
@@ -3105,9 +3134,236 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
 }

+bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
+  unsigned Opc;
+  unsigned Size = MI.getOperand(3).getImm();
+
+  // The struct intrinsic variants add one additional operand over raw.
+  const bool HasVIndex = MI.getNumOperands() == 9;
+  Register VIndex;
+  int OpOffset = 0;
+  if (HasVIndex) {
+    VIndex = MI.getOperand(4).getReg();
+    OpOffset = 1;
+  }
+
+  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
+  Optional<ValueAndVReg> MaybeVOffset =
+      getIConstantVRegValWithLookThrough(VOffset, *MRI);
+  const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
+
+  switch (Size) {
+  default:
+    return false;
+  case 1:
+    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
+                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
+                    : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
+                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
+    break;
+  case 2:
+    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
+                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
+                    : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
+                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
+    break;
+  case 4:
+    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
+                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
+                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
+                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
+    break;
+  }
+
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+      .add(MI.getOperand(2));
+
+  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
+
+  if (HasVIndex && HasVOffset) {
+    Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
+    BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
+        .addReg(VIndex)
+        .addImm(AMDGPU::sub0)
+        .addReg(VOffset)
+        .addImm(AMDGPU::sub1);
+
+    MIB.addReg(IdxReg);
+  } else if (HasVIndex) {
+    MIB.addReg(VIndex);
+  } else if (HasVOffset) {
+    MIB.addReg(VOffset);
+  }
+
+  MIB.add(MI.getOperand(1));            // rsrc
+  MIB.add(MI.getOperand(5 + OpOffset)); // soffset
+  MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
+  unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
+  MIB.addImm(Aux & AMDGPU::CPol::ALL);  // cpol
+  MIB.addImm((Aux >> 3) & 1);           // swz
+
+  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+  LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
+  MachinePointerInfo StorePtrI = LoadPtrI;
+  StorePtrI.V = nullptr;
+  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+
+  auto F = LoadMMO->getFlags() &
+           ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+                                     Size, LoadMMO->getBaseAlign());
+
+  MachineMemOperand *StoreMMO =
+      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+                               sizeof(int32_t), LoadMMO->getBaseAlign());
+
+  MIB.setMemRefs({LoadMMO, StoreMMO});
+
+  MI.eraseFromParent();
+  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
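
The opcode matrix above encodes the four MUBUF addressing modes; the rule being applied, as a sketch (hypothetical enum):

    // Suffix chosen from which variable address components are present:
    //   vindex && voffset -> _BOTHEN   vindex only -> _IDXEN
    //   voffset only      -> _OFFEN    neither     -> _OFFSET
    enum class MUBUFMode { BothEn, IdxEn, OffEn, Offset };

    static MUBUFMode pickMUBUFMode(bool HasVIndex, bool HasVOffset) {
      return HasVIndex ? (HasVOffset ? MUBUFMode::BothEn : MUBUFMode::IdxEn)
                       : (HasVOffset ? MUBUFMode::OffEn : MUBUFMode::Offset);
    }
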
+/// Match a zero extend from a 32-bit value to 64-bits.
+static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
+  Register ZExtSrc;
+  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
+    return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
+
+  // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
+  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
+    return false;
+
+  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
+    return Def->getOperand(1).getReg();
+  }
+
+  return Register();
+}
+
+bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
+  unsigned Opc;
+  unsigned Size = MI.getOperand(3).getImm();
+
+  switch (Size) {
+  default:
+    return false;
+  case 1:
+    Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
+    break;
+  case 2:
+    Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
+    break;
+  case 4:
+    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
+    break;
+  }
+
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+      .add(MI.getOperand(2));
+
+  Register Addr = MI.getOperand(1).getReg();
+  Register VOffset;
+  // Try to split SAddr and VOffset. Global and LDS pointers share the same
+  // immediate offset, so we cannot use a regular SelectGlobalSAddr().
+  if (!isSGPR(Addr)) {
+    auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
+    if (isSGPR(AddrDef->Reg)) {
+      Addr = AddrDef->Reg;
+    } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
+      Register SAddr =
+          getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
+      if (SAddr && isSGPR(SAddr)) {
+        Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
+        if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+          Addr = SAddr;
+          VOffset = Off;
+        }
+      }
+    }
+  }
+
+  if (isSGPR(Addr)) {
+    Opc = AMDGPU::getGlobalSaddrOp(Opc);
+    if (!VOffset) {
+      VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
+          .addImm(0);
+    }
+  }
+
+  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
+                 .addReg(Addr);
+
+  if (isSGPR(Addr))
+    MIB.addReg(VOffset);
+
+  MIB.add(MI.getOperand(4))  // offset
+     .add(MI.getOperand(5)); // cpol
+
+  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+  LoadPtrI.Offset = MI.getOperand(4).getImm();
+  MachinePointerInfo StorePtrI = LoadPtrI;
+  LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
+  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+  auto F = LoadMMO->getFlags() &
+           ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+                                     Size, LoadMMO->getBaseAlign());
+  MachineMemOperand *StoreMMO =
+      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+                               sizeof(int32_t), Align(4));
+
+  MIB.setMemRefs({LoadMMO, StoreMMO});
+
+  MI.eraseFromParent();
+  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
   MI.setDesc(TII.get(MI.getOperand(1).getImm()));
-  MI.RemoveOperand(1);
+  MI.removeOperand(1);
+  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
+  return true;
+}
+
+bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
+  unsigned Opc;
+  switch (MI.getIntrinsicID()) {
+  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
+    Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
+    break;
+  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
+    Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
+    break;
+  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
+    Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
+    break;
+  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
+    Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
+    break;
+  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
+    Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
+    break;
+  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+    Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
+    break;
+  default:
+    llvm_unreachable("unhandled smfmac intrinsic");
+  }
+
+  auto VDst_In = MI.getOperand(4);
+
+  MI.setDesc(TII.get(Opc));
+  MI.removeOperand(4); // VDst_In
+  MI.removeOperand(1); // Intrinsic ID
+  MI.addOperand(VDst_In); // Readd VDst_In to the end
   MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
   return true;
 }
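
The SAddr/VOffset split in selectGlobalLoadLds leans on matchZeroExtendFromS32, which accepts the two legalized shapes of a zero-extended 32-bit offset. A fragment sketch (the helper name selectSaddrForm is hypothetical):

    // Accepted shapes (MIR, in comments):
    //   %off64:_(s64) = G_ZEXT %off32:_(s32)
    //   %off64:_(s64) = G_MERGE_VALUES %off32:_(s32), 0
    // For %addr = G_PTR_ADD %sgpr_base, %off64, the SADDR form can then use
    // %off32 directly as the 32-bit vector offset.
    Register Off32 = matchZeroExtendFromS32(*MRI, PtrAdd->getOperand(2).getReg());
    if (Off32.isValid())
      selectSaddrForm(SgprBase, Off32); // hypothetical
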
@@ -3166,6 +3422,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
   case TargetOpcode::G_UADDE:
   case TargetOpcode::G_USUBE:
     return selectG_UADDO_USUBO_UADDE_USUBE(I);
+  case AMDGPU::G_AMDGPU_MAD_U64_U32:
+  case AMDGPU::G_AMDGPU_MAD_I64_I32:
+    return selectG_AMDGPU_MAD_64_32(I);
   case TargetOpcode::G_INTTOPTR:
   case TargetOpcode::G_BITCAST:
   case TargetOpcode::G_PTRTOINT:
@@ -3226,8 +3485,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
   case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
     return selectG_LOAD_STORE_ATOMICRMW(I);
-  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
-    return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
   case TargetOpcode::G_SELECT:
     return selectG_SELECT(I);
   case TargetOpcode::G_TRUNC:
@@ -3286,9 +3543,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
 }

-std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
-                                              bool AllowAbs) const {
+std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
+    MachineOperand &Root, bool AllowAbs, bool OpSel, bool ForceVGPR) const {
   Register Src = Root.getReg();
   Register OrigSrc = Src;
   unsigned Mods = 0;
@@ -3305,7 +3561,10 @@ AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
     Mods |= SISrcMods::ABS;
   }

-  if (Mods != 0 &&
+  if (OpSel)
+    Mods |= SISrcMods::OP_SEL_0;
+
+  if ((Mods != 0 || ForceVGPR) &&
       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
     MachineInstr *UseMI = Root.getParent();
@@ -3407,7 +3666,7 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {

 std::pair<Register, unsigned>
 AMDGPUInstructionSelector::selectVOP3PModsImpl(
-    Register Src, const MachineRegisterInfo &MRI) const {
+    Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
   unsigned Mods = 0;
   MachineInstr *MI = MRI.getVRegDef(Src);

@@ -3421,6 +3680,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(
   }

   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
+  (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()

   // Packed instructions do not have abs modifiers.
   Mods |= SISrcMods::OP_SEL_1;
@@ -3444,6 +3704,50 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
 }

 InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
+  MachineRegisterInfo &MRI
+    = Root.getParent()->getParent()->getParent()->getRegInfo();
+
+  Register Src;
+  unsigned Mods;
+  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+  }};
+}
+
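
For orientation, the modifier bits gathered by selectVOP3ModsImpl describe operations the hardware applies when reading a source; assuming the usual fneg/fabs folding, the scalar effect is (sketch):

    #include <cmath>

    // VOP3 source modifiers on a float operand: ABS is applied before NEG,
    // so NEG|ABS reads as -|x|.
    static float applySrcMods(float X, bool Neg, bool Abs) {
      if (Abs)
        X = std::fabs(X);
      return Neg ? -X : X;
    }
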
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const {
+  // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
+  // Value is in Imm operand as i1 sign extended to int64_t.
+  // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
+  assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
+         "expected i1 value");
+  unsigned Mods = SISrcMods::OP_SEL_1;
+  if (Root.getImm() == -1)
+    Mods ^= SISrcMods::NEG;
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+  }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
+    MachineOperand &Root) const {
+  assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
+         "expected i1 value");
+  unsigned Mods = SISrcMods::OP_SEL_1;
+  if (Root.getImm() != 0)
+    Mods |= SISrcMods::OP_SEL_0;
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+  }};
+}
+
+InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
   Register Src;
   unsigned Mods;
@@ -3467,6 +3771,36 @@ AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
 }

 InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
+  Register Src;
+  unsigned Mods;
+  std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+                                           /* AllowAbs */ false,
+                                           /* OpSel */ false,
+                                           /* ForceVGPR */ true);
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+  }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
+  Register Src;
+  unsigned Mods;
+  std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+                                           /* AllowAbs */ false,
+                                           /* OpSel */ true,
+                                           /* ForceVGPR */ true);
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+  }};
+}
+
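
The two i1-literal renderers above turn an immediate carried by the intrinsic (an i1 sign-extended to int64_t) into src_mods bits; restated as a sketch (assumes SISrcMods from SIDefines.h):

    // selectDotIUVOP3PMods: -1 marks the packed operand signed via NEG;
    // selectWMMAOpSelVOP3PMods maps nonzero to OP_SEL_0 instead.
    // OP_SEL_1 is always set for these packed (VOP3P) operands.
    static unsigned dotIUMods(int64_t Imm) {
      unsigned Mods = SISrcMods::OP_SEL_1;
      if (Imm == -1)
        Mods ^= SISrcMods::NEG;
      return Mods;
    }
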
+InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
   SmallVector<GEPInfo, 4> AddrInfo;
   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
@@ -3594,24 +3928,6 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
   }};
 }

-/// Match a zero extend from a 32-bit value to 64-bits.
-static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
-  Register ZExtSrc;
-  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
-    return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
-
-  // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
-  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
-  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
-    return false;
-
-  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
-    return Def->getOperand(1).getReg();
-  }
-
-  return Register();
-}
-
 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
@@ -3631,9 +3947,6 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
     ImmOffset = ConstOffset;
   } else {
     auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
-    if (!PtrBaseDef)
-      return None;
-
     if (isSGPR(PtrBaseDef->Reg)) {
       if (ConstOffset > 0) {
         // Offset is too large.
@@ -3679,11 +3992,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
     }
   }

-  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
-  if (!AddrDef)
-    return None;
-
   // Match the variable offset.
+  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
   if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
     // Look through the SGPR->VGPR copy.
     Register SAddr =
@@ -3749,9 +4059,6 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
   }

   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
-  if (!AddrDef)
-    return None;
-
   if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
     int FI = AddrDef->MI->getOperand(1).getIndex();
     return {{
@@ -3768,8 +4075,7 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
   auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
   auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);

-  if (LHSDef && RHSDef &&
-      LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
+  if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
       isSGPR(RHSDef->Reg)) {
     int FI = LHSDef->MI->getOperand(1).getIndex();
     MachineInstr &I = *Root.getParent();
@@ -3792,6 +4098,74 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
   }};
 }

+// Check whether the flat scratch SVS swizzle bug affects this access.
+bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
+    Register VAddr, Register SAddr, uint64_t ImmOffset) const {
+  if (!Subtarget->hasFlatScratchSVSSwizzleBug())
+    return false;
+
+  // The bug affects the swizzling of SVS accesses if there is any carry out
+  // from the two low order bits (i.e. from bit 1 into bit 2) when adding
+  // voffset to (soffset + inst_offset).
+  auto VKnown = KnownBits->getKnownBits(VAddr);
+  auto SKnown = KnownBits::computeForAddSub(
+      true, false, KnownBits->getKnownBits(SAddr),
+      KnownBits::makeConstant(APInt(32, ImmOffset)));
+  uint64_t VMax = VKnown.getMaxValue().getZExtValue();
+  uint64_t SMax = SKnown.getMaxValue().getZExtValue();
+  return (VMax & 3) + (SMax & 3) >= 4;
+}
+
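
The carry condition tested above falls out of the low two bits of the conservative maxima: the addition can carry from bit 1 into bit 2 only if those low bits can sum to 4 or more. Standalone restatement (sketch):

    #include <cstdint>

    // Can (V + S) carry out of bit 1, given each addend's known maximum?
    static bool mayCarryOutOfBit1(uint64_t VMax, uint64_t SMax) {
      return (VMax & 3) + (SMax & 3) >= 4;
    }
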
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
+  Register Addr = Root.getReg();
+  Register PtrBase;
+  int64_t ConstOffset;
+  int64_t ImmOffset = 0;
+
+  // Match the immediate offset first, which canonically is moved as low as
+  // possible.
+  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+
+  if (ConstOffset != 0 &&
+      TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+    Addr = PtrBase;
+    ImmOffset = ConstOffset;
+  }
+
+  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
+  if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
+    return None;
+
+  Register RHS = AddrDef->MI->getOperand(2).getReg();
+  if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
+    return None;
+
+  Register LHS = AddrDef->MI->getOperand(1).getReg();
+  auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
+
+  if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
+    return None;
+
+  if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
+    int FI = LHSDef->MI->getOperand(1).getIndex();
+    return {{
+        [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); },       // vaddr
+        [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
+        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }  // offset
+    }};
+  }
+
+  if (!isSGPR(LHS))
+    return None;
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); },      // vaddr
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); },      // saddr
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+  }};
+}
+
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
   MachineInstr *MI = Root.getParent();
@@ -3856,7 +4230,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
         MIB.addReg(Info->getScratchRSrcReg());
       },
       [=](MachineInstrBuilder &MIB) { // vaddr
-        if (FI.hasValue())
+        if (FI)
           MIB.addFrameIndex(FI.getValue());
         else
           MIB.addReg(VAddr);
