Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU')
73 files changed, 3023 insertions, 1221 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h index 35d33cb60bc4..36af767a70b0 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -166,6 +166,9 @@ extern char &SILowerI1CopiesID; void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &); extern char &AMDGPUGlobalISelDivergenceLoweringID; +void initializeAMDGPUMarkLastScratchLoadPass(PassRegistry &); +extern char &AMDGPUMarkLastScratchLoadID; + void initializeSILowerSGPRSpillsPass(PassRegistry &); extern char &SILowerSGPRSpillsID; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td index df8c35ffd457..cb29d5d94759 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -646,6 +646,12 @@ def FeatureFP8Insts : SubtargetFeature<"fp8-insts", "Has fp8 and bf8 instructions" >; +def FeatureFP8ConversionInsts : SubtargetFeature<"fp8-conversion-insts", + "HasFP8ConversionInsts", + "true", + "Has fp8 and bf8 conversion instructions" +>; + def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", "HasPkFmacF16Inst", "true", @@ -719,6 +725,18 @@ def FeatureFlatAtomicFaddF32Inst "Has flat_atomic_add_f32 instruction" >; +def FeatureDefaultComponentZero : SubtargetFeature<"default-component-zero", + "HasDefaultComponentZero", + "true", + "BUFFER/IMAGE store instructions set unspecified components to zero (before GFX12)" +>; + +def FeatureDefaultComponentBroadcast : SubtargetFeature<"default-component-broadcast", + "HasDefaultComponentBroadcast", + "true", + "BUFFER/IMAGE store instructions set unspecified components to x component (GFX12)" +>; + def FeatureSupportsSRAMECC : SubtargetFeature<"sramecc-support", "SupportsSRAMECC", "true", @@ -1003,7 +1021,7 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts, - FeatureGDS, FeatureGWS + FeatureGDS, FeatureGWS, FeatureDefaultComponentZero ] >; @@ -1014,7 +1032,7 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess, - FeatureImageInsts, FeatureGDS, FeatureGWS + FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero ] >; @@ -1029,7 +1047,8 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32, - FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS + FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, + FeatureDefaultComponentZero ] >; @@ -1047,7 +1066,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureNegativeScratchOffsetBug, FeatureGWS + 
FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero ] >; @@ -1067,7 +1086,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts, - FeatureGDS, FeatureGWS + FeatureGDS, FeatureGWS, FeatureDefaultComponentZero ] >; @@ -1087,7 +1106,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS, - FeatureGWS + FeatureGWS, FeatureDefaultComponentZero ] >; @@ -1107,7 +1126,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureTrue16BitInsts + FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast ] >; @@ -1311,6 +1330,7 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeaturePackedFP32Ops, FeatureMAIInsts, FeatureFP8Insts, + FeatureFP8ConversionInsts, FeaturePkFmacF16Inst, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, @@ -1467,7 +1487,6 @@ def FeatureISAVersion12 : FeatureSet< [FeatureGFX12, FeatureLDSBankCount32, FeatureDLInsts, - FeatureDot5Insts, FeatureDot7Insts, FeatureDot8Insts, FeatureDot9Insts, @@ -1477,8 +1496,13 @@ def FeatureISAVersion12 : FeatureSet< FeatureWavefrontSize32, FeatureShaderCyclesHiLoRegisters, FeatureArchitectedFlatScratch, + FeatureArchitectedSGPRs, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, + FeatureAtomicDsPkAdd16Insts, + FeatureAtomicFlatPkAdd16Insts, + FeatureAtomicBufferGlobalPkAddF16Insts, + FeatureAtomicGlobalPkAddBF16Inst, FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeatureExtendedImageInsts, @@ -1488,8 +1512,8 @@ def FeatureISAVersion12 : FeatureSet< FeaturePseudoScalarTrans, FeatureHasRestrictedSOffset, FeatureVGPRSingleUseHintInsts, - FeatureMADIntraFwdBug, - FeatureScalarDwordx3Loads]>; + FeatureScalarDwordx3Loads, + FeatureDPPSrc1SGPR]>; //===----------------------------------------------------------------------===// @@ -1981,6 +2005,9 @@ def HasShaderCyclesHiLoRegisters : Predicate<"Subtarget->hasShaderCyclesHiLoRegi def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">, AssemblerPredicate<(all_of FeatureFP8Insts)>; +def HasFP8ConversionInsts : Predicate<"Subtarget->hasFP8ConversionInsts()">, + AssemblerPredicate<(all_of FeatureFP8ConversionInsts)>; + def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">, AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>; @@ -2013,6 +2040,13 @@ def HasFlatAtomicFaddF32Inst : Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">, AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>; +def HasDefaultComponentZero + : Predicate<"Subtarget->hasDefaultComponentZero()">, + AssemblerPredicate<(all_of FeatureDefaultComponentZero)>; +def HasDefaultComponentBroadcast + : Predicate<"Subtarget->hasDefaultComponentBroadcast()">, + AssemblerPredicate<(all_of FeatureDefaultComponentBroadcast)>; + def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">, AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index d317a733d433..279ef8ca2751 100644 --- 
a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -123,8 +123,11 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { getTargetStreamer()->EmitDirectiveAMDGCNTarget(); - if (TM.getTargetTriple().getOS() == Triple::AMDHSA) + if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { + getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion( + CodeObjectVersion); HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID()); + } if (TM.getTargetTriple().getOS() == Triple::AMDPAL) getTargetStreamer()->getPALMetadata()->readFromIR(M); @@ -230,8 +233,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { IsaInfo::getNumExtraSGPRs( &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, getTargetStreamer()->getTargetID()->isXnackOnOrAny()), - CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, - CodeObjectVersion); + CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed); Streamer.popSection(); } @@ -323,7 +325,7 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { } bool AMDGPUAsmPrinter::doInitialization(Module &M) { - CodeObjectVersion = AMDGPU::getCodeObjectVersion(M); + CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M); if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { switch (CodeObjectVersion) { @@ -631,8 +633,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { // In the beginning all features are either 'Any' or 'NotSupported', // depending on global target features. This will cover empty modules. - getTargetStreamer()->initializeTargetID( - *getGlobalSTI(), getGlobalSTI()->getFeatureString(), CodeObjectVersion); + getTargetStreamer()->initializeTargetID(*getGlobalSTI(), + getGlobalSTI()->getFeatureString()); // If module is empty, we are done. if (M.empty()) @@ -981,8 +983,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); OutStreamer->emitInt32( - STM.getGeneration() >= AMDGPUSubtarget::GFX11 - ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks) + STM.getGeneration() >= AMDGPUSubtarget::GFX12 + ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks) + : STM.getGeneration() == AMDGPUSubtarget::GFX11 + ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks) : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks)); // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = @@ -993,8 +997,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE); OutStreamer->emitInt32( - STM.getGeneration() >= AMDGPUSubtarget::GFX11 - ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks) + STM.getGeneration() >= AMDGPUSubtarget::GFX12 + ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks) + : STM.getGeneration() == AMDGPUSubtarget::GFX11 + ? 
S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks) : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks)); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 5fd9e571282d..d7f5110427ec 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -144,7 +144,7 @@ public: BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC, TargetMachine &TM) : InformationCache(M, AG, Allocator, CGSCC), TM(TM), - CodeObjectVersion(AMDGPU::getCodeObjectVersion(M)) {} + CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {} TargetMachine &TM; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index cf2896f80f19..6d05c3678bf0 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -474,7 +474,7 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, const Module *M = MF.getFunction().getParent(); if (UserSGPRInfo.hasQueuePtr() && - AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { + AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); @@ -632,10 +632,6 @@ bool AMDGPUCallLowering::lowerFormalArguments( const bool InReg = Arg.hasAttribute(Attribute::InReg); - // SGPR arguments to functions not implemented. - if (!IsGraphics && InReg) - return false; - if (Arg.hasAttribute(Attribute::SwiftSelf) || Arg.hasAttribute(Attribute::SwiftError) || Arg.hasAttribute(Attribute::Nest)) @@ -719,6 +715,10 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (!IsEntryFunc && !IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); + + if (!Subtarget.enableFlatScratch()) + CCInfo.AllocateReg(Info->getScratchRSrcReg()); + TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } IncomingValueAssigner Assigner(AssignFn); @@ -732,13 +732,8 @@ bool AMDGPUCallLowering::lowerFormalArguments( uint64_t StackSize = Assigner.StackSize; // Start adding system SGPRs. - if (IsEntryFunc) { + if (IsEntryFunc) TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics); - } else { - if (!Subtarget.enableFlatScratch()) - CCInfo.AllocateReg(Info->getScratchRSrcReg()); - TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); - } // When we tail call, we need to check if the callee's arguments will fit on // the caller's stack. 
So, whenever we lower formal arguments, we should keep diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 2b85024a9b40..a19b03b92923 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -51,9 +51,9 @@ def gi_vop3pmodsdot : GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">, GIComplexPatternEquiv<VOP3PModsDOT>; -def gi_dotiuvop3pmods : - GIComplexOperandMatcher<s32, "selectDotIUVOP3PMods">, - GIComplexPatternEquiv<DotIUVOP3PMods>; +def gi_vop3pmodsneg : + GIComplexOperandMatcher<s32, "selectVOP3PModsNeg">, + GIComplexPatternEquiv<VOP3PModsNeg>; def gi_wmmaopselvop3pmods : GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">, @@ -261,10 +261,16 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD_BF16, SIbuffer_atomic_fadd_bf16>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32, SIbuffer_atomic_cond_sub_u32>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>; +def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>; +def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>; +def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>; +def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>; def : GINodeEquiv<G_FPTRUNC_ROUND_UPWARD, SIfptrunc_round_upward>; def : GINodeEquiv<G_FPTRUNC_ROUND_DOWNWARD, SIfptrunc_round_downward>; @@ -379,8 +385,8 @@ def gi_extract_cpol : GICustomOperandRenderer<"renderExtractCPol">, def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">, GISDNodeXFormEquiv<extract_swz>; -def gi_set_glc : GICustomOperandRenderer<"renderSetGLC">, - GISDNodeXFormEquiv<set_glc>; +def gi_extract_cpol_set_glc : GICustomOperandRenderer<"renderExtractCpolSetGLC">, + GISDNodeXFormEquiv<extract_cpol_set_glc>; def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">, GISDNodeXFormEquiv<frameindex_to_targetframeindex>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 74e9cd7d0965..186fa58524b9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -532,7 +532,8 @@ void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF, Func.getCallingConv() != CallingConv::SPIR_KERNEL) return; - auto CodeObjectVersion = AMDGPU::getCodeObjectVersion(*Func.getParent()); + auto CodeObjectVersion = + AMDGPU::getAMDHSACodeObjectVersion(*Func.getParent()); auto Kern = getHSAKernelProps(MF, ProgramInfo, CodeObjectVersion); auto Kernels = diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 41462d7a133e..4c35649cec6c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1159,7 +1159,7 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const { // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative // values. - if (AMDGPU::isGFX12Plus(*Subtarget)) + if (Subtarget->hasSignedScratchOffsets()) return true; auto LHS = Addr.getOperand(0); @@ -1184,6 +1184,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const { if (isNoUnsignedWrap(Addr)) return true; + // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative + // values. + if (Subtarget->hasSignedScratchOffsets()) + return true; + auto LHS = Addr.getOperand(0); auto RHS = Addr.getOperand(1); return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS); @@ -1192,6 +1197,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const { // Check address value in SGPR/VGPR are legal for flat scratch in the form // of: SGPR + VGPR + Imm. bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const { + // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative + // values. + if (AMDGPU::isGFX12Plus(*Subtarget)) + return true; + auto Base = Addr.getOperand(0); auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1)); // If the immediate offset is negative and within certain range, the base @@ -3009,7 +3019,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, return SelectVOP3PMods(In, Src, SrcMods, true); } -bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const { +bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast<ConstantSDNode>(In); // Literal i1 value set in intrinsic, represents SrcMods for the next operand. // 1 promotes packed values to signed, 0 treats them as unsigned. 
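The isFlatScratchBaseLegal* hunks in AMDGPUISelDAGToDAG.cpp above (mirrored later in AMDGPUInstructionSelector.cpp) now gate on Subtarget->hasSignedScratchOffsets() instead of an explicit GFX12 check. A minimal standalone sketch of the rule those hunks implement, with made-up helper names and plain integer types rather than the LLVM SelectionDAG/GlobalISel APIs:

#include <cstdint>

// Sketch only: before GFX12 the VADDR/SADDR parts of a scratch address are
// treated as unsigned, so splitting "base + offset" across them is legal only
// when neither part can be negative, or when the add is known not to wrap
// unsigned. With signed scratch offsets (GFX12+), any split is legal.
static bool signBitIsZero(int32_t V) { return V >= 0; }

static bool isScratchSplitLegal(bool HasSignedScratchOffsets,
                                bool KnownNoUnsignedWrap,
                                int32_t SAddr, int32_t VAddr) {
  if (KnownNoUnsignedWrap)
    return true;
  if (HasSignedScratchOffsets) // GFX12+ per the hunks above
    return true;
  return signBitIsZero(SAddr) && signBitIsZero(VAddr);
}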
@@ -3183,13 +3193,14 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { return !AllUsesAcceptSReg && (Limit < 10); } -bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const { +bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const { auto Ld = cast<LoadSDNode>(N); - if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(Ld->getMemOperand())) + const MachineMemOperand *MMO = Ld->getMemOperand(); + if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO)) return false; - return Ld->getAlign() >= Align(4) && + return Ld->getAlign() >= Align(std::min(MMO->getSize(), uint64_t(4))) && ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) || (Subtarget->getScalarizeGlobalBehavior() && diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index df4a211d42a0..8645490f0b16 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -237,7 +237,7 @@ private: bool IsDOT = false) const; bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const; + bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 0dbcaf5a1b13..55d95154c758 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -446,6 +446,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i64, Custom); + for (auto VT : {MVT::i8, MVT::i16}) + setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom); + static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32, MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32}; @@ -784,6 +787,7 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, unsigned AS = MN->getAddressSpace(); // Do not shrink an aligned scalar load to sub-dword. // Scalar engine cannot do sub-dword loads. + // TODO: Update this for GFX12 which does have scalar sub-dword loads. 
if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) && (AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || @@ -1397,6 +1401,11 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG)) Results.push_back(Lowered); return; + case ISD::CTLZ: + case ISD::CTLZ_ZERO_UNDEF: + if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG)) + Results.push_back(Lowered); + return; default: return; } @@ -3062,6 +3071,26 @@ static bool isCttzOpc(unsigned Opc) { return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; } +SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op, + SelectionDAG &DAG) const { + auto SL = SDLoc(Op); + auto Arg = Op.getOperand(0u); + auto ResultVT = Op.getValueType(); + + if (ResultVT != MVT::i8 && ResultVT != MVT::i16) + return {}; + + assert(isCtlzOpc(Op.getOpcode())); + assert(ResultVT == Arg.getValueType()); + + auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits(); + auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg); + auto ShiftVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32); + NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, ShiftVal); + NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp); + return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp); +} + SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); @@ -5453,6 +5482,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE) NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(SBUFFER_LOAD) + NODE_NAME_CASE(SBUFFER_LOAD_BYTE) + NODE_NAME_CASE(SBUFFER_LOAD_UBYTE) + NODE_NAME_CASE(SBUFFER_LOAD_SHORT) + NODE_NAME_CASE(SBUFFER_LOAD_USHORT) NODE_NAME_CASE(BUFFER_STORE) NODE_NAME_CASE(BUFFER_STORE_BYTE) NODE_NAME_CASE(BUFFER_STORE_SHORT) @@ -5473,8 +5506,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) NODE_NAME_CASE(BUFFER_ATOMIC_FADD) + NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16) NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) + NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32) case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 827fb106b551..f10a357125e5 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -84,6 +84,8 @@ protected: SDNodeFlags Flags) const; SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const; @@ -567,6 +569,10 @@ enum NodeType : unsigned { BUFFER_LOAD_FORMAT_TFE, BUFFER_LOAD_FORMAT_D16, SBUFFER_LOAD, + SBUFFER_LOAD_BYTE, + SBUFFER_LOAD_UBYTE, + SBUFFER_LOAD_SHORT, + SBUFFER_LOAD_USHORT, BUFFER_STORE, BUFFER_STORE_BYTE, BUFFER_STORE_SHORT, @@ -587,8 +593,10 @@ enum NodeType : unsigned { BUFFER_ATOMIC_CMPSWAP, BUFFER_ATOMIC_CSUB, BUFFER_ATOMIC_FADD, + BUFFER_ATOMIC_FADD_BF16, BUFFER_ATOMIC_FMIN, BUFFER_ATOMIC_FMAX, + BUFFER_ATOMIC_COND_SUB_U32, LAST_AMDGPU_ISD_NUMBER }; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 2bb7b6bd0674..898289019c71 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -402,6 +402,35 @@ static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV, return DemandedElts; } +// Trim elements of the end of the vector \p V, if they are +// equal to the first element of the vector. +static APInt defaultComponentBroadcast(Value *V) { + auto *VTy = cast<FixedVectorType>(V->getType()); + unsigned VWidth = VTy->getNumElements(); + APInt DemandedElts = APInt::getAllOnes(VWidth); + Value *FirstComponent = findScalarElement(V, 0); + + SmallVector<int> ShuffleMask; + if (auto *SVI = dyn_cast<ShuffleVectorInst>(V)) + SVI->getShuffleMask(ShuffleMask); + + for (int I = VWidth - 1; I > 0; --I) { + if (ShuffleMask.empty()) { + auto *Elt = findScalarElement(V, I); + if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt))) + break; + } else { + // Detect identical elements in the shufflevector result, even though + // findScalarElement cannot tell us what that element is. + if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem) + break; + } + DemandedElts.clearBit(I); + } + + return DemandedElts; +} + static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, @@ -1140,8 +1169,13 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (!isa<FixedVectorType>(II.getArgOperand(0)->getType())) break; - APInt DemandedElts = - trimTrailingZerosInVector(IC, II.getArgOperand(0), &II); + APInt DemandedElts; + if (ST->hasDefaultComponentBroadcast()) + DemandedElts = defaultComponentBroadcast(II.getArgOperand(0)); + else if (ST->hasDefaultComponentZero()) + DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II); + else + break; int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1; if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index ad8dcda93c36..fdee74d58d26 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1917,7 +1917,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); if (BaseOpcode->Atomic) CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization - if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12)) + if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | + AMDGPU::CPol::VOLATILE)) return false; int NumVAddrRegs = 0; @@ -3927,7 +3928,7 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const { +AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { // Literal i1 value set in intrinsic, represents SrcMods for the next operand. // Value is in Imm operand as i1 sign extended to int64_t. // 1(-1) promotes packed values to signed, 0 treats them as unsigned. 
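defaultComponentBroadcast above pairs with the new FeatureDefaultComponentBroadcast: on GFX12 an unspecified BUFFER/IMAGE store component is written as a copy of the x component, so trailing lanes that repeat lane 0 (or are undef) need not be demanded. A self-contained toy model of that trimming, using plain ints for lanes with -1 standing in for undef; it is an illustration, not the InstCombine code:

#include <bitset>
#include <vector>

// Returns the demanded-lane mask for a store payload whose lanes are given as
// integers (-1 models an undef lane). Trailing lanes equal to lane 0 or undef
// are dropped, mirroring what defaultComponentBroadcast computes.
static std::bitset<32> demandedLanesBroadcast(const std::vector<int> &Lanes) {
  std::bitset<32> Demanded;
  for (unsigned I = 0; I < Lanes.size(); ++I)
    Demanded.set(I);
  for (int I = static_cast<int>(Lanes.size()) - 1; I > 0; --I) {
    if (Lanes[I] != Lanes[0] && Lanes[I] != -1)
      break;
    Demanded.reset(I);
  }
  return Demanded;
}

// Example: demandedLanesBroadcast({7, 3, 7, 7}) keeps only lanes 0 and 1, so
// a four-component store can be shrunk to two components on such targets.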
@@ -4556,7 +4557,7 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const { // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative // values. - if (AMDGPU::isGFX12Plus(STI)) + if (STI.hasSignedScratchOffsets()) return true; Register LHS = AddrMI->getOperand(1).getReg(); @@ -4585,6 +4586,11 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const { if (isNoUnsignedWrap(AddrMI)) return true; + // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative + // values. + if (STI.hasSignedScratchOffsets()) + return true; + Register LHS = AddrMI->getOperand(1).getReg(); Register RHS = AddrMI->getOperand(2).getReg(); return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS); @@ -4594,6 +4600,11 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const { // of: SGPR + VGPR + Imm. bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm( Register Addr) const { + // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative + // values. + if (STI.hasSignedScratchOffsets()) + return true; + MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); Register Base = AddrMI->getOperand(1).getReg(); std::optional<DefinitionAndSourceRegister> BaseDef = @@ -5411,6 +5422,7 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInst( I.eraseFromParent(); return true; } + bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); @@ -5496,11 +5508,13 @@ void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, MIB.addImm(Swizzle); } -void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB, - const MachineInstr &MI, - int OpIdx) const { +void AMDGPUInstructionSelector::renderExtractCpolSetGLC( + MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC); + const uint32_t Cpol = MI.getOperand(OpIdx).getImm() & + (AMDGPU::isGFX12Plus(STI) ? 
AMDGPU::CPol::ALL + : AMDGPU::CPol::ALL_pregfx12); + MIB.addImm(Cpol | AMDGPU::CPol::GLC); } void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index ab7cc0a6beb8..12ea46c2895b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -194,7 +194,7 @@ private: selectVOP3PModsDOT(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns - selectDotIUVOP3PMods(MachineOperand &Root) const; + selectVOP3PModsNeg(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; @@ -331,8 +331,8 @@ private: int OpIdx) const; void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; - void renderSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI, - int OpIdx) const; + void renderExtractCpolSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 36e07d944c94..360aafedc522 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -647,6 +647,9 @@ defm int_amdgcn_flat_atomic_fmin_num : noret_op; defm int_amdgcn_flat_atomic_fmax_num : noret_op; defm int_amdgcn_global_atomic_fmin_num : noret_op; defm int_amdgcn_global_atomic_fmax_num : noret_op; +defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op; +defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op; +defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op; multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { let HasNoUse = true in diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index fb7148ba10ac..69fdeaebe0a0 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -13,9 +13,11 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/InitializePasses.h" @@ -58,6 +60,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<UniformityInfoWrapperPass>(); AU.setPreservesAll(); @@ -90,6 +93,12 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) { if (skipFunction(F)) return false; + const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); + const TargetMachine &TM = TPC.getTM<TargetMachine>(); + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); + if (ST.hasScalarSubwordLoads()) + return false; + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); UA = 
&getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); @@ -179,6 +188,7 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) { INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR late optimizations", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index aa235c07e995..8e74d4c0e945 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2139,7 +2139,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); // For code object version 5, private_base and shared_base are passed through // implicit kernargs. - if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= + if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= AMDGPU::AMDHSA_COV5) { AMDGPUTargetLowering::ImplicitParameter Param = AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE @@ -5883,6 +5883,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; + case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16; case Intrinsic::amdgcn_raw_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: case Intrinsic::amdgcn_struct_buffer_atomic_fmin: @@ -5893,6 +5896,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { case Intrinsic::amdgcn_struct_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; + case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32; default: llvm_unreachable("unhandled atomic opcode"); } @@ -6090,6 +6096,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); LLT Ty = MRI->getType(VData); + const bool IsAtomicPacked16Bit = + (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || + BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); + // Check for 16 bit addresses and pack if true. LLT GradTy = MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); @@ -6098,7 +6108,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( const bool IsG16 = ST.hasG16() ? 
(BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; const bool IsA16 = AddrTy == S16; - const bool IsD16 = Ty.getScalarType() == S16; + const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16; int DMaskLanes = 0; if (!BaseOpcode->Atomic) { @@ -6140,7 +6150,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( LLT Ty = MRI->getType(VData0); // TODO: Allow atomic swap and bit ops for v2s16/v4s16 - if (Ty.isVector()) + if (Ty.isVector() && !IsAtomicPacked16Bit) return false; if (BaseOpcode->AtomicX2) { @@ -6276,9 +6286,18 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( if (NumElts > 4 || DMaskLanes > 4) return false; + // Image atomic instructions are using DMask to specify how many bits + // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16). + // DMaskLanes for image atomic has default value '0'. + // We must be sure that atomic variants (especially packed) will not be + // truncated from v2s16 or v4s16 to s16 type. + // + // ChangeElementCount will be needed for image load where Ty is always scalar. const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; const LLT AdjustedTy = - Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); + DMaskLanes == 0 + ? Ty + : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); // The raw dword aligned data component of the load. The only legal cases // where this matters should be when using the packed D16 format, for @@ -6443,15 +6462,28 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( return true; } -bool AMDGPULegalizerInfo::legalizeSBufferLoad( - LegalizerHelper &Helper, MachineInstr &MI) const { +bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper, + MachineInstr &MI) const { MachineIRBuilder &B = Helper.MIRBuilder; GISelChangeObserver &Observer = Helper.Observer; - Register Dst = MI.getOperand(0).getReg(); - LLT Ty = B.getMRI()->getType(Dst); + Register OrigDst = MI.getOperand(0).getReg(); + Register Dst; + LLT Ty = B.getMRI()->getType(OrigDst); unsigned Size = Ty.getSizeInBits(); MachineFunction &MF = B.getMF(); + unsigned Opc = 0; + if (Size < 32 && ST.hasScalarSubwordLoads()) { + assert(Size == 8 || Size == 16); + Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE + : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT; + // The 8-bit and 16-bit scalar buffer load instructions have 32-bit + // destination register. + Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32)); + } else { + Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD; + Dst = OrigDst; + } Observer.changingInstr(MI); @@ -6469,19 +6501,24 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( // FIXME: We don't really need this intermediate instruction. The intrinsic // should be fixed to have a memory operand. Since it's readnone, we're not // allowed to add one. - MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); + MI.setDesc(B.getTII().get(Opc)); MI.removeOperand(1); // Remove intrinsic ID // FIXME: When intrinsic definition is fixed, this should have an MMO already. // TODO: Should this use datalayout alignment? 
const unsigned MemSize = (Size + 7) / 8; - const Align MemAlign(4); + const Align MemAlign(std::min(MemSize, 4u)); MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo(), MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, MemSize, MemAlign); MI.addMemOperand(MF, MMO); + if (Dst != OrigDst) { + MI.getOperand(0).setReg(Dst); + B.setInsertPt(B.getMBB(), ++B.getInsertPt()); + B.buildTrunc(OrigDst, Dst); + } // If we don't have 96-bit result scalar loads, widening to 128-bit should // always be legal. We may need to restore this to a 96-bit result if it turns @@ -6545,7 +6582,7 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( Register SGPR01(AMDGPU::SGPR0_SGPR1); // For code object version 5, queue_ptr is passed through implicit kernarg. - if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= + if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= AMDGPU::AMDHSA_COV5) { AMDGPUTargetLowering::ImplicitParameter Param = AMDGPUTargetLowering::QUEUE_PTR; @@ -7080,6 +7117,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: + case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16: return legalizeBufferAtomic(MI, B, IntrID); case Intrinsic::trap: return legalizeTrapIntrinsic(MI, MRI, B); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index c32303defe7f..015c71080d67 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -106,7 +106,7 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { LLVMContext &Ctx = F.getParent()->getContext(); const DataLayout &DL = F.getParent()->getDataLayout(); BasicBlock &EntryBlock = *F.begin(); - IRBuilder<> Builder(&*getInsertPt(EntryBlock)); + IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock)); const Align KernArgBaseAlign(16); // FIXME: Increase if necessary const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(); @@ -202,6 +202,7 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { // Since we don't have sub-dword scalar loads, avoid doing an extload by // loading earlier than the argument address, and extracting the relevant // bits. + // TODO: Update this for GFX12 which does have scalar sub-dword loads. // // Additionally widen any sub-dword load to i32 even if suitably aligned, // so that CSE between different argument loads works easily. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index 097722157d41..bf7f67c086f2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -323,7 +323,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { // TargetPassConfig for subtarget. 
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) { bool MadeChange = false; - bool IsV5OrAbove = AMDGPU::getCodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5; + bool IsV5OrAbove = + AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5; Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove); if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used. @@ -356,7 +357,7 @@ ModulePass *llvm::createAMDGPULowerKernelAttributesPass() { PreservedAnalyses AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) { bool IsV5OrAbove = - AMDGPU::getCodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5; + AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5; Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove); if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index d90fcac87540..289c35e11beb 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -1721,7 +1721,7 @@ void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB, unsigned CodeSourceRegister, bool IsUndefIfSource) { // If this is the function exit block, we don't need a phi. - if (MergeBB->succ_begin() == MergeBB->succ_end()) { + if (MergeBB->succ_empty()) { return; } LLVM_DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB) diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp new file mode 100644 index 000000000000..0692a12a4061 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp @@ -0,0 +1,142 @@ +//===-- AMDGPUMarkLastScratchLoad.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Mark scratch load/spill instructions which are guaranteed to be the last time +// this scratch slot is used so it can be evicted from caches. +// +// TODO: Handle general stack accesses not just spilling. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineOperand.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-mark-last-scratch-load"
+
+namespace {
+
+class AMDGPUMarkLastScratchLoad : public MachineFunctionPass {
+private:
+  LiveStacks *LS = nullptr;
+  LiveIntervals *LIS = nullptr;
+  SlotIndexes *SI = nullptr;
+  const SIInstrInfo *SII = nullptr;
+
+public:
+  static char ID;
+
+  AMDGPUMarkLastScratchLoad() : MachineFunctionPass(ID) {
+    initializeAMDGPUMarkLastScratchLoadPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<SlotIndexes>();
+    AU.addRequired<LiveIntervals>();
+    AU.addRequired<LiveStacks>();
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override {
+    return "AMDGPU Mark Last Scratch Load";
+  }
+};
+
+} // end anonymous namespace
+
+bool AMDGPUMarkLastScratchLoad::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (ST.getGeneration() < AMDGPUSubtarget::GFX12)
+    return false;
+
+  LS = &getAnalysis<LiveStacks>();
+  LIS = &getAnalysis<LiveIntervals>();
+  SI = &getAnalysis<SlotIndexes>();
+  SII = ST.getInstrInfo();
+  SlotIndexes &Slots = *LIS->getSlotIndexes();
+
+  const unsigned NumSlots = LS->getNumIntervals();
+  if (NumSlots == 0) {
+    LLVM_DEBUG(dbgs() << "No live slots, skipping\n");
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << LS->getNumIntervals() << " intervals\n");
+
+  bool Changed = false;
+
+  for (auto &[SS, LI] : *LS) {
+    for (const LiveRange::Segment &Segment : LI.segments) {
+
+      // Ignore segments that run to the end of basic block because in this case
+      // slot is still live at the end of it.
+      if (Segment.end.isBlock())
+        continue;
+
+      const int FrameIndex = Register::stackSlot2Index(LI.reg());
+      MachineInstr *LastLoad = nullptr;
+
+      MachineInstr *MISegmentEnd = SI->getInstructionFromIndex(Segment.end);
+
+      // If there is no instruction at this slot because it was deleted take the
+      // instruction from the next slot.
+      if (!MISegmentEnd) {
+        SlotIndex NextSlot = Slots.getNextNonNullIndex(Segment.end);
+        MISegmentEnd = SI->getInstructionFromIndex(NextSlot);
+      }
+
+      MachineInstr *MISegmentStart = SI->getInstructionFromIndex(Segment.start);
+      MachineBasicBlock *BB = MISegmentEnd->getParent();
+
+      // Start iteration backwards from segment end until the start of basic
+      // block or start of segment if it is in the same basic block.
+ auto End = BB->rend(); + if (MISegmentStart && MISegmentStart->getParent() == BB) + End = MISegmentStart->getReverseIterator(); + + for (auto MI = MISegmentEnd->getReverseIterator(); MI != End; ++MI) { + int LoadFI = 0; + + if (SII->isLoadFromStackSlot(*MI, LoadFI) && LoadFI == FrameIndex) { + LastLoad = &*MI; + break; + } + } + + if (LastLoad && !LastLoad->memoperands_empty()) { + MachineMemOperand *MMO = *LastLoad->memoperands_begin(); + MMO->setFlags(MOLastUse); + Changed = true; + LLVM_DEBUG(dbgs() << " Found last load: " << *LastLoad); + } + } + } + + return Changed; +} + +char AMDGPUMarkLastScratchLoad::ID = 0; + +char &llvm::AMDGPUMarkLastScratchLoadID = AMDGPUMarkLastScratchLoad::ID; + +INITIALIZE_PASS_BEGIN(AMDGPUMarkLastScratchLoad, DEBUG_TYPE, + "AMDGPU Mark last scratch load", false, false) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_DEPENDENCY(LiveStacks) +INITIALIZE_PASS_END(AMDGPUMarkLastScratchLoad, DEBUG_TYPE, + "AMDGPU Mark last scratch load", false, false) diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index bb1d6cb72e80..a1c34e92a57f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -411,6 +411,12 @@ bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg( case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT}; return Width == 16; + case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: + MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE}; + return Width == 8; + case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: + MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT}; + return Width == 16; } return false; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 391c2b9ec256..bdd4e891f158 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -449,8 +449,13 @@ bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const { const unsigned AS = MMO->getAddrSpace(); const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; + const unsigned MemSize = 8 * MMO->getSize(); + // Require 4-byte alignment. - return MMO->getAlign() >= Align(4) && + return (MMO->getAlign() >= Align(4) || + (Subtarget.hasScalarSubwordLoads() && + ((MemSize == 16 && MMO->getAlign() >= Align(2)) || + (MemSize == 8 && MMO->getAlign() >= Align(1))))) && // Can't do a scalar atomic load. 
!MMO->isAtomic() && // Don't use scalar loads for volatile accesses to non-constant address @@ -1074,6 +1079,13 @@ bool AMDGPURegisterBankInfo::applyMappingLoad( (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI))) return false; + if (LoadSize == 32 && + ((MemSize == 8 && MMO->getAlign() >= Align(1)) || + (MemSize == 16 && MMO->getAlign() >= Align(2))) && + isScalarLoadLegal(MI) && + Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12) + return false; + Register PtrReg = MI.getOperand(1).getReg(); ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); @@ -3062,6 +3074,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { applyDefaultMapping(OpdMapper); @@ -3073,7 +3086,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl( executeInWaterfallLoop(B, MI, {3, 6}); return; } - case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { + case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: + case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: + case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: + case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: + case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: { applyMappingSBufferLoad(B, OpdMapper); return; } @@ -3765,16 +3782,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // properly. // // TODO: There are additional exec masking dependencies to analyze. - if (MI.getOpcode() == TargetOpcode::G_PHI) { + if (auto *PHI = dyn_cast<GPhi>(&MI)) { unsigned ResultBank = AMDGPU::InvalidRegBankID; - Register DstReg = MI.getOperand(0).getReg(); + Register DstReg = PHI->getReg(0); // Sometimes the result may have already been assigned a bank. if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) ResultBank = DstBank->getID(); - for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { - Register Reg = MI.getOperand(I).getReg(); + for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) { + Register Reg = PHI->getIncomingValue(I); const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); // FIXME: Assuming VGPR for any undetermined inputs. @@ -4346,6 +4363,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { // vdata_out @@ -4396,7 +4414,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // initialized. break; } - case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { + case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: + case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE: + case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE: + case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT: + case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: { // Lie and claim everything is legal, even though some need to be // SGPRs. applyMapping will have to deal with it as a waterfall loop. 
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); @@ -4471,6 +4493,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_fdot2_f32_bf16: case Intrinsic::amdgcn_sudot4: case Intrinsic::amdgcn_sudot8: + case Intrinsic::amdgcn_dot4_f32_fp8_bf8: + case Intrinsic::amdgcn_dot4_f32_bf8_fp8: + case Intrinsic::amdgcn_dot4_f32_fp8_fp8: + case Intrinsic::amdgcn_dot4_f32_bf8_bf8: case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied: @@ -4836,7 +4862,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: + case Intrinsic::amdgcn_global_load_tr: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp index 552380d54dfd..6f1236fd3b7d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp @@ -105,7 +105,8 @@ constexpr unsigned FeaturesToCheck[] = {AMDGPU::FeatureGFX11Insts, AMDGPU::FeatureDot8Insts, AMDGPU::FeatureExtendedImageInsts, AMDGPU::FeatureSMemRealTime, - AMDGPU::FeatureSMemTimeInst}; + AMDGPU::FeatureSMemTimeInst, + AMDGPU::FeatureGWS}; FeatureBitset expandImpliedFeatures(const FeatureBitset &Features) { FeatureBitset Result = Features; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index fc47b02c98e0..0c759e7f3b09 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -112,7 +112,7 @@ bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) { // By default, for code object v5 and later, track only the minimum scratch // size - if (AMDGPU::getCodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 || + if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 || STI.getTargetTriple().getOS() == Triple::AMDPAL) { if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences()) AssumedStackSizeForDynamicSizeObjects = 0; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 4cc8871a00fe..67263f23b983 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -237,6 +237,7 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>; def : SourceOfDivergence<int_r600_read_tidig_x>; def : SourceOfDivergence<int_r600_read_tidig_y>; def : SourceOfDivergence<int_r600_read_tidig_z>; +def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>; def : SourceOfDivergence<int_amdgcn_global_atomic_csub>; def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>; @@ -279,9 +280,11 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>; def : 
SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cond_sub_u32>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_swap>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_add>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_sub>; @@ -295,9 +298,11 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_sub>; @@ -311,9 +316,11 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cond_sub_u32>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_swap>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_add>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_sub>; @@ -327,9 +334,11 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>; def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>; def : SourceOfDivergence<int_amdgcn_ps_live>; def : SourceOfDivergence<int_amdgcn_live_mask>; @@ -405,6 +414,7 @@ def : SourceOfDivergence<int_amdgcn_wmma_f16_16x16x16_f16>; def : SourceOfDivergence<int_amdgcn_wmma_bf16_16x16x16_bf16>; def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu8>; def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>; +def : SourceOfDivergence<int_amdgcn_global_load_tr>; // The dummy boolean output is divergent from the IR's perspective, // but the mask results are uniform. 
These produce a divergent and diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index f19c57668564..bcc7dedf3229 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -571,7 +571,7 @@ unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { // Assume all implicit inputs are used by default const Module *M = F.getParent(); unsigned NBytes = - AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56; + AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56; return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes", NBytes); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0f3bb3e7b0d8..b8a7a5e20802 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -382,6 +382,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSILowerI1CopiesPass(*PR); initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR); initializeSILowerWWMCopiesPass(*PR); + initializeAMDGPUMarkLastScratchLoadPass(*PR); initializeSILowerSGPRSpillsPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); @@ -1424,6 +1425,8 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() { addPreRewrite(); addPass(&VirtRegRewriterID); + addPass(&AMDGPUMarkLastScratchLoadID); + return true; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ba79affe683d..489cf85693ed 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -346,7 +346,7 @@ public: } bool isVRegWithInputMods() const; - bool isT16VRegWithInputMods() const; + template <bool IsFake16> bool isT16VRegWithInputMods() const; bool isSDWAOperand(MVT type) const; bool isSDWAFP16Operand() const; @@ -1303,10 +1303,8 @@ private: unsigned NextFreeSGPR, SMRange SGPRRange, unsigned &VGPRBlocks, unsigned &SGPRBlocks); bool ParseDirectiveAMDGCNTarget(); + bool ParseDirectiveAMDHSACodeObjectVersion(); bool ParseDirectiveAMDHSAKernel(); - bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor); - bool ParseDirectiveHSACodeObjectVersion(); - bool ParseDirectiveHSACodeObjectISA(); bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header); bool ParseDirectiveAMDKernelCodeT(); // TODO: Possibly make subtargetHasRegister const. 
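A note on the implicit-kernarg sizing touched in the AMDGPUSubtarget.cpp hunk above: the byte count now keys off getAMDHSACodeObjectVersion, picking 256 bytes for code object v5 and later versus 56 bytes before, with the "amdgpu-implicitarg-num-bytes" function attribute still able to override it. A minimal standalone restatement of that selection (not the LLVM API; the constant and helper names are illustrative):

// Illustrative sketch only -- mirrors the selection logic in
// AMDGPUSubtarget::getImplicitArgNumBytes, not the real LLVM interface.
#include <optional>

constexpr unsigned AMDHSA_COV5 = 5; // assumed: COV values compare numerically

// 256 implicit-kernarg bytes for code object v5+, 56 before, unless the
// "amdgpu-implicitarg-num-bytes" attribute supplies an explicit override.
unsigned implicitArgNumBytes(unsigned CodeObjectVersion,
                             std::optional<unsigned> AttrOverride) {
  unsigned NBytes = CodeObjectVersion >= AMDHSA_COV5 ? 256 : 56;
  return AttrOverride.value_or(NBytes);
}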
@@ -1688,6 +1686,7 @@ private: bool validateMIMGD16(const MCInst &Inst); bool validateMIMGMSAA(const MCInst &Inst); bool validateOpSel(const MCInst &Inst); + bool validateNeg(const MCInst &Inst, int OpName); bool validateDPP(const MCInst &Inst, const OperandVector &Operands); bool validateVccOperand(unsigned Reg) const; bool validateVOPLiteral(const MCInst &Inst, const OperandVector &Operands); @@ -2055,8 +2054,9 @@ bool AMDGPUOperand::isVRegWithInputMods() const { AsmParser->getFeatureBits()[AMDGPU::FeatureDPALU_DPP]); } -bool AMDGPUOperand::isT16VRegWithInputMods() const { - return isRegClass(AMDGPU::VGPR_32_Lo128RegClassID); +template <bool IsFake16> bool AMDGPUOperand::isT16VRegWithInputMods() const { + return isRegClass(IsFake16 ? AMDGPU::VGPR_32_Lo128RegClassID + : AMDGPU::VGPR_16_Lo128RegClassID); } bool AMDGPUOperand::isSDWAOperand(MVT type) const { @@ -4357,6 +4357,41 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { return true; } +bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, int OpName) { + assert(OpName == AMDGPU::OpName::neg_lo || OpName == AMDGPU::OpName::neg_hi); + + const unsigned Opc = Inst.getOpcode(); + uint64_t TSFlags = MII.get(Opc).TSFlags; + + // v_dot4 fp8/bf8 neg_lo/neg_hi not allowed on src0 and src1 (allowed on src2) + if (!(TSFlags & SIInstrFlags::IsDOT)) + return true; + + int NegIdx = AMDGPU::getNamedOperandIdx(Opc, OpName); + if (NegIdx == -1) + return true; + + unsigned Neg = Inst.getOperand(NegIdx).getImm(); + + // Instructions that have neg_lo or neg_hi operand but neg modifier is allowed + // on some src operands but not allowed on other. + // It is convenient that such instructions don't have src_modifiers operand + // for src operands that don't allow neg because they also don't allow opsel. + + int SrcMods[3] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}; + + for (unsigned i = 0; i < 3; ++i) { + if (!AMDGPU::hasNamedOperand(Opc, SrcMods[i])) { + if (Neg & (1 << i)) + return false; + } + } + + return true; +} + bool AMDGPUAsmParser::validateDPP(const MCInst &Inst, const OperandVector &Operands) { const unsigned Opc = Inst.getOpcode(); @@ -4834,6 +4869,16 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, "invalid op_sel operand"); return false; } + if (!validateNeg(Inst, AMDGPU::OpName::neg_lo)) { + Error(getImmLoc(AMDGPUOperand::ImmTyNegLo, Operands), + "invalid neg_lo operand"); + return false; + } + if (!validateNeg(Inst, AMDGPU::OpName::neg_hi)) { + Error(getImmLoc(AMDGPUOperand::ImmTyNegHi, Operands), + "invalid neg_hi operand"); + return false; + } if (!validateDPP(Inst, Operands)) { return false; } @@ -5087,20 +5132,6 @@ bool AMDGPUAsmParser::ParseAsAbsoluteExpression(uint32_t &Ret) { return false; } -bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major, - uint32_t &Minor) { - if (ParseAsAbsoluteExpression(Major)) - return TokError("invalid major version"); - - if (!trySkipToken(AsmToken::Comma)) - return TokError("minor version number required, comma expected"); - - if (ParseAsAbsoluteExpression(Minor)) - return TokError("invalid minor version"); - - return false; -} - bool AMDGPUAsmParser::ParseDirectiveAMDGCNTarget() { if (getSTI().getTargetTriple().getArch() != Triple::amdgcn) return TokError("directive only supported for amdgcn architecture"); @@ -5566,63 +5597,18 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { } } - getTargetStreamer().EmitAmdhsaKernelDescriptor( - getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC, - 
ReserveFlatScr, AMDGPU::getAmdhsaCodeObjectVersion()); + getTargetStreamer().EmitAmdhsaKernelDescriptor(getSTI(), KernelName, KD, + NextFreeVGPR, NextFreeSGPR, + ReserveVCC, ReserveFlatScr); return false; } -bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() { - uint32_t Major; - uint32_t Minor; - - if (ParseDirectiveMajorMinor(Major, Minor)) +bool AMDGPUAsmParser::ParseDirectiveAMDHSACodeObjectVersion() { + uint32_t Version; + if (ParseAsAbsoluteExpression(Version)) return true; - getTargetStreamer().EmitDirectiveHSACodeObjectVersion(Major, Minor); - return false; -} - -bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { - uint32_t Major; - uint32_t Minor; - uint32_t Stepping; - StringRef VendorName; - StringRef ArchName; - - // If this directive has no arguments, then use the ISA version for the - // targeted GPU. - if (isToken(AsmToken::EndOfStatement)) { - AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); - getTargetStreamer().EmitDirectiveHSACodeObjectISAV2(ISA.Major, ISA.Minor, - ISA.Stepping, - "AMD", "AMDGPU"); - return false; - } - - if (ParseDirectiveMajorMinor(Major, Minor)) - return true; - - if (!trySkipToken(AsmToken::Comma)) - return TokError("stepping version number required, comma expected"); - - if (ParseAsAbsoluteExpression(Stepping)) - return TokError("invalid stepping version"); - - if (!trySkipToken(AsmToken::Comma)) - return TokError("vendor name required, comma expected"); - - if (!parseString(VendorName, "invalid vendor name")) - return true; - - if (!trySkipToken(AsmToken::Comma)) - return TokError("arch name required, comma expected"); - - if (!parseString(ArchName, "invalid arch name")) - return true; - - getTargetStreamer().EmitDirectiveHSACodeObjectISAV2(Major, Minor, Stepping, - VendorName, ArchName); + getTargetStreamer().EmitDirectiveAMDHSACodeObjectVersion(Version); return false; } @@ -5909,16 +5895,13 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amdhsa_kernel") return ParseDirectiveAMDHSAKernel(); + if (IDVal == ".amdhsa_code_object_version") + return ParseDirectiveAMDHSACodeObjectVersion(); + // TODO: Restructure/combine with PAL metadata directive. if (IDVal == AMDGPU::HSAMD::V3::AssemblerDirectiveBegin) return ParseDirectiveHSAMetadata(); } else { - if (IDVal == ".hsa_code_object_version") - return ParseDirectiveHSACodeObjectVersion(); - - if (IDVal == ".hsa_code_object_isa") - return ParseDirectiveHSACodeObjectISA(); - if (IDVal == ".amd_kernel_code_t") return ParseDirectiveAMDKernelCodeT(); @@ -8091,9 +8074,8 @@ void AMDGPUAsmParser::onBeginOfFile() { return; if (!getTargetStreamer().getTargetID()) - getTargetStreamer().initializeTargetID(getSTI(), getSTI().getFeatureString(), - // TODO: Should try to check code object version from directive??? 
- AMDGPU::getAmdhsaCodeObjectVersion()); + getTargetStreamer().initializeTargetID(getSTI(), + getSTI().getFeatureString()); if (isHsaAbi(getSTI())) getTargetStreamer().EmitDirectiveAMDGCNTarget(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td index 9e99d382ed9b..ae0955f0cf6a 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -485,7 +485,7 @@ class MUBUF_Load_Pseudo <string opName, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, - RegisterClass vdata_rc = getVregSrcForVT<vdata_vt>.ret, + RegisterClass vdata_rc = getVregSrcForVT<vdata_vt>.ret.RegClass, RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdata_rc, isTFE>.ret> : MUBUF_Pseudo<opName, !if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)), @@ -601,7 +601,7 @@ class MUBUF_Store_Pseudo <string opName, int addrKindCopy = addrKind> : MUBUF_Pseudo<opName, (outs), - getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret], isTFE, hasGFX12Enc>.ret, + getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret.RegClass], isTFE, hasGFX12Enc>.ret, getMUBUFAsmOps<addrKindCopy, 0, 0, isTFE>.ret, pattern>, MUBUF_SetupAddr<addrKindCopy> { @@ -780,9 +780,8 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind, multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName, RegisterClass vdataClass, - ValueType vdataType, - bit isFP = isFloatType<vdataType>.ret> { - let FPAtomic = isFP in { + ValueType vdataType> { + let FPAtomic = vdataType.isFP in { def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>, MUBUFAddr64Table <0, NAME>; def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0>, @@ -804,9 +803,8 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName, multiclass MUBUF_Pseudo_Atomics_RTN <string opName, RegisterClass vdataClass, ValueType vdataType, - SDPatternOperator atomic, - bit isFP = isFloatType<vdataType>.ret> { - let FPAtomic = isFP in { + SDPatternOperator atomic> { + let FPAtomic = vdataType.isFP in { def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0, [(set vdataType:$vdata, (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), @@ -1243,6 +1241,17 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag >; +let SubtargetPredicate = isGFX12Plus in { +defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Pseudo_Atomics < + "buffer_atomic_cond_sub_u32", VGPR_32, i32 +>; + +let FPAtomic = 1 in +defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics < + "buffer_atomic_pk_add_bf16", VGPR_32, v2bf16 +>; +} + //===----------------------------------------------------------------------===// // MTBUF Instructions //===----------------------------------------------------------------------===// @@ -1560,27 +1569,28 @@ multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string # !if(!eq(RtnMode, "ret"), "", "_noret") # "_" # vt.Size); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + defvar data_vt_RC = getVregSrcForVT<data_vt>.ret.RegClass; let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) - getVregSrcForVT<data_vt>.ret:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, + data_vt_RC:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset); def : GCNPat< (vt (Op 
(MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), data_vt:$vdata_in)), !if(!eq(RtnMode, "ret"), - (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS OffsetResDag, getVregSrcForVT<data_vt>.ret)), + (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS OffsetResDag, data_vt_RC)), !if(!eq(vt, i32), sub0, sub0_sub1)), OffsetResDag) >; defvar Addr64ResDag = (!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix) - getVregSrcForVT<data_vt>.ret:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, + data_vt_RC:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset); def : GCNPat< (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), data_vt:$vdata_in)), !if(!eq(RtnMode, "ret"), - (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS Addr64ResDag, getVregSrcForVT<data_vt>.ret)), + (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS Addr64ResDag, data_vt_RC)), !if(!eq(vt, i32), sub0, sub0_sub1)), Addr64ResDag) >; @@ -1628,12 +1638,12 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst, defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); defvar CachePolicy = !if(!eq(RtnMode, "ret"), - (set_glc $cachepolicy), (timm:$cachepolicy)); + (extract_cpol_set_glc $auxiliary), (extract_cpol $auxiliary)); let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { def : GCNPat< (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset), - timm:$offset, timm:$cachepolicy, 0)), + timm:$offset, timm:$auxiliary, 0)), (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy) @@ -1641,7 +1651,7 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst, def : GCNPat< (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset), - timm:$offset, timm:$cachepolicy, timm)), + timm:$offset, timm:$auxiliary, timm)), (!cast<MUBUF_Pseudo>(Inst # "_IDXEN" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy) @@ -1649,7 +1659,7 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst, def : GCNPat< (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset, - (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, 0)), + (BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, 0)), (!cast<MUBUF_Pseudo>(Inst # "_OFFEN" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy) @@ -1657,7 +1667,7 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst, def : GCNPat< (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset, - (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, timm)), + (BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, timm)), (!cast<MUBUF_Pseudo>(Inst # "_BOTHEN" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), @@ -1703,9 +1713,17 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i64, "BUFFER_ATOMIC_XOR_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i64, "BUFFER_ATOMIC_INC_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">; -let SubtargetPredicate = HasAtomicCSubNoRtnInsts in +let OtherPredicates = [HasAtomicCSubNoRtnInsts] in defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>; +let SubtargetPredicate = isGFX12Plus in { + defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd_bf16", v2bf16, 
"BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">; + defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["ret"]>; + + let OtherPredicates = [HasAtomicCSubNoRtnInsts] in + defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>; +} + let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">; @@ -1726,35 +1744,35 @@ multiclass BufferAtomicPatterns_NO_RTN_Common<SDPatternOperator name, ValueType def : GCNPat< (NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset), timm:$offset, - timm:$cachepolicy, 0), + timm:$auxiliary, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, - timm:$offset, timm:$cachepolicy) + timm:$offset, (extract_cpol $auxiliary)) >; def : GCNPat< (NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset), timm:$offset, - timm:$cachepolicy, timm), + timm:$auxiliary, timm), (!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - timm:$offset, timm:$cachepolicy) + timm:$offset, (extract_cpol $auxiliary)) >; def : GCNPat< (NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, - timm:$cachepolicy, 0), + timm:$auxiliary, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - timm:$offset, timm:$cachepolicy) + timm:$offset, (extract_cpol $auxiliary)) >; def : GCNPat< (NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, - timm:$cachepolicy, timm), + timm:$auxiliary, timm), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) getVregSrcForVT<vt>.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, timm:$cachepolicy) + SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary)) >; } @@ -1770,14 +1788,22 @@ let OtherPredicates = [HasAtomicFaddNoRtnInsts] in defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>; let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in { + let SubtargetPredicate = isGFX9Only in defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>; + + let SubtargetPredicate = isGFX12Plus in + defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["noret"]>; } // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] let OtherPredicates = [HasAtomicFaddRtnInsts] in defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>; let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { + let SubtargetPredicate = isGFX9Only in defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; + + let SubtargetPredicate = isGFX12Plus in + defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["ret"]>; } // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] let OtherPredicates = [isGFX90APlus] in { @@ -1791,10 +1817,11 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType 
vt, ValueType data_vt, stri defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap # !if(!eq(RtnMode, "ret"), "", "_noret")); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); - defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy), - (timm:$cachepolicy)); + defvar CachePolicy = !if(!eq(RtnMode, "ret"), + (extract_cpol_set_glc $auxiliary), + (extract_cpol $auxiliary)); defvar SrcRC = getVregSrcForVT<vt>.ret; - defvar DataRC = getVregSrcForVT<data_vt>.ret; + defvar DataRC = getVregSrcForVT<data_vt>.ret.RegClass; defvar SubLo = !if(!eq(vt, i32), sub0, sub0_sub1); defvar SubHi = !if(!eq(vt, i32), sub1, sub2_sub3); @@ -1804,7 +1831,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri def : GCNPat< (vt (Op vt:$data, vt:$cmp, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset), - timm:$offset, timm:$cachepolicy, 0)), + timm:$offset, timm:$auxiliary, 0)), !if(!eq(RtnMode, "ret"), (EXTRACT_SUBREG OffsetResDag, SubLo), OffsetResDag) @@ -1818,7 +1845,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri (vt (Op vt:$data, vt:$cmp, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset), timm:$offset, - timm:$cachepolicy, timm)), + timm:$auxiliary, timm)), !if(!eq(RtnMode, "ret"), (EXTRACT_SUBREG IdxenResDag, SubLo), IdxenResDag) @@ -1832,7 +1859,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri (vt (Op vt:$data, vt:$cmp, v4i32:$rsrc, 0, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, - timm:$cachepolicy, 0)), + timm:$auxiliary, 0)), !if(!eq(RtnMode, "ret"), (EXTRACT_SUBREG OffenResDag, SubLo), OffenResDag) @@ -1846,7 +1873,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri (vt (Op vt:$data, vt:$cmp, v4i32:$rsrc, i32:$vindex, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, - timm:$cachepolicy, timm)), + timm:$auxiliary, timm)), !if(!eq(RtnMode, "ret"), (EXTRACT_SUBREG BothenResDag, SubLo), BothenResDag) @@ -2608,6 +2635,7 @@ defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x049, defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x034, "buffer_atomic_cmpswap_b32">; defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x042, "buffer_atomic_cmpswap_b64">; defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_gfx11_Renamed<0x050, "buffer_atomic_cmpswap_f32">; +defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Real_Atomic_gfx12<0x050>; defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomic_gfx11_gfx12_Renamed_gfx12_Renamed<0x037, "buffer_atomic_sub_clamp_u32", "buffer_atomic_csub_u32">; def : Mnem_gfx11_gfx12<"buffer_atomic_csub", "buffer_atomic_csub_u32">; defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x040, "buffer_atomic_dec_u32">; @@ -2632,6 +2660,8 @@ defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x033, defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x041, "buffer_atomic_swap_b64">; defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03E, "buffer_atomic_xor_b32">; defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x04B, "buffer_atomic_xor_b64">; +defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_gfx12<0x059>; +defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_gfx12<0x05a>; //===----------------------------------------------------------------------===// // MUBUF - GFX10. 
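For the buffer-atomic pattern rewrite above, the selection patterns now take the intrinsic's combined auxiliary immediate and narrow it with extract_cpol, or extract_cpol_set_glc for return variants. A rough sketch of the intent, using bit values that are assumptions for illustration rather than the real CPol encoding:

// Rough sketch of what extract_cpol / extract_cpol_set_glc compute; the
// masks below are illustrative assumptions, not the actual CPol layout.
#include <cstdint>

constexpr uint32_t CPOL_GLC  = 1u << 0; // assumed GLC bit position
constexpr uint32_t CPOL_MASK = 0x3fu;   // assumed width of the cpol field

// Keep only the cache-policy bits of the combined "auxiliary" operand.
uint32_t extractCPol(uint32_t Auxiliary) { return Auxiliary & CPOL_MASK; }

// Return-variant atomics additionally force GLC so the pre-op value is
// written back to the destination register.
uint32_t extractCPolSetGLC(uint32_t Auxiliary) {
  return extractCPol(Auxiliary) | CPOL_GLC;
}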
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td index 3cccd8c50e66..d09e1ef3bcb2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -437,6 +437,12 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag, let has_gds = 0; } +class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0, + bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), + (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> { + let AddedComplexity = complexity; +} + defm DS_ADD_U32 : DS_1A1D_NORET_mc<"ds_add_u32">; defm DS_SUB_U32 : DS_1A1D_NORET_mc<"ds_sub_u32">; defm DS_RSUB_U32 : DS_1A1D_NORET_mc<"ds_rsub_u32">; @@ -486,10 +492,10 @@ let SubtargetPredicate = isGFX90APlus in { } // End SubtargetPredicate = isGFX90APlus let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { - defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">; - defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">; - defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">; - defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">; + defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc<"ds_pk_add_f16">; + defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">; + defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc<"ds_pk_add_bf16">; + defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">; } // End SubtargetPredicate = HasAtomicDsPkAdd16Insts defm DS_CMPSTORE_B32 : DS_1A2D_NORET_mc<"ds_cmpstore_b32">; @@ -732,9 +738,22 @@ def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">; let SubtargetPredicate = isGFX12Plus in { +defm DS_COND_SUB_U32 : DS_1A1D_NORET_mc<"ds_cond_sub_u32">; +defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32, "ds_cond_sub_u32">; defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">; defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_clamp_rtn_u32", VGPR_32, "ds_sub_clamp_u32">; +multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst, + ValueType vt, string frag> { + def : DSAtomicRetPat<inst, vt, + !cast<PatFrag>(frag#"_local_addrspace")>; + + let OtherPredicates = [HasAtomicCSubNoRtnInsts] in + def : DSAtomicRetPat<noRetInst, vt, + !cast<PatFrag>(frag#"_noret_local_addrspace"), /* complexity */ 1>; +} + +defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">; } // let SubtargetPredicate = isGFX12Plus //===----------------------------------------------------------------------===// @@ -954,12 +973,6 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">; } // End AddedComplexity = 100 -class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0, - bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> { - let AddedComplexity = complexity; -} - multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; @@ -1237,8 +1250,14 @@ defm DS_MIN_NUM_F64 : DS_Real_Renamed_gfx12<0x052, DS_MIN_F64, "ds_min_num defm DS_MAX_NUM_F64 : 
DS_Real_Renamed_gfx12<0x053, DS_MAX_F64, "ds_max_num_f64">; defm DS_MIN_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x072, DS_MIN_RTN_F64, "ds_min_num_rtn_f64">; defm DS_MAX_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x073, DS_MAX_RTN_F64, "ds_max_num_rtn_f64">; +defm DS_COND_SUB_U32 : DS_Real_gfx12<0x098>; defm DS_SUB_CLAMP_U32 : DS_Real_gfx12<0x099>; +defm DS_COND_SUB_RTN_U32 : DS_Real_gfx12<0x0a8>; defm DS_SUB_CLAMP_RTN_U32 : DS_Real_gfx12<0x0a9>; +defm DS_PK_ADD_F16 : DS_Real_gfx12<0x09a>; +defm DS_PK_ADD_RTN_F16 : DS_Real_gfx12<0x0aa>; +defm DS_PK_ADD_BF16 : DS_Real_gfx12<0x09b>; +defm DS_PK_ADD_RTN_BF16 : DS_Real_gfx12<0x0ab>; //===----------------------------------------------------------------------===// // GFX11. @@ -1248,7 +1267,7 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { multiclass DS_Real_gfx11<bits<8> op> { def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, !cast<DS_Pseudo>(NAME), - SIEncodingFamily.GFX11>; + SIEncodingFamily.GFX11>; } multiclass DS_Real_Renamed_gfx11<bits<8> op, DS_Pseudo backing_pseudo, string real_name> { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 9dff3f6c2efd..86096b0d80b4 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -544,6 +544,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS); if (Res) break; + + Res = tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS); + if (Res) + break; } // Reinitialize Bytes Bytes = Bytes_.slice(0, MaxInstBytesNum); @@ -2180,7 +2184,8 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective( KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); } - if (AMDGPU::getAmdhsaCodeObjectVersion() >= AMDGPU::AMDHSA_COV5) + // FIXME: We should be looking at the ELF header ABI version for this. 
+ if (AMDGPU::getDefaultAMDHSACodeObjectVersion() >= AMDGPU::AMDHSA_COV5) PRINT_DIRECTIVE(".amdhsa_uses_dynamic_stack", KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td index 16a8b770e057..cb830b128df8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -535,7 +535,6 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< ValueType vt, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = isFloatType<data_vt>.ret, RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), @@ -544,7 +543,7 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< GlobalSaddrTable<0, opName>, AtomicNoRet <opName, 0> { let PseudoInstr = NAME; - let FPAtomic = isFP; + let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // Prefer global atomics if available } } @@ -555,7 +554,6 @@ multiclass FLAT_Atomic_Pseudo_RTN< ValueType vt, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = isFloatType<data_vt>.ret, RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { def _RTN : FLAT_AtomicRet_Pseudo <opName, (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst), @@ -563,7 +561,7 @@ multiclass FLAT_Atomic_Pseudo_RTN< " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn">, AtomicNoRet <opName, 1> { - let FPAtomic = isFP; + let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // Prefer global atomics if available } } @@ -574,10 +572,9 @@ multiclass FLAT_Atomic_Pseudo< ValueType vt, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = isFloatType<data_vt>.ret, RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { - defm "" : FLAT_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc, isFP, data_op>; - defm "" : FLAT_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc, isFP, data_op>; + defm "" : FLAT_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc, data_op>; + defm "" : FLAT_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc, data_op>; } multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< @@ -586,7 +583,6 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< ValueType vt, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = isFloatType<data_vt>.ret, RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { def "" : FLAT_AtomicNoRet_Pseudo <opName, @@ -597,7 +593,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< AtomicNoRet <opName, 0> { let has_saddr = 1; let PseudoInstr = NAME; - let FPAtomic = isFP; + let FPAtomic = data_vt.isFP; } def _SADDR : FLAT_AtomicNoRet_Pseudo <opName, @@ -609,7 +605,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< let has_saddr = 1; let enabled_saddr = 1; let PseudoInstr = NAME#"_SADDR"; - let FPAtomic = isFP; + let FPAtomic = data_vt.isFP; } } @@ -619,7 +615,6 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN< ValueType vt, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = isFloatType<data_vt>.ret, RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret, RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> { @@ -630,7 +625,7 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN< GlobalSaddrTable<0, opName#"_rtn">, AtomicNoRet <opName, 1> { let has_saddr = 1; - let FPAtomic = isFP; + let FPAtomic = data_vt.isFP; } def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, @@ -642,7 +637,7 @@ multiclass 
FLAT_Global_Atomic_Pseudo_RTN< let has_saddr = 1; let enabled_saddr = 1; let PseudoInstr = NAME#"_SADDR_RTN"; - let FPAtomic = isFP; + let FPAtomic = data_vt.isFP; } } @@ -823,6 +818,7 @@ let SubtargetPredicate = HasFlatAtomicFaddF32Inst in { let SubtargetPredicate = isGFX12Plus in { defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPR_32, i32>; + defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_cond_sub_u32", VGPR_32, i32>; } // End SubtargetPredicate = isGFX12Plus defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; @@ -949,6 +945,7 @@ defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ssho defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">; let SubtargetPredicate = isGFX12Plus in { + defm GLOBAL_ATOMIC_COND_SUB_U32 : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPR_32, i32>; defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>; def GLOBAL_INV : FLAT_Global_Invalidate_Writeback<"global_inv">; @@ -995,6 +992,17 @@ defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_d } // End SubtargetPredicate = HasFlatScratchInsts +let SubtargetPredicate = isGFX12Plus in { + let WaveSizePredicate = isWave32 in { + defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w32", VReg_128>; + defm GLOBAL_LOAD_TR_B64_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w32", VReg_64>; + } + let WaveSizePredicate = isWave64 in { + defm GLOBAL_LOAD_TR_B128_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VReg_64>; + defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>; + } +} // End SubtargetPredicate = isGFX12Plus + let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in { defm GLOBAL_ATOMIC_FCMPSWAP : FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>; @@ -1100,23 +1108,43 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, (inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset) >; -multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt, - ValueType data_vt = vt, bit isIntr = 0> { - defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_"#vt.Size)); +multiclass FlatAtomicNoRtnPatBase <string inst, string node, ValueType vt, + ValueType data_vt = vt> { + + defvar noRtnNode = !cast<PatFrags>(node); let AddedComplexity = 1 in def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; } -multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt, - ValueType data_vt = vt, bit isIntr = 0> { - defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_"#vt.Size)); +multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addrSpaceSuffix, + ValueType vt> : + FlatAtomicNoRtnPatBase<inst, node # "_noret_" # addrSpaceSuffix, vt, vt>; + +multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt, + ValueType data_vt = vt, bit isIntr = 0> : + FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt.Size), vt, data_vt>; + + +multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt, + ValueType data_vt = vt> { + + defvar rtnNode = !cast<SDPatternOperator>(node); def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), 
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; } +multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix, + ValueType vt> : + FlatAtomicRtnPatBase<inst, intr # "_" # addrSpaceSuffix, vt, vt>; + +multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt, + ValueType data_vt = vt, bit isIntr = 0> : + FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt.Size), vt, data_vt>; + + multiclass FlatAtomicPat <string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : FlatAtomicRtnPat<inst, node, vt, data_vt, isIntr>, @@ -1296,6 +1324,13 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_"#as, i64 defm : FlatAtomicPat <"FLAT_ATOMIC_XOR_X2", "atomic_load_xor_"#as, i64>; } // end foreach as +let SubtargetPredicate = isGFX12Plus in { + defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >; + + let OtherPredicates = [HasAtomicCSubNoRtnInsts] in + defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; +} + def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>; def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>; @@ -1557,8 +1592,28 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", "atomic_swap_global", i64> defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_global", i64, v2i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>; +let SubtargetPredicate = isGFX12Plus in { + defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>; + + let OtherPredicates = [HasAtomicCSubNoRtnInsts] in + defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>; +} + let OtherPredicates = [isGFX12Plus] in { defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>; + + let WaveSizePredicate = isWave32 in { + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr, v2i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8i16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8f16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8bf16>; + } + let WaveSizePredicate = isWave64 in { + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr, i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4i16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4f16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4bf16>; + } } let OtherPredicates = [isGFX10Plus] in { @@ -2523,7 +2578,8 @@ multiclass VFLAT_Aliases_gfx12<string ps, string opName, int renamed, string ali def _alias_gfx12 : MnemonicAlias<alias, opName>, Requires<[isGFX12Plus]>; } -multiclass VFLAT_Real_Base_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> : +multiclass VFLAT_Real_Base_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME), + int renamed = false, string alias = ""> : VFLAT_Aliases_gfx12<ps, opName, renamed, alias> { def _gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps), opName> { let 
Inst{6-0} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); @@ -2557,20 +2613,24 @@ multiclass VFLAT_Real_SVS_gfx12<bits<8> op, string ps, string opName> { } } -multiclass VFLAT_Real_Atomics_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> : +multiclass VFLAT_Real_Atomics_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME), + int renamed = false, string alias = ""> : VFLAT_Real_Base_gfx12<op, ps, opName, renamed, alias>, VFLAT_Real_RTN_gfx12<op, ps, opName>; -multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> : +multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME), + int renamed = false, string alias = ""> : VFLAT_Real_Base_gfx12<op, ps, opName, renamed, alias>, VFLAT_Real_SADDR_gfx12<op, ps, opName>; -multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> : +multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME), + int renamed = false, string alias = ""> : VGLOBAL_Real_AllAddr_gfx12<op, ps, opName, renamed, alias>, VFLAT_Real_RTN_gfx12<op, ps, opName>, VFLAT_Real_SADDR_RTN_gfx12<op, ps, opName>; -multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op, string ps, string opName, int renamed = false> : +multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME), + int renamed = false> : VFLAT_Real_Base_gfx12<op, ps, opName, renamed>, VFLAT_Real_SADDR_gfx12<op, ps, opName>, VFLAT_Real_ST_gfx12<op, ps, opName>, @@ -2591,14 +2651,14 @@ defm FLAT_STORE_B32 : VFLAT_Real_Base_gfx12<0x01a, "FLAT_STORE_DW defm FLAT_STORE_B64 : VFLAT_Real_Base_gfx12<0x01b, "FLAT_STORE_DWORDX2", "flat_store_b64", true>; defm FLAT_STORE_B96 : VFLAT_Real_Base_gfx12<0x01c, "FLAT_STORE_DWORDX3", "flat_store_b96", true>; defm FLAT_STORE_B128 : VFLAT_Real_Base_gfx12<0x01d, "FLAT_STORE_DWORDX4", "flat_store_b128", true>; -defm FLAT_LOAD_D16_U8 : VFLAT_Real_Base_gfx12<0x01e, "FLAT_LOAD_UBYTE_D16", "flat_load_d16_u8">; -defm FLAT_LOAD_D16_I8 : VFLAT_Real_Base_gfx12<0x01f, "FLAT_LOAD_SBYTE_D16", "flat_load_d16_i8">; -defm FLAT_LOAD_D16_B16 : VFLAT_Real_Base_gfx12<0x020, "FLAT_LOAD_SHORT_D16", "flat_load_d16_b16">; -defm FLAT_LOAD_D16_HI_U8 : VFLAT_Real_Base_gfx12<0x021, "FLAT_LOAD_UBYTE_D16_HI", "flat_load_d16_hi_u8">; -defm FLAT_LOAD_D16_HI_I8 : VFLAT_Real_Base_gfx12<0x022, "FLAT_LOAD_SBYTE_D16_HI", "flat_load_d16_hi_i8">; -defm FLAT_LOAD_D16_HI_B16 : VFLAT_Real_Base_gfx12<0x023, "FLAT_LOAD_SHORT_D16_HI", "flat_load_d16_hi_b16">; -defm FLAT_STORE_D16_HI_B8 : VFLAT_Real_Base_gfx12<0x024, "FLAT_STORE_BYTE_D16_HI", "flat_store_d16_hi_b8">; -defm FLAT_STORE_D16_HI_B16 : VFLAT_Real_Base_gfx12<0x025, "FLAT_STORE_SHORT_D16_HI", "flat_store_d16_hi_b16">; +defm FLAT_LOAD_D16_U8 : VFLAT_Real_Base_gfx12<0x01e, "FLAT_LOAD_UBYTE_D16">; +defm FLAT_LOAD_D16_I8 : VFLAT_Real_Base_gfx12<0x01f, "FLAT_LOAD_SBYTE_D16">; +defm FLAT_LOAD_D16_B16 : VFLAT_Real_Base_gfx12<0x020, "FLAT_LOAD_SHORT_D16">; +defm FLAT_LOAD_D16_HI_U8 : VFLAT_Real_Base_gfx12<0x021, "FLAT_LOAD_UBYTE_D16_HI">; +defm FLAT_LOAD_D16_HI_I8 : VFLAT_Real_Base_gfx12<0x022, "FLAT_LOAD_SBYTE_D16_HI">; +defm FLAT_LOAD_D16_HI_B16 : VFLAT_Real_Base_gfx12<0x023, "FLAT_LOAD_SHORT_D16_HI">; +defm FLAT_STORE_D16_HI_B8 : VFLAT_Real_Base_gfx12<0x024, "FLAT_STORE_BYTE_D16_HI">; +defm FLAT_STORE_D16_HI_B16 : VFLAT_Real_Base_gfx12<0x025, "FLAT_STORE_SHORT_D16_HI">; defm FLAT_ATOMIC_SWAP_B32 : 
VFLAT_Real_Atomics_gfx12<0x033, "FLAT_ATOMIC_SWAP", "flat_atomic_swap_b32", true>; defm FLAT_ATOMIC_CMPSWAP_B32 : VFLAT_Real_Atomics_gfx12<0x034, "FLAT_ATOMIC_CMPSWAP", "flat_atomic_cmpswap_b32", true>; defm FLAT_ATOMIC_ADD_U32 : VFLAT_Real_Atomics_gfx12<0x035, "FLAT_ATOMIC_ADD", "flat_atomic_add_u32", true>; @@ -2626,9 +2686,12 @@ defm FLAT_ATOMIC_OR_B64 : VFLAT_Real_Atomics_gfx12<0x04a, "FLAT_ATOMI defm FLAT_ATOMIC_XOR_B64 : VFLAT_Real_Atomics_gfx12<0x04b, "FLAT_ATOMIC_XOR_X2", "flat_atomic_xor_b64", true>; defm FLAT_ATOMIC_INC_U64 : VFLAT_Real_Atomics_gfx12<0x04c, "FLAT_ATOMIC_INC_X2", "flat_atomic_inc_u64", true>; defm FLAT_ATOMIC_DEC_U64 : VFLAT_Real_Atomics_gfx12<0x04d, "FLAT_ATOMIC_DEC_X2", "flat_atomic_dec_u64", true>; +defm FLAT_ATOMIC_COND_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x050, "FLAT_ATOMIC_COND_SUB_U32", "flat_atomic_cond_sub_u32">; defm FLAT_ATOMIC_MIN_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x051, "FLAT_ATOMIC_FMIN", "flat_atomic_min_num_f32", true, "flat_atomic_min_f32">; defm FLAT_ATOMIC_MAX_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x052, "FLAT_ATOMIC_FMAX", "flat_atomic_max_num_f32", true, "flat_atomic_max_f32">; -defm FLAT_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056, "FLAT_ATOMIC_ADD_F32", "flat_atomic_add_f32">; +defm FLAT_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056>; +defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>; +defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>; // ENC_VGLOBAL. defm GLOBAL_LOAD_U8 : VGLOBAL_Real_AllAddr_gfx12<0x010, "GLOBAL_LOAD_UBYTE", "global_load_u8", true>; @@ -2645,16 +2708,16 @@ defm GLOBAL_STORE_B32 : VGLOBAL_Real_AllAddr_gfx12<0x01a, "GLOBAL_S defm GLOBAL_STORE_B64 : VGLOBAL_Real_AllAddr_gfx12<0x01b, "GLOBAL_STORE_DWORDX2", "global_store_b64", true>; defm GLOBAL_STORE_B96 : VGLOBAL_Real_AllAddr_gfx12<0x01c, "GLOBAL_STORE_DWORDX3", "global_store_b96", true>; defm GLOBAL_STORE_B128 : VGLOBAL_Real_AllAddr_gfx12<0x01d, "GLOBAL_STORE_DWORDX4", "global_store_b128", true>; -defm GLOBAL_LOAD_D16_U8 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "GLOBAL_LOAD_UBYTE_D16", "global_load_d16_u8">; -defm GLOBAL_LOAD_D16_I8 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "GLOBAL_LOAD_SBYTE_D16", "global_load_d16_i8">; -defm GLOBAL_LOAD_D16_B16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "GLOBAL_LOAD_SHORT_D16", "global_load_d16_b16">; -defm GLOBAL_LOAD_D16_HI_U8 : VGLOBAL_Real_AllAddr_gfx12<0x021, "GLOBAL_LOAD_UBYTE_D16_HI", "global_load_d16_hi_u8">; -defm GLOBAL_LOAD_D16_HI_I8 : VGLOBAL_Real_AllAddr_gfx12<0x022, "GLOBAL_LOAD_SBYTE_D16_HI", "global_load_d16_hi_i8">; -defm GLOBAL_LOAD_D16_HI_B16 : VGLOBAL_Real_AllAddr_gfx12<0x023, "GLOBAL_LOAD_SHORT_D16_HI", "global_load_d16_hi_b16">; -defm GLOBAL_STORE_D16_HI_B8 : VGLOBAL_Real_AllAddr_gfx12<0x024, "GLOBAL_STORE_BYTE_D16_HI", "global_store_d16_hi_b8">; -defm GLOBAL_STORE_D16_HI_B16 : VGLOBAL_Real_AllAddr_gfx12<0x025, "GLOBAL_STORE_SHORT_D16_HI", "global_store_d16_hi_b16">; -defm GLOBAL_LOAD_ADDTID_B32 : VGLOBAL_Real_AllAddr_gfx12<0x028, "GLOBAL_LOAD_DWORD_ADDTID", "global_load_addtid_b32">; -defm GLOBAL_STORE_ADDTID_B32 : VGLOBAL_Real_AllAddr_gfx12<0x029, "GLOBAL_STORE_DWORD_ADDTID", "global_store_addtid_b32">; +defm GLOBAL_LOAD_D16_U8 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "GLOBAL_LOAD_UBYTE_D16">; +defm GLOBAL_LOAD_D16_I8 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "GLOBAL_LOAD_SBYTE_D16">; +defm GLOBAL_LOAD_D16_B16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "GLOBAL_LOAD_SHORT_D16">; +defm GLOBAL_LOAD_D16_HI_U8 : VGLOBAL_Real_AllAddr_gfx12<0x021, "GLOBAL_LOAD_UBYTE_D16_HI">; +defm GLOBAL_LOAD_D16_HI_I8 : 
VGLOBAL_Real_AllAddr_gfx12<0x022, "GLOBAL_LOAD_SBYTE_D16_HI">; +defm GLOBAL_LOAD_D16_HI_B16 : VGLOBAL_Real_AllAddr_gfx12<0x023, "GLOBAL_LOAD_SHORT_D16_HI">; +defm GLOBAL_STORE_D16_HI_B8 : VGLOBAL_Real_AllAddr_gfx12<0x024, "GLOBAL_STORE_BYTE_D16_HI">; +defm GLOBAL_STORE_D16_HI_B16 : VGLOBAL_Real_AllAddr_gfx12<0x025, "GLOBAL_STORE_SHORT_D16_HI">; +defm GLOBAL_LOAD_ADDTID_B32 : VGLOBAL_Real_AllAddr_gfx12<0x028, "GLOBAL_LOAD_DWORD_ADDTID">; +defm GLOBAL_STORE_ADDTID_B32 : VGLOBAL_Real_AllAddr_gfx12<0x029, "GLOBAL_STORE_DWORD_ADDTID">; defm GLOBAL_ATOMIC_SWAP_B32 : VGLOBAL_Real_Atomics_gfx12<0x033, "GLOBAL_ATOMIC_SWAP", "global_atomic_swap_b32", true>; defm GLOBAL_ATOMIC_CMPSWAP_B32 : VGLOBAL_Real_Atomics_gfx12<0x034, "GLOBAL_ATOMIC_CMPSWAP", "global_atomic_cmpswap_b32", true>; @@ -2683,14 +2746,28 @@ defm GLOBAL_ATOMIC_OR_B64 : VGLOBAL_Real_Atomics_gfx12<0x04a, "GLOBAL_A defm GLOBAL_ATOMIC_XOR_B64 : VGLOBAL_Real_Atomics_gfx12<0x04b, "GLOBAL_ATOMIC_XOR_X2", "global_atomic_xor_b64", true>; defm GLOBAL_ATOMIC_INC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04c, "GLOBAL_ATOMIC_INC_X2", "global_atomic_inc_u64", true>; defm GLOBAL_ATOMIC_DEC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_ATOMIC_DEC_X2", "global_atomic_dec_u64", true>; +defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050, "GLOBAL_ATOMIC_COND_SUB_U32", "global_atomic_cond_sub_u32">; defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">; defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">; -defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">; -defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">; +defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>; + +let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in { + defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx12<0x057, "GLOBAL_LOAD_TR_B128_w32", "global_load_tr_b128">; + defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx12<0x058, "GLOBAL_LOAD_TR_B64_w32", "global_load_tr_b64">; +} + +let WaveSizePredicate = isWave64, DecoderNamespace = "GFX12W64" in { + defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12<0x057, "GLOBAL_LOAD_TR_B128_w64", "global_load_tr_b128">; + defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12<0x058, "GLOBAL_LOAD_TR_B64_w64", "global_load_tr_b64">; +} + +defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059>; +defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a>; -defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b, "GLOBAL_INV", "global_inv">; -defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c, "GLOBAL_WB", "global_wb">; -defm GLOBAL_WBINV : VFLAT_Real_Base_gfx12<0x04f, "GLOBAL_WBINV", "global_wbinv">; +defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b>; +defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c>; +defm GLOBAL_WBINV : VFLAT_Real_Base_gfx12<0x04f>; // ENC_VSCRATCH. 
defm SCRATCH_LOAD_U8 : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>; @@ -2707,11 +2784,11 @@ defm SCRATCH_STORE_B32 : VSCRATCH_Real_AllAddr_gfx12<0x1a, "SCRATCH_ defm SCRATCH_STORE_B64 : VSCRATCH_Real_AllAddr_gfx12<0x1b, "SCRATCH_STORE_DWORDX2", "scratch_store_b64", true>; defm SCRATCH_STORE_B96 : VSCRATCH_Real_AllAddr_gfx12<0x1c, "SCRATCH_STORE_DWORDX3", "scratch_store_b96", true>; defm SCRATCH_STORE_B128 : VSCRATCH_Real_AllAddr_gfx12<0x1d, "SCRATCH_STORE_DWORDX4", "scratch_store_b128", true>; -defm SCRATCH_LOAD_D16_U8 : VSCRATCH_Real_AllAddr_gfx12<0x1e, "SCRATCH_LOAD_UBYTE_D16", "scratch_load_d16_u8">; -defm SCRATCH_LOAD_D16_I8 : VSCRATCH_Real_AllAddr_gfx12<0x1f, "SCRATCH_LOAD_SBYTE_D16", "scratch_load_d16_i8">; -defm SCRATCH_LOAD_D16_B16 : VSCRATCH_Real_AllAddr_gfx12<0x20, "SCRATCH_LOAD_SHORT_D16", "scratch_load_d16_b16">; -defm SCRATCH_LOAD_D16_HI_U8 : VSCRATCH_Real_AllAddr_gfx12<0x21, "SCRATCH_LOAD_UBYTE_D16_HI", "scratch_load_d16_hi_u8">; -defm SCRATCH_LOAD_D16_HI_I8 : VSCRATCH_Real_AllAddr_gfx12<0x22, "SCRATCH_LOAD_SBYTE_D16_HI", "scratch_load_d16_hi_i8">; -defm SCRATCH_LOAD_D16_HI_B16 : VSCRATCH_Real_AllAddr_gfx12<0x23, "SCRATCH_LOAD_SHORT_D16_HI", "scratch_load_d16_hi_b16">; -defm SCRATCH_STORE_D16_HI_B8 : VSCRATCH_Real_AllAddr_gfx12<0x24, "SCRATCH_STORE_BYTE_D16_HI", "scratch_store_d16_hi_b8">; -defm SCRATCH_STORE_D16_HI_B16 : VSCRATCH_Real_AllAddr_gfx12<0x25, "SCRATCH_STORE_SHORT_D16_HI", "scratch_store_d16_hi_b16">; +defm SCRATCH_LOAD_D16_U8 : VSCRATCH_Real_AllAddr_gfx12<0x1e, "SCRATCH_LOAD_UBYTE_D16">; +defm SCRATCH_LOAD_D16_I8 : VSCRATCH_Real_AllAddr_gfx12<0x1f, "SCRATCH_LOAD_SBYTE_D16">; +defm SCRATCH_LOAD_D16_B16 : VSCRATCH_Real_AllAddr_gfx12<0x20, "SCRATCH_LOAD_SHORT_D16">; +defm SCRATCH_LOAD_D16_HI_U8 : VSCRATCH_Real_AllAddr_gfx12<0x21, "SCRATCH_LOAD_UBYTE_D16_HI">; +defm SCRATCH_LOAD_D16_HI_I8 : VSCRATCH_Real_AllAddr_gfx12<0x22, "SCRATCH_LOAD_SBYTE_D16_HI">; +defm SCRATCH_LOAD_D16_HI_B16 : VSCRATCH_Real_AllAddr_gfx12<0x23, "SCRATCH_LOAD_SHORT_D16_HI">; +defm SCRATCH_STORE_D16_HI_B8 : VSCRATCH_Real_AllAddr_gfx12<0x24, "SCRATCH_STORE_BYTE_D16_HI">; +defm SCRATCH_STORE_D16_HI_B16 : VSCRATCH_Real_AllAddr_gfx12<0x25, "SCRATCH_STORE_SHORT_D16_HI">; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index a75082268c77..94d28dc0a2c7 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -274,8 +274,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, break; } - if (auto *Mod0 = TII->getNamedOperand(OrigMI, - AMDGPU::OpName::src0_modifiers)) { + auto *Mod0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers); + if (Mod0) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src0_modifiers)); assert(HasVOP3DPP || @@ -298,8 +298,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst->getOperand(NumOperands).setIsKill(false); ++NumOperands; - if (auto *Mod1 = TII->getNamedOperand(OrigMI, - AMDGPU::OpName::src1_modifiers)) { + auto *Mod1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers); + if (Mod1) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src1_modifiers)); assert(HasVOP3DPP || @@ -330,8 +330,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst.add(*Src1); ++NumOperands; } - if (auto *Mod2 = - TII->getNamedOperand(OrigMI, 
AMDGPU::OpName::src2_modifiers)) { + + auto *Mod2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers); + if (Mod2) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers)); assert(HasVOP3DPP || @@ -350,6 +351,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst.add(*Src2); ++NumOperands; } + if (HasVOP3DPP) { auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp); if (ClampOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::clamp)) { @@ -368,7 +370,13 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, // all 1. if (auto *OpSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) { - auto OpSel = OpSelOpr->getImm(); + int64_t OpSel = 0; + OpSel |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_0) << 0) : 0); + OpSel |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_0) << 1) : 0); + OpSel |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_0) << 2) : 0); + if (Mod0 && TII->isVOP3(OrigMI) && !TII->isVOP3P(OrigMI)) + OpSel |= !!(Mod0->getImm() & SISrcMods::DST_OP_SEL) << 3; + if (OpSel != 0) { LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n"); Fail = true; @@ -379,7 +387,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, } if (auto *OpSelHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) { - auto OpSelHi = OpSelHiOpr->getImm(); + int64_t OpSelHi = 0; + OpSelHi |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_1) << 0) : 0); + OpSelHi |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_1) << 1) : 0); + OpSelHi |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_1) << 2) : 0); + // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check // the bitmask for 3 op_sel_hi bits set assert(Src2 && "Expected vop3p with 3 operands"); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index bcd93e30d6c2..b6e4e65ff5b0 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -163,6 +163,7 @@ static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, static bool isPermlane(const MachineInstr &MI) { unsigned Opcode = MI.getOpcode(); return Opcode == AMDGPU::V_PERMLANE16_B32_e64 || + Opcode == AMDGPU::V_PERMLANE64_B32 || Opcode == AMDGPU::V_PERMLANEX16_B32_e64 || Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 || Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64; @@ -1143,6 +1144,7 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { if (!ST.hasVMEMtoScalarWriteHazard()) return false; + assert(!ST.hasExtendedWaitCounts()); if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI)) return false; @@ -1189,6 +1191,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { if (!ST.hasSMEMtoVectorWriteHazard()) return false; + assert(!ST.hasExtendedWaitCounts()); if (!SIInstrInfo::isVALU(*MI)) return false; @@ -1242,7 +1245,8 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { case AMDGPU::S_WAITCNT: { const int64_t Imm = MI.getOperand(0).getImm(); AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm); - return (Decoded.LgkmCnt == 0); + // DsCnt corresponds to LGKMCnt here. + return (Decoded.DsCnt == 0); } default: // SOPP instructions cannot mitigate the hazard. 
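The GCNDPPCombine::createDPPInst change above stops trusting a standalone op_sel immediate and instead reassembles it from the per-source src*_modifiers words (plus the destination bit for non-packed VOP3), rejecting the DPP combine unless the result is zero. A standalone restatement under assumed flag values:

// Standalone restatement of the op_sel reconstruction in GCNDPPCombine;
// the modifier flag values are assumptions for illustration only.
#include <cstdint>
#include <optional>

constexpr int64_t MOD_OP_SEL_0   = 1 << 2; // assumed per-source half select
constexpr int64_t MOD_DST_OP_SEL = 1 << 3; // assumed destination select (VOP3)

// Bit i of the result corresponds to source operand i; bit 3 to the dst.
int64_t rebuildOpSel(std::optional<int64_t> Mod0, std::optional<int64_t> Mod1,
                     std::optional<int64_t> Mod2, bool IsVOP3NotVOP3P) {
  int64_t OpSel = 0;
  if (Mod0) OpSel |= int64_t(bool(*Mod0 & MOD_OP_SEL_0)) << 0;
  if (Mod1) OpSel |= int64_t(bool(*Mod1 & MOD_OP_SEL_0)) << 1;
  if (Mod2) OpSel |= int64_t(bool(*Mod2 & MOD_OP_SEL_0)) << 2;
  if (Mod0 && IsVOP3NotVOP3P)
    OpSel |= int64_t(bool(*Mod0 & MOD_DST_OP_SEL)) << 3;
  return OpSel; // the combine bails out unless this is zero
}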
@@ -1272,7 +1276,11 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { } bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { - if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI)) + if (!ST.hasVcmpxExecWARHazard()) + return false; + assert(!ST.hasExtendedWaitCounts()); + + if (!SIInstrInfo::isVALU(*MI)) return false; const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -1342,6 +1350,7 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { return false; assert(ST.hasLdsBranchVmemWARHazard()); + assert(!ST.hasExtendedWaitCounts()); auto IsHazardInst = [](const MachineInstr &MI) { if (SIInstrInfo::isDS(MI)) @@ -1451,6 +1460,8 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); }; bool LdsdirCanWait = ST.hasLdsWaitVMSRC(); + // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT + // according to the type of VMEM instruction. auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) { return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) || (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) || @@ -1476,11 +1487,11 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { } bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { - if (!ST.isWave64()) - return false; if (!ST.hasVALUPartialForwardingHazard()) return false; - if (!SIInstrInfo::isVALU(*MI)) + assert(!ST.hasExtendedWaitCounts()); + + if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI)) return false; SmallSetVector<Register, 4> SrcVGPRs; @@ -1627,6 +1638,8 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { if (!ST.hasVALUTransUseHazard()) return false; + assert(!ST.hasExtendedWaitCounts()); + if (!SIInstrInfo::isVALU(*MI)) return false; @@ -1766,6 +1779,7 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { if (!ST.hasShift64HighRegBug()) return false; + assert(!ST.hasExtendedWaitCounts()); switch (MI->getOpcode()) { default: @@ -1895,6 +1909,7 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { if (!ST.hasFPAtomicToDenormModeHazard()) return 0; + assert(!ST.hasExtendedWaitCounts()); if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) return 0; @@ -2720,11 +2735,11 @@ bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { } bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { - if (!ST.isWave64()) - return false; if (!ST.hasVALUMaskWriteHazard()) return false; - if (!SIInstrInfo::isSALU(*MI)) + assert(!ST.hasExtendedWaitCounts()); + + if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI)) return false; // The hazard sequence is three instructions: diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 4c9ad9b5bcf7..272cc7fa6bc6 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -237,7 +237,7 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget<GCNSubtarget>(); - if (!ST->hasNSAEncoding()) + if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding()) return false; MRI = &MF.getRegInfo(); diff --git 
a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 85d062a9a6f5..8019b98b1c68 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -155,6 +155,7 @@ protected: bool HasDot10Insts = false; bool HasMAIInsts = false; bool HasFP8Insts = false; + bool HasFP8ConversionInsts = false; bool HasPkFmacF16Inst = false; bool HasAtomicDsPkAdd16Insts = false; bool HasAtomicFlatPkAdd16Insts = false; @@ -165,6 +166,8 @@ protected: bool HasAtomicCSubNoRtnInsts = false; bool HasAtomicGlobalPkAddBF16Inst = false; bool HasFlatAtomicFaddF32Inst = false; + bool HasDefaultComponentZero = false; + bool HasDefaultComponentBroadcast = false; bool SupportsSRAMECC = false; // This should not be used directly. 'TargetID' tracks the dynamic settings @@ -295,12 +298,16 @@ public: unsigned getMaxWaveScratchSize() const { // See COMPUTE_TMPRING_SIZE.WAVESIZE. - if (getGeneration() < GFX11) { - // 13-bit field in units of 256-dword. - return (256 * 4) * ((1 << 13) - 1); + if (getGeneration() >= GFX12) { + // 18-bit field in units of 64-dword. + return (64 * 4) * ((1 << 18) - 1); } - // 15-bit field in units of 64-dword. - return (64 * 4) * ((1 << 15) - 1); + if (getGeneration() == GFX11) { + // 15-bit field in units of 64-dword. + return (64 * 4) * ((1 << 15) - 1); + } + // 13-bit field in units of 256-dword. + return (256 * 4) * ((1 << 13) - 1); } /// Return the number of high bits known to be zero for a frame index. @@ -423,6 +430,8 @@ public: return GFX9Insts; } + bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; } + TrapHandlerAbi getTrapHandlerAbi() const { return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; } @@ -772,6 +781,8 @@ public: return HasFP8Insts; } + bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; } + bool hasPkFmacF16Inst() const { return HasPkFmacF16Inst; } @@ -802,6 +813,12 @@ public: bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } + bool hasDefaultComponentZero() const { return HasDefaultComponentZero; } + + bool hasDefaultComponentBroadcast() const { + return HasDefaultComponentBroadcast; + } + bool hasNoSdstCMPX() const { return HasNoSdstCMPX; } @@ -838,7 +855,9 @@ public: return getGeneration() < SEA_ISLANDS; } - bool hasInstPrefetch() const { return getGeneration() >= GFX10; } + bool hasInstPrefetch() const { + return getGeneration() == GFX10 || getGeneration() == GFX11; + } bool hasPrefetch() const { return GFX12Insts; } @@ -984,6 +1003,8 @@ public: bool hasNSAEncoding() const { return HasNSAEncoding; } + bool hasNonNSAEncoding() const { return getGeneration() < GFX12; } + bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; } unsigned getNSAMaxSize(bool HasSampler = false) const { @@ -1131,14 +1152,14 @@ public: bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; } bool hasVALUPartialForwardingHazard() const { - return getGeneration() >= GFX11; + return getGeneration() == GFX11; } bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; } bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; } - bool hasVALUMaskWriteHazard() const { return getGeneration() >= GFX11; } + bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } /// Return if operations acting on VGPR tuples require even alignment. 
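As a quick cross-check of the three COMPUTE_TMPRING_SIZE.WAVESIZE encodings handled by getMaxWaveScratchSize() above, the per-wave scratch ceilings come out to just under 8 MiB before GFX11, just under 8 MiB on GFX11 (same ceiling, finer 64-dword granularity), and just under 64 MiB on GFX12. A standalone arithmetic sketch:

#include <cstdio>
int main() {
  // 13-bit field in 256-dword (1024-byte) units, pre-GFX11.
  unsigned long long PreGFX11 = (256ull * 4) * ((1ull << 13) - 1); // 8,387,584
  // 15-bit field in 64-dword (256-byte) units, GFX11.
  unsigned long long GFX11 = (64ull * 4) * ((1ull << 15) - 1);     // 8,388,352
  // 18-bit field in 64-dword units, GFX12 and later.
  unsigned long long GFX12 = (64ull * 4) * ((1ull << 18) - 1);     // 67,108,608
  std::printf("%llu %llu %llu\n", PreGFX11, GFX11, GFX12);
  return 0;
}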
bool needsAlignedVGPRs() const { return GFX90AInsts; } @@ -1177,6 +1198,10 @@ public: bool hasRestrictedSOffset() const { return HasRestrictedSOffset; } + /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt + /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively. + bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; } + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; @@ -1251,6 +1276,14 @@ public: // \returns true if the target has WG_RR_MODE kernel descriptor mode bit bool hasRrWGMode() const { return getGeneration() >= GFX12; } + /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative + /// values. + bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; } + + // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead + // of sign-extending. + bool hasGetPCZeroExtension() const { return GFX12Insts; } + /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { return AMDGPU::IsaInfo::getSGPRAllocGranule(this); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index f91f36ed851b..8eb246ef57c9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -232,13 +232,11 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { bool Is64Bit; bool HasRelocationAddend; uint8_t OSABI = ELF::ELFOSABI_NONE; - uint8_t ABIVersion = 0; public: - ELFAMDGPUAsmBackend(const Target &T, const Triple &TT, uint8_t ABIVersion) : - AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn), - HasRelocationAddend(TT.getOS() == Triple::AMDHSA), - ABIVersion(ABIVersion) { + ELFAMDGPUAsmBackend(const Target &T, const Triple &TT) + : AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn), + HasRelocationAddend(TT.getOS() == Triple::AMDHSA) { switch (TT.getOS()) { case Triple::AMDHSA: OSABI = ELF::ELFOSABI_AMDGPU_HSA; @@ -256,8 +254,7 @@ public: std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override { - return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend, - ABIVersion); + return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend); } }; @@ -267,6 +264,5 @@ MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, const MCTargetOptions &Options) { - return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple(), - getHsaAbiVersion(&STI).value_or(0)); + return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple()); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 58eed81e0755..2d960a32339f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -18,8 +18,7 @@ namespace { class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { public: - AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend, - uint8_t ABIVersion); + AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend); protected: unsigned getRelocType(MCContext 
&Ctx, const MCValue &Target, @@ -29,12 +28,10 @@ protected: } // end anonymous namespace -AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit, - uint8_t OSABI, - bool HasRelocationAddend, - uint8_t ABIVersion) - : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_AMDGPU, - HasRelocationAddend, ABIVersion) {} +AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, + bool HasRelocationAddend) + : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_AMDGPU, + HasRelocationAddend) {} unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, @@ -100,9 +97,7 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr<MCObjectTargetWriter> llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, - bool HasRelocationAddend, - uint8_t ABIVersion) { + bool HasRelocationAddend) { return std::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI, - HasRelocationAddend, - ABIVersion); + HasRelocationAddend); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 6c7977e22599..e73e53aa270f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1260,14 +1260,19 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, int NumOps = 0; int Ops[3]; - for (int OpName : { AMDGPU::OpName::src0_modifiers, - AMDGPU::OpName::src1_modifiers, - AMDGPU::OpName::src2_modifiers }) { - int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName); - if (Idx == -1) + std::pair<int, int> MOps[] = { + {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src0}, + {AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src1}, + {AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::src2}}; + int DefaultValue = (Mod == SISrcMods::OP_SEL_1); + + for (auto [SrcMod, Src] : MOps) { + if (!AMDGPU::hasNamedOperand(Opc, Src)) break; - Ops[NumOps++] = MI->getOperand(Idx).getImm(); + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, SrcMod); + Ops[NumOps++] = + (ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue; } const bool HasDstSel = diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index de1abaf29c56..c3e87244c0c8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -562,7 +562,48 @@ void AMDGPUMCCodeEmitter::getMachineOpValue(const MCInst &MI, void AMDGPUMCCodeEmitter::getMachineOpValueT16( const MCInst &MI, unsigned OpNo, APInt &Op, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - llvm_unreachable("TODO: Implement getMachineOpValueT16()."); + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg()) { + unsigned Enc = MRI.getEncodingValue(MO.getReg()); + unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; + bool IsVGPR = Enc & AMDGPU::HWEncoding::IS_VGPR_OR_AGPR; + Op = Idx | (IsVGPR << 8); + return; + } + getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI); + // VGPRs include the suffix/op_sel bit in the register encoding, but + // immediates and SGPRs include it in src_modifiers. 
Therefore, copy the + // op_sel bit from the src operands into src_modifier operands if Op is + // src_modifiers and the corresponding src is a VGPR + int SrcMOIdx = -1; + assert(OpNo < INT_MAX); + if ((int)OpNo == AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::src0_modifiers)) { + SrcMOIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + int VDstMOIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst); + if (VDstMOIdx != -1) { + auto DstReg = MI.getOperand(VDstMOIdx).getReg(); + if (AMDGPU::isHi(DstReg, MRI)) + Op |= SISrcMods::DST_OP_SEL; + } + } else if ((int)OpNo == AMDGPU::getNamedOperandIdx( + MI.getOpcode(), AMDGPU::OpName::src1_modifiers)) + SrcMOIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); + else if ((int)OpNo == AMDGPU::getNamedOperandIdx( + MI.getOpcode(), AMDGPU::OpName::src2_modifiers)) + SrcMOIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src2); + if (SrcMOIdx == -1) + return; + + const MCOperand &SrcMO = MI.getOperand(SrcMOIdx); + if (!SrcMO.isReg()) + return; + auto SrcReg = SrcMO.getReg(); + if (AMDGPU::isSGPR(SrcReg, &MRI)) + return; + if (AMDGPU::isHi(SrcReg, MRI)) + Op |= SISrcMods::OP_SEL_0; } void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128( diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 006115ba14fc..3ef00f75735b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -42,8 +42,8 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T, std::unique_ptr<MCObjectTargetWriter> createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, - bool HasRelocationAddend, uint8_t ABIVersion); -} // End llvm namespace + bool HasRelocationAddend); +} // namespace llvm #define GET_REGINFO_ENUM #include "AMDGPUGenRegisterInfo.inc" diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index e135a4e25dd1..d7e8ab76d5ff 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -20,6 +20,7 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDGPUMetadata.h" @@ -35,27 +36,6 @@ using namespace llvm::AMDGPU; // AMDGPUTargetStreamer //===----------------------------------------------------------------------===// -static void convertIsaVersionV2(uint32_t &Major, uint32_t &Minor, - uint32_t &Stepping, bool Sramecc, bool Xnack) { - if (Major == 9 && Minor == 0) { - switch (Stepping) { - case 0: - case 2: - case 4: - case 6: - if (Xnack) - Stepping++; - } - } -} - -bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) { - HSAMD::Metadata HSAMetadata; - if (HSAMD::fromString(HSAMetadataString, HSAMetadata)) - return false; - return EmitHSAMetadata(HSAMetadata); -} - bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) { msgpack::Document HSAMetadataDoc; if (!HSAMetadataDoc.fromYAML(HSAMetadataString)) @@ -238,21 +218,10 @@ void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget() { OS << 
"\t.amdgcn_target \"" << getTargetID()->toString() << "\"\n"; } -void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion( - uint32_t Major, uint32_t Minor) { - OS << "\t.hsa_code_object_version " << - Twine(Major) << "," << Twine(Minor) << '\n'; -} - -void -AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major, - uint32_t Minor, - uint32_t Stepping, - StringRef VendorName, - StringRef ArchName) { - convertIsaVersionV2(Major, Minor, Stepping, TargetID->isSramEccOnOrAny(), TargetID->isXnackOnOrAny()); - OS << "\t.hsa_code_object_isa " << Twine(Major) << "," << Twine(Minor) << "," - << Twine(Stepping) << ",\"" << VendorName << "\",\"" << ArchName << "\"\n"; +void AMDGPUTargetAsmStreamer::EmitDirectiveAMDHSACodeObjectVersion( + unsigned COV) { + AMDGPUTargetStreamer::EmitDirectiveAMDHSACodeObjectVersion(COV); + OS << "\t.amdhsa_code_object_version " << COV << '\n'; } void @@ -284,18 +253,6 @@ bool AMDGPUTargetAsmStreamer::EmitISAVersion() { } bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( - const AMDGPU::HSAMD::Metadata &HSAMetadata) { - std::string HSAMetadataString; - if (HSAMD::toString(HSAMetadata, HSAMetadataString)) - return false; - - OS << '\t' << HSAMD::AssemblerDirectiveBegin << '\n'; - OS << HSAMetadataString << '\n'; - OS << '\t' << HSAMD::AssemblerDirectiveEnd << '\n'; - return true; -} - -bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( msgpack::Document &HSAMetadataDoc, bool Strict) { HSAMD::V3::MetadataVerifier Verifier(Strict); if (!Verifier.verify(HSAMetadataDoc.getRoot())) @@ -336,7 +293,7 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR, - bool ReserveVCC, bool ReserveFlatScr, unsigned CodeObjectVersion) { + bool ReserveVCC, bool ReserveFlatScr) { IsaVersion IVersion = getIsaVersion(STI.getCPU()); OS << "\t.amdhsa_kernel " << KernelName << '\n'; @@ -529,6 +486,8 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { void AMDGPUTargetELFStreamer::finish() { MCAssembler &MCA = getStreamer().getAssembler(); MCA.setELFHeaderEFlags(getEFlags()); + MCA.getWriter().setOverrideABIVersion( + getELFABIVersion(STI.getTargetTriple(), CodeObjectVersion)); std::string Blob; const char *Vendor = getPALMetadata()->getVendor(); @@ -616,17 +575,7 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsUnknownOS() { unsigned AMDGPUTargetELFStreamer::getEFlagsAMDHSA() { assert(isHsaAbi(STI)); - if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) { - switch (*HsaAbiVer) { - case ELF::ELFABIVERSION_AMDGPU_HSA_V3: - return getEFlagsV3(); - case ELF::ELFABIVERSION_AMDGPU_HSA_V4: - case ELF::ELFABIVERSION_AMDGPU_HSA_V5: - return getEFlagsV4(); - } - } - - llvm_unreachable("HSA OS ABI Version identification must be defined"); + return getEFlagsV4(); } unsigned AMDGPUTargetELFStreamer::getEFlagsAMDPAL() { @@ -699,44 +648,6 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsV4() { void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget() {} -void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion( - uint32_t Major, uint32_t Minor) { - - EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()), - ELF::NT_AMD_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) { - OS.emitInt32(Major); - OS.emitInt32(Minor); - }); -} - -void -AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major, - uint32_t Minor, - uint32_t Stepping, - StringRef 
VendorName, - StringRef ArchName) { - uint16_t VendorNameSize = VendorName.size() + 1; - uint16_t ArchNameSize = ArchName.size() + 1; - - unsigned DescSZ = sizeof(VendorNameSize) + sizeof(ArchNameSize) + - sizeof(Major) + sizeof(Minor) + sizeof(Stepping) + - VendorNameSize + ArchNameSize; - - convertIsaVersionV2(Major, Minor, Stepping, TargetID->isSramEccOnOrAny(), TargetID->isXnackOnOrAny()); - EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(DescSZ, getContext()), - ELF::NT_AMD_HSA_ISA_VERSION, [&](MCELFStreamer &OS) { - OS.emitInt16(VendorNameSize); - OS.emitInt16(ArchNameSize); - OS.emitInt32(Major); - OS.emitInt32(Minor); - OS.emitInt32(Stepping); - OS.emitBytes(VendorName); - OS.emitInt8(0); // NULL terminate VendorName - OS.emitBytes(ArchName); - OS.emitInt8(0); // NULL terminate ArchName - }); -} - void AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { @@ -818,30 +729,6 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc, return true; } -bool AMDGPUTargetELFStreamer::EmitHSAMetadata( - const AMDGPU::HSAMD::Metadata &HSAMetadata) { - std::string HSAMetadataString; - if (HSAMD::toString(HSAMetadata, HSAMetadataString)) - return false; - - // Create two labels to mark the beginning and end of the desc field - // and a MCExpr to calculate the size of the desc field. - auto &Context = getContext(); - auto *DescBegin = Context.createTempSymbol(); - auto *DescEnd = Context.createTempSymbol(); - auto *DescSZ = MCBinaryExpr::createSub( - MCSymbolRefExpr::create(DescEnd, Context), - MCSymbolRefExpr::create(DescBegin, Context), Context); - - EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_HSA_METADATA, - [&](MCELFStreamer &OS) { - OS.emitLabel(DescBegin); - OS.emitBytes(HSAMetadataString); - OS.emitLabel(DescEnd); - }); - return true; -} - bool AMDGPUTargetAsmStreamer::EmitKernargPreloadHeader( const MCSubtargetInfo &STI) { for (int i = 0; i < 64; ++i) { @@ -889,8 +776,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, - unsigned CodeObjectVersion) { + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) { auto &Streamer = getStreamer(); auto &Context = Streamer.getContext(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 55b5246c9210..7f8ddc42b2ee 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -37,23 +37,24 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { protected: // TODO: Move HSAMetadataStream to AMDGPUTargetStreamer. std::optional<AMDGPU::IsaInfo::AMDGPUTargetID> TargetID; + unsigned CodeObjectVersion; MCContext &getContext() const { return Streamer.getContext(); } public: - AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} + AMDGPUTargetStreamer(MCStreamer &S) + : MCTargetStreamer(S), + // Assume the default COV for now, EmitDirectiveAMDHSACodeObjectVersion + // will update this if it is encountered. 
+ CodeObjectVersion(AMDGPU::getDefaultAMDHSACodeObjectVersion()) {} AMDGPUPALMetadata *getPALMetadata() { return &PALMetadata; } virtual void EmitDirectiveAMDGCNTarget(){}; - virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major, - uint32_t Minor){}; - - virtual void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor, - uint32_t Stepping, - StringRef VendorName, - StringRef ArchName){}; + virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV) { + CodeObjectVersion = COV; + } virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header){}; @@ -66,9 +67,6 @@ public: virtual bool EmitISAVersion() { return true; } /// \returns True on success, false on failure. - virtual bool EmitHSAMetadataV2(StringRef HSAMetadataString); - - /// \returns True on success, false on failure. virtual bool EmitHSAMetadataV3(StringRef HSAMetadataString); /// Emit HSA Metadata @@ -98,8 +96,7 @@ public: virtual void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, - unsigned CodeObjectVersion){}; + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) {} static StringRef getArchNameFromElfMach(unsigned ElfMach); static unsigned getElfMach(StringRef GPU); @@ -110,15 +107,12 @@ public: std::optional<AMDGPU::IsaInfo::AMDGPUTargetID> &getTargetID() { return TargetID; } - void initializeTargetID(const MCSubtargetInfo &STI, - unsigned CodeObjectVersion) { + void initializeTargetID(const MCSubtargetInfo &STI) { assert(TargetID == std::nullopt && "TargetID can only be initialized once"); TargetID.emplace(STI); - getTargetID()->setCodeObjectVersion(CodeObjectVersion); } - void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString, - unsigned CodeObjectVersion) { - initializeTargetID(STI, CodeObjectVersion); + void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString) { + initializeTargetID(STI); assert(getTargetID() != std::nullopt && "TargetID is None"); getTargetID()->setTargetIDFromFeaturesString(FeatureString); @@ -134,12 +128,7 @@ public: void EmitDirectiveAMDGCNTarget() override; - void EmitDirectiveHSACodeObjectVersion(uint32_t Major, - uint32_t Minor) override; - - void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor, - uint32_t Stepping, StringRef VendorName, - StringRef ArchName) override; + void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV) override; void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; @@ -154,9 +143,6 @@ public: bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override; /// \returns True on success, false on failure. - bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; - - /// \returns True on success, false on failure. bool EmitCodeEnd(const MCSubtargetInfo &STI) override; /// \returns True on success, false on failure. 
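A hedged sketch of how the code object version now reaches the ELF writer: the target streamer caches the COV (defaulting to getDefaultAMDHSACodeObjectVersion and updated by the new .amdhsa_code_object_version directive), and the ELF streamer's finish() converts it with getELFABIVersion before calling setOverrideABIVersion. The switch below is illustrative only, reusing the ELFABIVERSION_AMDGPU_HSA_* constants from the removed getEFlagsAMDHSA code; the authoritative mapping is getELFABIVersion().

#include <cstdint>
#include "llvm/BinaryFormat/ELF.h"
// Illustrative COV -> ELF ABI version mapping; not the in-tree helper.
static uint8_t abiVersionForCOV(unsigned COV) {
  switch (COV) {
  case 4:
    return llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V4;
  case 5:
    return llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5;
  default:
    // Assumption for the sketch: later COVs fall back to the V5 ABI version.
    return llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5;
  }
}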
@@ -165,8 +151,7 @@ public: void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, - unsigned CodeObjectVersion) override; + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override; }; class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { @@ -198,13 +183,6 @@ public: void EmitDirectiveAMDGCNTarget() override; - void EmitDirectiveHSACodeObjectVersion(uint32_t Major, - uint32_t Minor) override; - - void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor, - uint32_t Stepping, StringRef VendorName, - StringRef ArchName) override; - void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; @@ -218,9 +196,6 @@ public: bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override; /// \returns True on success, false on failure. - bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; - - /// \returns True on success, false on failure. bool EmitCodeEnd(const MCSubtargetInfo &STI) override; /// \returns True on success, false on failure. @@ -229,9 +204,7 @@ public: void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, - unsigned CodeObjectVersion) override; + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override; }; - } #endif diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 240366c8e7da..3c7cd61444fa 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -1553,6 +1553,11 @@ defm IMAGE_ATOMIC_DEC : MIMG_Atomic_Renamed <mimgopc<0x16, 0x16, 0x1c> defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 1, 1>; defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>; defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>; +defm IMAGE_ATOMIC_PK_ADD_F16 : MIMG_Atomic <mimgopc<0x86, MIMG.NOP, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_pk_add_f16", 0, 1>; +defm IMAGE_ATOMIC_PK_ADD_BF16 : MIMG_Atomic <mimgopc<0x87, MIMG.NOP, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_pk_add_bf16", 0, 1>; +defm IMAGE_ATOMIC_ADD_FLT : MIMG_Atomic <mimgopc<0x83, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_add_flt", 0, 1>; +defm IMAGE_ATOMIC_MIN_FLT : MIMG_Atomic <mimgopc<0x84, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_min_num_flt", 0, 1, "image_atomic_min_flt">; +defm IMAGE_ATOMIC_MAX_FLT : MIMG_Atomic <mimgopc<0x85, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_max_num_flt", 0, 1, "image_atomic_max_flt">; defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x1b, 0x20>, AMDGPUSample>; let OtherPredicates = [HasExtendedImageInsts] in { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 674fd04f2fc1..159b2d440b31 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1619,8 +1619,7 @@ static SDValue 
ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, for (unsigned i = 0; i < 4; i++) { RemapSwizzle[i] = i; if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1)) - ->getZExtValue(); + unsigned Idx = NewBldVec[i].getConstantOperandVal(1); if (i == Idx) isUnmovable[Idx] = true; } @@ -1628,8 +1627,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, for (unsigned i = 0; i < 4; i++) { if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1)) - ->getZExtValue(); + unsigned Idx = NewBldVec[i].getConstantOperandVal(1); if (isUnmovable[Idx]) continue; // Swap i and Idx @@ -2002,9 +2000,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) { if (Reg->getReg() == R600::ALU_CONST) { - ConstantSDNode *Cst - = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx)); - Consts.push_back(Cst->getZExtValue()); + Consts.push_back(ParentNode->getConstantOperandVal(OtherSelIdx)); } } } @@ -2044,8 +2040,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); } } else { - ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0)); - uint64_t Value = C->getZExtValue(); + uint64_t Value = Src.getConstantOperandVal(0); if (Value == 0) { ImmReg = R600::ZERO; } else if (Value == 1) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 932c0d6216ce..c921e5a35d2d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -329,15 +329,16 @@ bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { } Value *Exec = popSaved(); - Instruction *FirstInsertionPt = &*BB->getFirstInsertionPt(); + BasicBlock::iterator FirstInsertionPt = BB->getFirstInsertionPt(); if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) { Instruction *ExecDef = cast<Instruction>(Exec); BasicBlock *DefBB = ExecDef->getParent(); if (!DT->dominates(DefBB, BB)) { // Split edge to make Def dominate Use - FirstInsertionPt = &*SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt(); + FirstInsertionPt = SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt(); } - IRBuilder<>(FirstInsertionPt).CreateCall(EndCf, {Exec}); + IRBuilder<>(FirstInsertionPt->getParent(), FirstInsertionPt) + .CreateCall(EndCf, {Exec}); } return true; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h index b291400a947c..8ab66d4fd5b8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h @@ -400,6 +400,10 @@ enum CPol { TH_TYPE_STORE = 1 << 8, // TH_STORE policy TH_TYPE_ATOMIC = 1 << 9, // TH_ATOMIC policy TH_REAL_BYPASS = 1 << 10, // is TH=3 bypass policy or not + + // Volatile (used to preserve/signal operation volatility for buffer + // operations not a real instruction bit) + VOLATILE = 1 << 31, }; } // namespace CPol @@ -1172,11 +1176,13 @@ enum Type { TRAP = -2, WORKGROUP = -1 }; #define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 #define S_00B860_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12) -#define S_00B860_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) 
<< 12) +#define S_00B860_WAVESIZE_GFX11(x) (((x) & 0x7FFF) << 12) +#define S_00B860_WAVESIZE_GFX12Plus(x) (((x) & 0x3FFFF) << 12) #define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 #define S_0286E8_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12) -#define S_0286E8_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12) +#define S_0286E8_WAVESIZE_GFX11(x) (((x) & 0x7FFF) << 12) +#define S_0286E8_WAVESIZE_GFX12Plus(x) (((x) & 0x3FFFF) << 12) #define R_028B54_VGT_SHADER_STAGES_EN 0x028B54 #define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21) diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index aa7639a0f186..2862a7787e75 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1498,6 +1498,7 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const { case AMDGPU::V_MAX_F16_t16_e64: case AMDGPU::V_MAX_F16_fake16_e64: case AMDGPU::V_MAX_F64_e64: + case AMDGPU::V_MAX_NUM_F64_e64: case AMDGPU::V_PK_MAX_F16: { if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm()) return nullptr; @@ -1567,7 +1568,8 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) { static int getOModValue(unsigned Opc, int64_t Val) { switch (Opc) { - case AMDGPU::V_MUL_F64_e64: { + case AMDGPU::V_MUL_F64_e64: + case AMDGPU::V_MUL_F64_pseudo_e64: { switch (Val) { case 0x3fe0000000000000: // 0.5 return SIOutMods::DIV2; @@ -1618,6 +1620,7 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const { unsigned Op = MI.getOpcode(); switch (Op) { case AMDGPU::V_MUL_F64_e64: + case AMDGPU::V_MUL_F64_pseudo_e64: case AMDGPU::V_MUL_F32_e64: case AMDGPU::V_MUL_F16_t16_e64: case AMDGPU::V_MUL_F16_fake16_e64: @@ -1625,8 +1628,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const { // If output denormals are enabled, omod is ignored. if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) || - ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 || - Op == AMDGPU::V_MUL_F16_t16_e64 || + ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 || + Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 || Op == AMDGPU::V_MUL_F16_fake16_e64) && MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign)) return std::pair(nullptr, SIOutMods::NONE); @@ -1655,6 +1658,7 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const { return std::pair(RegOp, OMod); } case AMDGPU::V_ADD_F64_e64: + case AMDGPU::V_ADD_F64_pseudo_e64: case AMDGPU::V_ADD_F32_e64: case AMDGPU::V_ADD_F16_e64: case AMDGPU::V_ADD_F16_t16_e64: @@ -1662,8 +1666,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const { // If output denormals are enabled, omod is ignored. 
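The 64-bit constants matched in getOModValue() above are raw IEEE-754 double patterns; a tiny standalone check for the DIV2 (0.5) case that the new V_MUL_F64_pseudo_e64 handling reuses:

#include <cassert>
#include <cstdint>
#include <cstring>
int main() {
  uint64_t Bits = 0x3fe0000000000000ull; // the value the DIV2 case matches
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  assert(D == 0.5);
  return 0;
}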
if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) || - ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 || - Op == AMDGPU::V_ADD_F16_t16_e64 || + ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 || + Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 || Op == AMDGPU::V_ADD_F16_fake16_e64) && MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign)) return std::pair(nullptr, SIOutMods::NONE); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 0f89df144486..9d062eb156d5 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -95,7 +95,8 @@ static void getVGPRSpillLaneOrTempRegister( TargetStackID::SGPRSpill); if (TRI->spillSGPRToVGPR() && - MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) { + MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true, + /*IsPrologEpilog=*/true)) { // 2: There's no free lane to spill, and no free register to save the // SGPR, so we're forced to take another VGPR to use for the spill. MFI->addToPrologEpilogSGPRSpills( @@ -188,7 +189,7 @@ static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addImm(MFI->getGITPtrHigh()) .addReg(TargetReg, RegState::ImplicitDefine); } else { - const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64); + const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo); BuildMI(MBB, I, DL, GetPC64, TargetReg); } Register GitPtrLo = MFI->getGITPtrLoReg(*MF); @@ -1560,6 +1561,8 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall()) return; + MFI->shiftSpillPhysVGPRsToLowestRange(MF); + TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); if (MFI->isEntryFunction()) return; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5a9222e91588..cf947dccafac 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -855,7 +855,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, - MVT::v2i16, MVT::v2f16, MVT::i128}, + MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8}, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, @@ -1183,6 +1183,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = RsrcArg; } + auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1)); + if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE) + Info.flags |= MachineMemOperand::MOVolatile; Info.flags |= MachineMemOperand::MODereferenceable; if (ME.onlyReadsMemory()) { unsigned MaxNumLanes = 4; @@ -1333,6 +1336,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); @@ -1344,6 +1348,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } + 
case Intrinsic::amdgcn_global_load_tr: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + Info.flags |= MachineMemOperand::MOLoad; + return true; + } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: case Intrinsic::amdgcn_ds_gws_sema_v: @@ -1403,6 +1415,7 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, SmallVectorImpl<Value*> &Ops, Type *&AccessTy) const { switch (II->getIntrinsicID()) { + case Intrinsic::amdgcn_global_load_tr: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_ds_append: @@ -1525,6 +1538,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // will use a MUBUF load. // FIXME?: We also need to do this if unaligned, but we don't know the // alignment here. + // TODO: Update this for GFX12 which does have scalar sub-dword loads. if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4) return isLegalGlobalAddressingMode(AM); @@ -2297,7 +2311,7 @@ void SITargetLowering::allocateSpecialInputSGPRs( const Module *M = MF.getFunction().getParent(); if (UserSGPRInfo.hasQueuePtr() && - AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) + AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. This is a @@ -2350,7 +2364,7 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, const Module *M = MF.getFunction().getParent(); if (UserSGPRInfo.hasQueuePtr() && - AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { + AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); @@ -2779,15 +2793,16 @@ SDValue SITargetLowering::LowerFormalArguments( } else if (!IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); + + // FIXME: Sink this into allocateSpecialInputSGPRs + if (!Subtarget->enableFlatScratch()) + CCInfo.AllocateReg(Info->getScratchRSrcReg()); + + allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } if (!IsKernel) { CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); - if (!IsGraphics && !Subtarget->enableFlatScratch()) { - CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1, - AMDGPU::SGPR2, AMDGPU::SGPR3}, - 4); - } CCInfo.AnalyzeFormalArguments(Splits, AssignFn); } @@ -2987,13 +3002,8 @@ SDValue SITargetLowering::LowerFormalArguments( } // Start adding system SGPRs. 
- if (IsEntryFunc) { + if (IsEntryFunc) allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics); - } else { - CCInfo.AllocateReg(Info->getScratchRSrcReg()); - if (!IsGraphics) - allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); - } auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); @@ -5720,7 +5730,7 @@ SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32) - return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand()); if (isTypeLegal(LoadVT)) { return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT, @@ -5739,8 +5749,7 @@ SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); - const auto *CD = cast<ConstantSDNode>(N->getOperand(3)); - unsigned CondCode = CD->getZExtValue(); + unsigned CondCode = N->getConstantOperandVal(3); if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode))) return DAG.getUNDEF(VT); @@ -5774,9 +5783,8 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); - const auto *CD = cast<ConstantSDNode>(N->getOperand(3)); - unsigned CondCode = CD->getZExtValue(); + unsigned CondCode = N->getConstantOperandVal(3); if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode))) return DAG.getUNDEF(VT); @@ -5894,6 +5902,55 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case Intrinsic::amdgcn_s_buffer_load: { + // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate + // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG + // combiner tries to merge the s_buffer_load_u8 with a sext instruction + // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with + // s_buffer_load_i8. 
+ if (!Subtarget->hasScalarSubwordLoads()) + return; + SDValue Op = SDValue(N, 0); + SDValue Rsrc = Op.getOperand(1); + SDValue Offset = Op.getOperand(2); + SDValue CachePolicy = Op.getOperand(3); + EVT VT = Op.getValueType(); + assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n"); + SDLoc DL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + const DataLayout &DataLayout = DAG.getDataLayout(); + Align Alignment = + DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext())); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + VT.getStoreSize(), Alignment); + SDValue LoadVal; + if (!Offset->isDivergent()) { + SDValue Ops[] = {Rsrc, // source register + Offset, CachePolicy}; + SDValue BufferLoad = + DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL, + DAG.getVTList(MVT::i32), Ops, VT, MMO); + LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad); + } else { + SDValue Ops[] = { + DAG.getEntryNode(), // Chain + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + {}, // voffset + {}, // soffset + {}, // offset + CachePolicy, // cachepolicy + DAG.getTargetConstant(0, DL, MVT::i1), // idxen + }; + setBufferOffsets(Offset, DAG, &Ops[3], Align(4)); + LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO); + } + Results.push_back(LoadVal); + return; + } } break; } @@ -6390,7 +6447,7 @@ SDValue SITargetLowering::lowerTrapHsaQueuePtr( SDValue QueuePtr; // For code object version 5, QueuePtr is passed through implicit kernarg. const Module *M = DAG.getMachineFunction().getFunction().getParent(); - if (AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { + if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { QueuePtr = loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR); } else { @@ -6494,7 +6551,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, // For code object version 5, private_base and shared_base are passed through // implicit kernargs. const Module *M = DAG.getMachineFunction().getFunction().getParent(); - if (AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { + if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { ImplicitParameter Param = (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE; return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param); @@ -7248,17 +7305,17 @@ static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, // Re-construct the required return value for a image load intrinsic. // This is more complicated due to the optional use TexFailCtrl which means the required // return type is an aggregate -static SDValue constructRetValue(SelectionDAG &DAG, - MachineSDNode *Result, - ArrayRef<EVT> ResultTypes, - bool IsTexFail, bool Unpacked, bool IsD16, - int DMaskPop, int NumVDataDwords, +static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, + ArrayRef<EVT> ResultTypes, bool IsTexFail, + bool Unpacked, bool IsD16, int DMaskPop, + int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL) { // Determine the required return type. This is the same regardless of IsTexFail flag EVT ReqRetVT = ResultTypes[0]; int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; - int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ? - ReqRetNumElts : (ReqRetNumElts + 1) / 2; + int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit) + ? 
(ReqRetNumElts + 1) / 2 + : ReqRetNumElts; int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ? DMaskPop : (DMaskPop + 1) / 2; @@ -7283,7 +7340,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, } } - if (DataDwordVT.isVector()) + if (DataDwordVT.isVector() && !IsAtomicPacked16Bit) Data = padEltsToUndef(DAG, DL, DataDwordVT, Data, NumDataDwords - MaskPopDwords); @@ -7390,6 +7447,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDValue VData; int NumVDataDwords; bool AdjustRetType = false; + bool IsAtomicPacked16Bit = false; // Offset of intrinsic arguments const unsigned ArgOffset = WithChain ? 2 : 1; @@ -7400,6 +7458,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (BaseOpcode->Atomic) { VData = Op.getOperand(2); + IsAtomicPacked16Bit = + (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || + Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); + bool Is64Bit = VData.getValueSizeInBits() == 64; if (BaseOpcode->AtomicX2) { SDValue VData2 = Op.getOperand(3); @@ -7416,9 +7478,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, NumVDataDwords = Is64Bit ? 2 : 1; } } else { - auto *DMaskConst = - cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->DMaskIndex)); - DMask = DMaskConst->getZExtValue(); + DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex); DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask); if (BaseOpcode->Store) { @@ -7639,7 +7699,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op, Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue(); if (BaseOpcode->Atomic) CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization - if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12)) + if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | + AMDGPU::CPol::VOLATILE)) return Op; SmallVector<SDValue, 26> Ops; @@ -7729,10 +7790,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } if (BaseOpcode->Store) return SDValue(NewNode, 0); - return constructRetValue(DAG, NewNode, - OrigResultTypes, IsTexFail, - Subtarget->hasUnpackedD16VMem(), IsD16, - DMaskLanes, NumVDataDwords, DL); + return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail, + Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes, + NumVDataDwords, IsAtomicPacked16Bit, DL); } SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, @@ -7751,11 +7811,18 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, VT.getStoreSize(), Alignment); if (!Offset->isDivergent()) { - SDValue Ops[] = { - Rsrc, - Offset, // Offset - CachePolicy - }; + SDValue Ops[] = {Rsrc, Offset, CachePolicy}; + + // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the + // s_buffer_load_u16 instruction is emitted for both signed and unsigned + // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext + // and generates s_buffer_load_i16 (performSignExtendInRegCombine). + if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) { + SDValue BufferLoad = + DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL, + DAG.getVTList(MVT::i32), Ops, VT, MMO); + return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad); + } // Widen vec3 load to vec4. if (VT.isVector() && VT.getVectorNumElements() == 3 && @@ -7776,6 +7843,21 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, // We have a divergent offset. Emit a MUBUF buffer load instead. We can // assume that the buffer is unswizzled. 
+ SDValue Ops[] = { + DAG.getEntryNode(), // Chain + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + {}, // voffset + {}, // soffset + {}, // offset + CachePolicy, // cachepolicy + DAG.getTargetConstant(0, DL, MVT::i1), // idxen + }; + if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) { + setBufferOffsets(Offset, DAG, &Ops[3], Align(4)); + return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO); + } + SmallVector<SDValue, 4> Loads; unsigned NumLoads = 1; MVT LoadVT = VT.getSimpleVT(); @@ -7789,16 +7871,6 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, } SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue}); - SDValue Ops[] = { - DAG.getEntryNode(), // Chain - Rsrc, // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - {}, // voffset - {}, // soffset - {}, // offset - CachePolicy, // cachepolicy - DAG.getTargetConstant(0, DL, MVT::i1), // idxen - }; // Use the alignment to ensure that the required offsets will fit into the // immediate offsets. @@ -8005,6 +8077,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc(Op), MVT::i32); case Intrinsic::amdgcn_s_buffer_load: { unsigned CPol = Op.getConstantOperandVal(3); + // s_buffer_load, because of how it's optimized, can't be volatile + // so reject ones with the volatile bit set. if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12) ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12)) @@ -8374,9 +8448,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, M, DAG, Ops); // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics - if (LoadVT.getScalarType() == MVT::i8 || - LoadVT.getScalarType() == MVT::i16) - return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16) + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, + M->getMemOperand()); return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, M->getMemOperand(), DAG); @@ -8592,9 +8666,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); + case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: + return lowerRawBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_FADD_BF16); case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); + case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: + return lowerStructBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_FADD_BF16); case Intrinsic::amdgcn_raw_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); @@ -8643,6 +8723,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_dec: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); + case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: + return lowerRawBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); case Intrinsic::amdgcn_struct_buffer_atomic_swap: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: return lowerStructBufferAtomicIntrin(Op, DAG, @@ -8684,6 +8767,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case 
Intrinsic::amdgcn_struct_buffer_atomic_dec: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); + case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: + return lowerStructBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); case Intrinsic::amdgcn_buffer_atomic_cmpswap: { unsigned Slc = Op.getConstantOperandVal(7); @@ -9376,6 +9462,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { + assert(!AMDGPU::isGFX12Plus(*Subtarget)); unsigned Opc; bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds || @@ -9428,8 +9515,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, unsigned Aux = Op.getConstantOperandVal(8 + OpOffset); Ops.push_back( DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol - Ops.push_back( - DAG.getTargetConstant((Aux >> 3) & 1, DL, MVT::i8)); // swz + Ops.push_back(DAG.getTargetConstant( + Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz Ops.push_back(M0Val.getValue(0)); // Chain Ops.push_back(M0Val.getValue(1)); // Glue @@ -9766,18 +9853,17 @@ SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op, } // Handle 8 bit and 16 bit buffer loads -SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, - EVT LoadVT, SDLoc DL, - ArrayRef<SDValue> Ops, - MemSDNode *M) const { +SDValue +SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, + SDLoc DL, ArrayRef<SDValue> Ops, + MachineMemOperand *MMO) const { EVT IntVT = LoadVT.changeTypeToInteger(); unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ? AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT; SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other); - SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList, - Ops, IntVT, - M->getMemOperand()); + SDValue BufferLoad = + DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO); SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad); LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal); @@ -9821,6 +9907,8 @@ static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, llvm_unreachable("invalid ext type"); } +// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads. +// TODO: Skip this on GFX12 which does have scalar sub-dword loads. SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; if (Ld->getAlign() < Align(4) || Ld->isDivergent()) @@ -12058,17 +12146,42 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, return SDValue(); } -SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N, - DAGCombinerInfo &DCI) - const { +SDValue +SITargetLowering::performSignExtendInRegCombine(SDNode *N, + DAGCombinerInfo &DCI) const { SDValue Src = N->getOperand(0); auto *VTSign = cast<VTSDNode>(N->getOperand(1)); - if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE && - VTSign->getVT() == MVT::i8) || - (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT && - VTSign->getVT() == MVT::i16)) && - Src.hasOneUse()) { + // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them + // with s_buffer_load_i8 and s_buffer_load_i16 respectively. 
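The combine described in the comment above only needs to swap the node opcode and reuse the operands and memory operand; a reduced sketch of that selection, assuming the AMDGPUISD scalar-buffer opcodes used in this hunk:

// Map an unsigned scalar buffer load opcode to its sign-extending form.
static unsigned signedSBufferLoadOpcode(unsigned UnsignedOpc) {
  return UnsignedOpc == AMDGPUISD::SBUFFER_LOAD_UBYTE
             ? AMDGPUISD::SBUFFER_LOAD_BYTE
             : AMDGPUISD::SBUFFER_LOAD_SHORT;
}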
+ if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE && + VTSign->getVT() == MVT::i8) || + (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT && + VTSign->getVT() == MVT::i16))) { + assert(Subtarget->hasScalarSubwordLoads() && + "s_buffer_load_{u8, i8} are supported " + "in GFX12 (or newer) architectures."); + EVT VT = Src.getValueType(); + unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE) + ? AMDGPUISD::SBUFFER_LOAD_BYTE + : AMDGPUISD::SBUFFER_LOAD_SHORT; + SDLoc DL(N); + SDVTList ResList = DCI.DAG.getVTList(MVT::i32); + SDValue Ops[] = { + Src.getOperand(0), // source register + Src.getOperand(1), // offset + Src.getOperand(2) // cachePolicy + }; + auto *M = cast<MemSDNode>(Src); + SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode( + Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand()); + SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad); + return LoadVal; + } else if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE && + VTSign->getVT() == MVT::i8) || + (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT && + VTSign->getVT() == MVT::i16)) && + Src.hasOneUse()) { auto *M = cast<MemSDNode>(Src); SDValue Ops[] = { Src.getOperand(0), // Chain @@ -14283,8 +14396,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::FP_ROUND: return performFPRoundCombine(N, DCI); case ISD::LOAD: { - if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI)) - return Widended; + if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI)) + return Widened; [[fallthrough]]; } default: { @@ -15483,6 +15596,7 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: case AMDGPUISD::BUFFER_ATOMIC_CSUB: case AMDGPUISD::BUFFER_ATOMIC_FADD: + case AMDGPUISD::BUFFER_ATOMIC_FADD_BF16: case AMDGPUISD::BUFFER_ATOMIC_FMIN: case AMDGPUISD::BUFFER_ATOMIC_FMAX: // Target-specific read-modify-write atomics are sources of divergence. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h index 92b38ebade62..d66ba0b59ba9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -273,7 +273,8 @@ private: // Handle 8 bit and 16 bit buffer loads SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL, - ArrayRef<SDValue> Ops, MemSDNode *M) const; + ArrayRef<SDValue> Ops, + MachineMemOperand *MMO) const; // Handle 8 bit and 16 bit buffer stores SDValue handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 1f480c248154..6ecb1c8bf6e1 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -31,6 +31,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/Sequence.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/InitializePasses.h" @@ -57,7 +58,18 @@ namespace { // associated with the operand. Used for determining whether // s_waitcnt instruction needs to be emitted. -enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS }; +enum InstCounterType { + LOAD_CNT = 0, // VMcnt prior to gfx12. + DS_CNT, // LKGMcnt prior to gfx12. 
+ EXP_CNT, // + STORE_CNT, // VScnt in gfx10/gfx11. + NUM_NORMAL_INST_CNTS, + SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only. + BVH_CNT, // gfx12+ only. + KM_CNT, // gfx12+ only. + NUM_EXTENDED_INST_CNTS, + NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS +}; } // namespace namespace llvm { @@ -67,15 +79,23 @@ template <> struct enum_iteration_traits<InstCounterType> { } // namespace llvm namespace { -auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); } +// Return an iterator over all counters between LOAD_CNT (the first counter) +// and \c MaxCounter (exclusive, default value yields an enumeration over +// all counters). +auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { + return enum_seq(LOAD_CNT, MaxCounter); +} using RegInterval = std::pair<int, int>; struct HardwareLimits { - unsigned VmcntMax; + unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12. unsigned ExpcntMax; - unsigned LgkmcntMax; - unsigned VscntMax; + unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. + unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. + unsigned SamplecntMax; // gfx12+ only. + unsigned BvhcntMax; // gfx12+ only. + unsigned KmcntMax; // gfx12+ only. }; struct RegisterEncoding { @@ -86,31 +106,25 @@ struct RegisterEncoding { }; enum WaitEventType { - VMEM_ACCESS, // vector-memory read & write - VMEM_READ_ACCESS, // vector-memory read - VMEM_WRITE_ACCESS, // vector-memory write that is not scratch - SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch - LDS_ACCESS, // lds read & write - GDS_ACCESS, // gds read & write - SQ_MESSAGE, // send message - SMEM_ACCESS, // scalar-memory read & write - EXP_GPR_LOCK, // export holding on its data src - GDS_GPR_LOCK, // GDS holding on its data and addr src - EXP_POS_ACCESS, // write to export position - EXP_PARAM_ACCESS, // write to export parameter - VMW_GPR_LOCK, // vector-memory write holding on its data src - EXP_LDS_ACCESS, // read by ldsdir counting as export + VMEM_ACCESS, // vector-memory read & write + VMEM_READ_ACCESS, // vector-memory read + VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only) + VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only) + VMEM_WRITE_ACCESS, // vector-memory write that is not scratch + SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch + LDS_ACCESS, // lds read & write + GDS_ACCESS, // gds read & write + SQ_MESSAGE, // send message + SMEM_ACCESS, // scalar-memory read & write + EXP_GPR_LOCK, // export holding on its data src + GDS_GPR_LOCK, // GDS holding on its data and addr src + EXP_POS_ACCESS, // write to export position + EXP_PARAM_ACCESS, // write to export parameter + VMW_GPR_LOCK, // vector-memory write holding on its data src + EXP_LDS_ACCESS, // read by ldsdir counting as export NUM_WAIT_EVENTS, }; -static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = { - (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), - (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | - (1 << SQ_MESSAGE), - (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | - (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS), - (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS)}; - // The mapping is: // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots @@ -121,8 +135,13 @@ enum RegisterMapping { SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets. AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets. 
SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets. - NUM_EXTRA_VGPRS = 1, // A reserved slot for DS. - EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes. + NUM_EXTRA_VGPRS = 9, // Reserved slots for DS. + // Artificial register slots to track LDS writes into specific LDS locations + // if a location is known. When slots are exhausted or location is + // unknown use the first slot. The first slot is also always updated in + // addition to known location's slot to properly generate waits if dependent + // instruction's location is unknown. + EXTRA_VGPR_LDS = 0, NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. }; @@ -137,17 +156,33 @@ enum VmemType { // MIMG instructions with a sampler. VMEM_SAMPLER, // BVH instructions - VMEM_BVH + VMEM_BVH, + NUM_VMEM_TYPES }; +// Maps values of InstCounterType to the instruction that waits on that +// counter. Only used if GCNSubtarget::hasExtendedWaitCounts() +// returns true. +static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = { + AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT, + AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT, + AMDGPU::S_WAIT_KMCNT}; + static bool updateVMCntOnly(const MachineInstr &Inst) { return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst); } +#ifndef NDEBUG +static bool isNormalMode(InstCounterType MaxCounter) { + return MaxCounter == NUM_NORMAL_INST_CNTS; +} +#endif // NDEBUG + VmemType getVmemType(const MachineInstr &Inst) { assert(updateVMCntOnly(Inst)); - if (!SIInstrInfo::isMIMG(Inst)) + if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) && + !SIInstrInfo::isVSAMPLE(Inst)) return VMEM_NOSAMPLER; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode()); const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = @@ -156,25 +191,49 @@ VmemType getVmemType(const MachineInstr &Inst) { : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER; } -void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { +unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) { switch (T) { - case VM_CNT: - Wait.VmCnt = std::min(Wait.VmCnt, Count); - break; + case LOAD_CNT: + return Wait.LoadCnt; case EXP_CNT: - Wait.ExpCnt = std::min(Wait.ExpCnt, Count); - break; - case LGKM_CNT: - Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count); - break; - case VS_CNT: - Wait.VsCnt = std::min(Wait.VsCnt, Count); - break; + return Wait.ExpCnt; + case DS_CNT: + return Wait.DsCnt; + case STORE_CNT: + return Wait.StoreCnt; + case SAMPLE_CNT: + return Wait.SampleCnt; + case BVH_CNT: + return Wait.BvhCnt; + case KM_CNT: + return Wait.KmCnt; default: llvm_unreachable("bad InstCounterType"); } } +void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { + unsigned &WC = getCounterRef(Wait, T); + WC = std::min(WC, Count); +} + +void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { + getCounterRef(Wait, T) = ~0u; +} + +unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { + return getCounterRef(Wait, T); +} + +// Mapping from event to counter according to the table masks. +InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { + for (auto T : inst_counter_types()) { + if (masks[T] & (1 << E)) + return T; + } + llvm_unreachable("event type has no associated counter"); +} + // This objects maintains the current score brackets of each wait counter, and // a per-register scoreboard for each wait counter. 
// @@ -185,20 +244,30 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { // "s_waitcnt 0" before use. class WaitcntBrackets { public: - WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits, - RegisterEncoding Encoding) - : ST(SubTarget), Limits(Limits), Encoding(Encoding) {} + WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, + HardwareLimits Limits, RegisterEncoding Encoding, + const unsigned *WaitEventMaskForInst, + InstCounterType SmemAccessCounter) + : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), + Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst), + SmemAccessCounter(SmemAccessCounter) {} unsigned getWaitCountMax(InstCounterType T) const { switch (T) { - case VM_CNT: - return Limits.VmcntMax; - case LGKM_CNT: - return Limits.LgkmcntMax; + case LOAD_CNT: + return Limits.LoadcntMax; + case DS_CNT: + return Limits.DscntMax; case EXP_CNT: return Limits.ExpcntMax; - case VS_CNT: - return Limits.VscntMax; + case STORE_CNT: + return Limits.StorecntMax; + case SAMPLE_CNT: + return Limits.SamplecntMax; + case BVH_CNT: + return Limits.BvhcntMax; + case KM_CNT: + return Limits.KmcntMax; default: break; } @@ -219,20 +288,11 @@ public: return getScoreUB(T) - getScoreLB(T); } - // Mapping from event to counter. - InstCounterType eventCounter(WaitEventType E) const { - for (auto T : inst_counter_types()) { - if (WaitEventMaskForInst[T] & (1 << E)) - return T; - } - llvm_unreachable("event type has no associated counter"); - } - unsigned getRegScore(int GprNo, InstCounterType T) const { if (GprNo < NUM_ALL_VGPRS) { return VgprScores[T][GprNo]; } - assert(T == LGKM_CNT); + assert(T == SmemAccessCounter); return SgprScores[GprNo - NUM_ALL_VGPRS]; } @@ -269,15 +329,15 @@ public: } bool hasPendingFlat() const { - return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] && - LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) || - (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] && - LastFlat[VM_CNT] <= ScoreUBs[VM_CNT])); + return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && + LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || + (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && + LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); } void setPendingFlat() { - LastFlat[VM_CNT] = ScoreUBs[VM_CNT]; - LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT]; + LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; + LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; } // Return true if there might be pending writes to the specified vgpr by VMEM @@ -293,8 +353,12 @@ public: } void setStateOnFunctionEntryOrReturn() { - setScoreUB(VS_CNT, getWaitCountMax(VS_CNT)); - PendingEvents |= WaitEventMaskForInst[VS_CNT]; + setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT)); + PendingEvents |= WaitEventMaskForInst[STORE_CNT]; + } + + ArrayRef<const MachineInstr *> getLDSDMAStores() const { + return LDSDMAStores; } void print(raw_ostream &); @@ -331,7 +395,7 @@ private: VgprUB = std::max(VgprUB, GprNo); VgprScores[T][GprNo] = Val; } else { - assert(T == LGKM_CNT); + assert(T == SmemAccessCounter); SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS); SgprScores[GprNo - NUM_ALL_VGPRS] = Val; } @@ -342,8 +406,11 @@ private: unsigned OpNo, unsigned Val); const GCNSubtarget *ST = nullptr; + InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; HardwareLimits Limits = {}; RegisterEncoding Encoding = {}; + const unsigned *WaitEventMaskForInst; + InstCounterType SmemAccessCounter; unsigned ScoreLBs[NUM_INST_CNTS] = {0}; unsigned ScoreUBs[NUM_INST_CNTS] = {0}; unsigned PendingEvents = 0; @@ -354,11 +421,134 @@ private: int 
VgprUB = -1; int SgprUB = -1; unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; - // Wait cnt scores for every sgpr, only lgkmcnt is relevant. + // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt + // pre-gfx12) or KM_CNT (gfx12+ only) are relevant. unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0}; // Bitmask of the VmemTypes of VMEM instructions that might have a pending // write to each vgpr. unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; + // Store representative LDS DMA operations. The only useful info here is + // alias info. One store is kept per unique AAInfo. + SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores; +}; + +// This abstracts the logic for generating and updating S_WAIT* instructions +// away from the analysis that determines where they are needed. This was +// done because the set of counters and instructions for waiting on them +// underwent a major shift with gfx12, sufficiently so that having this +// abstraction allows the main analysis logic to be simpler than it would +// otherwise have had to become. +class WaitcntGenerator { +protected: + const GCNSubtarget *ST = nullptr; + const SIInstrInfo *TII = nullptr; + AMDGPU::IsaVersion IV; + InstCounterType MaxCounter; + +public: + WaitcntGenerator() {} + WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter) + : ST(ST), TII(ST->getInstrInfo()), + IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {} + + // Edits an existing sequence of wait count instructions according + // to an incoming Waitcnt value, which is itself updated to reflect + // any new wait count instructions which may need to be generated by + // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits + // were made. + // + // This editing will usually be merely updated operands, but it may also + // delete instructions if the incoming Wait value indicates they are not + // needed. It may also remove existing instructions for which a wait + // is needed if it can be determined that it is better to generate new + // instructions later, as can happen on gfx12. + virtual bool + applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, + MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, + MachineBasicBlock::instr_iterator It) const = 0; + + // Transform a soft waitcnt into a normal one. + bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const; + + // Generates new wait count instructions according to the value of + // Wait, returning true if any new instructions were created. + virtual bool createNewWaitcnt(MachineBasicBlock &Block, + MachineBasicBlock::instr_iterator It, + AMDGPU::Waitcnt Wait) = 0; + + // Returns an array of bit masks which can be used to map values in + // WaitEventType to corresponding counter values in InstCounterType. 
+ virtual const unsigned *getWaitEventMask() const = 0; + + virtual ~WaitcntGenerator() = default; +}; + +class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { +public: + WaitcntGeneratorPreGFX12() {} + WaitcntGeneratorPreGFX12(const GCNSubtarget *ST) + : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {} + + bool + applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, + MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, + MachineBasicBlock::instr_iterator It) const override; + + bool createNewWaitcnt(MachineBasicBlock &Block, + MachineBasicBlock::instr_iterator It, + AMDGPU::Waitcnt Wait) override; + + const unsigned *getWaitEventMask() const override { + assert(ST); + + static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = { + (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS) | + (1 << VMEM_SAMPLER_READ_ACCESS) | (1 << VMEM_BVH_READ_ACCESS), + (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | + (1 << SQ_MESSAGE), + (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | + (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | + (1 << EXP_LDS_ACCESS), + (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS), + 0, + 0, + 0}; + + return WaitEventMaskForInstPreGFX12; + } +}; + +class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { +public: + WaitcntGeneratorGFX12Plus() {} + WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter) + : WaitcntGenerator(ST, MaxCounter) {} + + bool + applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, + MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, + MachineBasicBlock::instr_iterator It) const override; + + bool createNewWaitcnt(MachineBasicBlock &Block, + MachineBasicBlock::instr_iterator It, + AMDGPU::Waitcnt Wait) override; + + const unsigned *getWaitEventMask() const override { + assert(ST); + + static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = { + (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), + (1 << LDS_ACCESS) | (1 << GDS_ACCESS), + (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | + (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | + (1 << EXP_LDS_ACCESS), + (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS), + (1 << VMEM_SAMPLER_READ_ACCESS), + (1 << VMEM_BVH_READ_ACCESS), + (1 << SMEM_ACCESS) | (1 << SQ_MESSAGE)}; + + return WaitEventMaskForInstGFX12Plus; + } }; class SIInsertWaitcnts : public MachineFunctionPass { @@ -367,18 +557,20 @@ private: const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; - AMDGPU::IsaVersion IV; DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; DenseMap<MachineBasicBlock *, bool> PreheadersToFlush; MachineLoopInfo *MLI; MachinePostDominatorTree *PDT; + AliasAnalysis *AA = nullptr; struct BlockInfo { std::unique_ptr<WaitcntBrackets> Incoming; bool Dirty = true; }; + InstCounterType SmemAccessCounter; + MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0 @@ -388,10 +580,20 @@ private: bool OptNone; + // In any given run of this pass, WCG will point to one of these two + // generator objects, which must have been re-initialised before use + // from a value made using a subtarget constructor. + WaitcntGeneratorPreGFX12 WCGPreGFX12; + WaitcntGeneratorGFX12Plus WCGGFX12Plus; + + WaitcntGenerator *WCG = nullptr; + // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS // message. 
DenseSet<MachineInstr *> ReleaseVGPRInsts; + InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; + public: static char ID; @@ -415,6 +617,8 @@ public: AU.setPreservesCFG(); AU.addRequired<MachineLoopInfo>(); AU.addRequired<MachinePostDominatorTree>(); + AU.addUsedIfAvailable<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -438,16 +642,22 @@ public: if (DebugCounter::isCounterSet(ForceLgkmCounter) && DebugCounter::shouldExecute(ForceLgkmCounter)) { - ForceEmitWaitcnt[LGKM_CNT] = true; + ForceEmitWaitcnt[DS_CNT] = true; + ForceEmitWaitcnt[KM_CNT] = true; } else { - ForceEmitWaitcnt[LGKM_CNT] = false; + ForceEmitWaitcnt[DS_CNT] = false; + ForceEmitWaitcnt[KM_CNT] = false; } if (DebugCounter::isCounterSet(ForceVMCounter) && DebugCounter::shouldExecute(ForceVMCounter)) { - ForceEmitWaitcnt[VM_CNT] = true; + ForceEmitWaitcnt[LOAD_CNT] = true; + ForceEmitWaitcnt[SAMPLE_CNT] = true; + ForceEmitWaitcnt[BVH_CNT] = true; } else { - ForceEmitWaitcnt[VM_CNT] = false; + ForceEmitWaitcnt[LOAD_CNT] = false; + ForceEmitWaitcnt[SAMPLE_CNT] = false; + ForceEmitWaitcnt[BVH_CNT] = false; } #endif // NDEBUG } @@ -455,6 +665,10 @@ public: // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or // FLAT instruction. WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { + // Maps VMEM access types to their corresponding WaitEventType. + static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = { + VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}; + assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst)); // LDS DMA loads are also stores, but on the LDS side. On the VMEM side // these should use VM_CNT. @@ -467,7 +681,9 @@ public: return SCRATCH_WRITE_ACCESS; return VMEM_WRITE_ACCESS; } - return VMEM_READ_ACCESS; + if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst)) + return VMEM_READ_ACCESS; + return VmemReadMapping[getVmemType(Inst)]; } bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; @@ -488,13 +704,6 @@ public: WaitcntBrackets *ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets); - bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, - MachineInstr &OldWaitcntInstr, - AMDGPU::Waitcnt &Wait, - MachineBasicBlock::instr_iterator It) const; - - // Transform a soft waitcnt into a normal one. 
- bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const; }; } // end anonymous namespace @@ -556,8 +765,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &Inst) { - InstCounterType T = eventCounter(E); - unsigned CurrScore = getScoreUB(T) + 1; + InstCounterType T = eventCounter(WaitEventMaskForInst, E); + + unsigned UB = getScoreUB(T); + unsigned CurrScore = UB + 1; if (CurrScore == 0) report_fatal_error("InsertWaitcnt score wraparound"); // PendingEvents and ScoreUB need to be update regardless if this event @@ -686,7 +897,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, if (!Op.isReg() || !Op.isDef()) continue; RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I); - if (T == VM_CNT) { + if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) { if (Interval.first >= NUM_ALL_VGPRS) continue; if (updateVMCntOnly(Inst)) { @@ -707,28 +918,73 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) { // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS // written can be accessed. A load from LDS to VMEM does not need a wait. - setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore); + unsigned Slot = 0; + for (const auto *MemOp : Inst.memoperands()) { + if (!MemOp->isStore() || + MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS) + continue; + // Comparing just AA info does not guarantee memoperands are equal + // in general, but this is so for LDS DMA in practice. + auto AAI = MemOp->getAAInfo(); + // Alias scope information gives a way to definitely identify an + // original memory object and practically produced in the module LDS + // lowering pass. If there is no scope available we will not be able + // to disambiguate LDS aliasing as after the module lowering all LDS + // is squashed into a single big object. Do not attempt to use one of + // the limited LDSDMAStores for something we will not be able to use + // anyway. + if (!AAI || !AAI.Scope) + break; + for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) { + for (const auto *MemOp : LDSDMAStores[I]->memoperands()) { + if (MemOp->isStore() && AAI == MemOp->getAAInfo()) { + Slot = I + 1; + break; + } + } + } + if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1) + break; + LDSDMAStores.push_back(&Inst); + Slot = LDSDMAStores.size(); + break; + } + setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore); + if (Slot) + setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore); } } } void WaitcntBrackets::print(raw_ostream &OS) { OS << '\n'; - for (auto T : inst_counter_types()) { + for (auto T : inst_counter_types(MaxCounter)) { unsigned SR = getScoreRange(T); switch (T) { - case VM_CNT: - OS << " VM_CNT(" << SR << "): "; + case LOAD_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT(" + << SR << "): "; break; - case LGKM_CNT: - OS << " LGKM_CNT(" << SR << "): "; + case DS_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT(" + << SR << "): "; break; case EXP_CNT: OS << " EXP_CNT(" << SR << "): "; break; - case VS_CNT: - OS << " VS_CNT(" << SR << "): "; + case STORE_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? 
"STORE" : "VS") << "_CNT(" + << SR << "): "; + break; + case SAMPLE_CNT: + OS << " SAMPLE_CNT(" << SR << "): "; + break; + case BVH_CNT: + OS << " BVH_CNT(" << SR << "): "; + break; + case KM_CNT: + OS << " KM_CNT(" << SR << "): "; break; default: OS << " UNKNOWN(" << SR << "): "; @@ -751,9 +1007,9 @@ void WaitcntBrackets::print(raw_ostream &OS) { } } // Also need to print sgpr scores for lgkm_cnt. - if (T == LGKM_CNT) { + if (T == SmemAccessCounter) { for (int J = 0; J <= SgprUB; J++) { - unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); + unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T); if (RegScore <= LB) continue; unsigned RelScore = RegScore - LB - 1; @@ -769,10 +1025,13 @@ void WaitcntBrackets::print(raw_ostream &OS) { /// Simplify the waitcnt, in the sense of removing redundant counts, and return /// whether a waitcnt instruction is needed at all. void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { - simplifyWaitcnt(VM_CNT, Wait.VmCnt); + simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt); simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); - simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt); - simplifyWaitcnt(VS_CNT, Wait.VsCnt); + simplifyWaitcnt(DS_CNT, Wait.DsCnt); + simplifyWaitcnt(STORE_CNT, Wait.StoreCnt); + simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); + simplifyWaitcnt(BVH_CNT, Wait.BvhCnt); + simplifyWaitcnt(KM_CNT, Wait.KmCnt); } void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, @@ -793,8 +1052,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, int RegNo, const unsigned LB = getScoreLB(T); const unsigned UB = getScoreUB(T); if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { - if ((T == VM_CNT || T == LGKM_CNT) && - hasPendingFlat() && + if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && !ST->hasFlatLgkmVMemCountInOrder()) { // If there is a pending FLAT operation, and this is a VMem or LGKM // waitcnt and the target can report early completion, then we need @@ -815,10 +1073,13 @@ void WaitcntBrackets::determineWait(InstCounterType T, int RegNo, } void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { - applyWaitcnt(VM_CNT, Wait.VmCnt); + applyWaitcnt(LOAD_CNT, Wait.LoadCnt); applyWaitcnt(EXP_CNT, Wait.ExpCnt); - applyWaitcnt(LGKM_CNT, Wait.LgkmCnt); - applyWaitcnt(VS_CNT, Wait.VsCnt); + applyWaitcnt(DS_CNT, Wait.DsCnt); + applyWaitcnt(STORE_CNT, Wait.StoreCnt); + applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); + applyWaitcnt(BVH_CNT, Wait.BvhCnt); + applyWaitcnt(KM_CNT, Wait.KmCnt); } void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { @@ -839,7 +1100,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { // the decrement may go out of order. bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { // Scalar memory read always can go out of order. - if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS)) + if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) return true; return hasMixedPendingEvents(T); } @@ -873,22 +1134,49 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName, return true; } -bool SIInsertWaitcnts::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { - unsigned Opcode = Waitcnt->getOpcode(); - if (!SIInstrInfo::isSoftWaitcnt(Opcode)) +/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction, +/// and if so, which counter it is waiting on. 
+static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) { + switch (Opcode) { + case AMDGPU::S_WAIT_LOADCNT: + return LOAD_CNT; + case AMDGPU::S_WAIT_EXPCNT: + return EXP_CNT; + case AMDGPU::S_WAIT_STORECNT: + return STORE_CNT; + case AMDGPU::S_WAIT_SAMPLECNT: + return SAMPLE_CNT; + case AMDGPU::S_WAIT_BVHCNT: + return BVH_CNT; + case AMDGPU::S_WAIT_DSCNT: + return DS_CNT; + case AMDGPU::S_WAIT_KMCNT: + return KM_CNT; + default: + return {}; + } +} + +bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { + unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode()); + if (Opcode == Waitcnt->getOpcode()) return false; - Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode))); + Waitcnt->setDesc(TII->get(Opcode)); return true; } -/// Combine consecutive waitcnt instructions that precede \p It and follow -/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added -/// by previous passes. Currently this pass conservatively assumes that these -/// preexisting waitcnt are required for correctness. -bool SIInsertWaitcnts::applyPreexistingWaitcnt( +/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that +/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits +/// from \p Wait that were added by previous passes. Currently this pass +/// conservatively assumes that these preexisting waits are required for +/// correctness. +bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { + assert(ST); + assert(isNormalMode(MaxCounter)); + bool Modified = false; MachineInstr *WaitcntInstr = nullptr; MachineInstr *WaitcntVsCntInstr = nullptr; @@ -898,12 +1186,12 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt( if (II.isMetaInstruction()) continue; - unsigned Opcode = II.getOpcode(); - bool IsSoft = SIInstrInfo::isSoftWaitcnt(Opcode); + unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode()); + bool IsSoft = Opcode != II.getOpcode(); - if (SIInstrInfo::isWaitcnt(Opcode)) { - // Update required wait count. If this is a soft waitcnt (= it was added - // by an earlier pass), it may be entirely removed. + // Update required wait count. If this is a soft waitcnt (= it was added + // by an earlier pass), it may be entirely removed. + if (Opcode == AMDGPU::S_WAITCNT) { unsigned IEnc = II.getOperand(0).getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); if (IsSoft) @@ -911,23 +1199,22 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt( Wait = Wait.combined(OldWait); // Merge consecutive waitcnt of the same type by erasing multiples. 
- if (WaitcntInstr || (!Wait.hasWaitExceptVsCnt() && IsSoft)) { + if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) { II.eraseFromParent(); Modified = true; } else WaitcntInstr = &II; - } else { - assert(SIInstrInfo::isWaitcntVsCnt(Opcode)); + assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); unsigned OldVSCnt = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); if (IsSoft) - ScoreBrackets.simplifyWaitcnt(InstCounterType::VS_CNT, OldVSCnt); - Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt); + ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt); + Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt); - if (WaitcntVsCntInstr || (!Wait.hasWaitVsCnt() && IsSoft)) { + if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) { II.eraseFromParent(); Modified = true; } else @@ -935,18 +1222,19 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt( } } - // Updated encoding of merged waitcnt with the required wait. if (WaitcntInstr) { Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16, AMDGPU::encodeWaitcnt(IV, Wait)); Modified |= promoteSoftWaitCnt(WaitcntInstr); - ScoreBrackets.applyWaitcnt(Wait); - Wait.VmCnt = ~0u; - Wait.LgkmCnt = ~0u; + ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt); + ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt); + ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); + Wait.LoadCnt = ~0u; Wait.ExpCnt = ~0u; + Wait.DsCnt = ~0u; - LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + LLVM_DEBUG(It == WaitcntInstr->getParent()->end() ? dbgs() << "applyPreexistingWaitcnt\n" << "New Instr at block end: " << *WaitcntInstr << '\n' @@ -957,12 +1245,13 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt( if (WaitcntVsCntInstr) { Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr, - AMDGPU::OpName::simm16, Wait.VsCnt); + AMDGPU::OpName::simm16, Wait.StoreCnt); Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr); - ScoreBrackets.applyWaitcnt(Wait); - Wait.VsCnt = ~0u; - LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt); + Wait.StoreCnt = ~0u; + + LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end() ? dbgs() << "applyPreexistingWaitcnt\n" << "New Instr at block end: " << *WaitcntVsCntInstr << '\n' @@ -974,6 +1263,293 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt( return Modified; } +/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any +/// required counters in \p Wait +bool WaitcntGeneratorPreGFX12::createNewWaitcnt( + MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, + AMDGPU::Waitcnt Wait) { + assert(ST); + assert(isNormalMode(MaxCounter)); + + bool Modified = false; + const DebugLoc &DL = Block.findDebugLoc(It); + + // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a + // single instruction while VScnt has its own instruction. 
+ if (Wait.hasWaitExceptStoreCnt()) { + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + Modified = true; + + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } + + if (Wait.hasWaitStoreCnt()) { + assert(ST->hasVscnt()); + + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(Wait.StoreCnt); + Modified = true; + + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } + + return Modified; +} + +/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and +/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that +/// were added by previous passes. Currently this pass conservatively +/// assumes that these preexisting waits are required for correctness. +bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( + WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, + AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { + assert(ST); + assert(!isNormalMode(MaxCounter)); + + bool Modified = false; + MachineInstr *CombinedLoadDsCntInstr = nullptr; + MachineInstr *CombinedStoreDsCntInstr = nullptr; + MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {}; + + for (auto &II : + make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) { + if (II.isMetaInstruction()) + continue; + + MachineInstr **UpdatableInstr; + + // Update required wait count. If this is a soft waitcnt (= it was added + // by an earlier pass), it may be entirely removed. + + unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode()); + bool IsSoft = Opcode != II.getOpcode(); + + if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) { + unsigned OldEnc = + TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc); + if (IsSoft) + ScoreBrackets.simplifyWaitcnt(OldWait); + Wait = Wait.combined(OldWait); + UpdatableInstr = &CombinedLoadDsCntInstr; + } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) { + unsigned OldEnc = + TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc); + if (IsSoft) + ScoreBrackets.simplifyWaitcnt(OldWait); + Wait = Wait.combined(OldWait); + UpdatableInstr = &CombinedStoreDsCntInstr; + } else { + std::optional<InstCounterType> CT = counterTypeForInstr(Opcode); + assert(CT.has_value()); + unsigned OldCnt = + TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + if (IsSoft) + ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt); + addWait(Wait, CT.value(), OldCnt); + UpdatableInstr = &WaitInstrs[CT.value()]; + } + + // Merge consecutive waitcnt of the same type by erasing multiples. + if (!*UpdatableInstr) { + *UpdatableInstr = &II; + } else { + II.eraseFromParent(); + Modified = true; + } + } + + if (CombinedLoadDsCntInstr) { + // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need + // to be waited for. Otherwise, let the instruction be deleted so + // the appropriate single counter wait instruction can be inserted + // instead, when new S_WAIT_*CNT instructions are inserted by + // createNewWaitcnt(). 
As a side effect, resetting the wait counts will + // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by + // the loop below that deals with single counter instructions. + if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) { + unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait); + Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr, + AMDGPU::OpName::simm16, NewEnc); + Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr); + ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt); + ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); + Wait.LoadCnt = ~0u; + Wait.DsCnt = ~0u; + + LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + ? dbgs() << "applyPreexistingWaitcnt\n" + << "New Instr at block end: " + << *CombinedLoadDsCntInstr << '\n' + : dbgs() << "applyPreexistingWaitcnt\n" + << "Old Instr: " << *It << "New Instr: " + << *CombinedLoadDsCntInstr << '\n'); + } else { + CombinedLoadDsCntInstr->eraseFromParent(); + Modified = true; + } + } + + if (CombinedStoreDsCntInstr) { + // Similarly for S_WAIT_STORECNT_DSCNT. + if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) { + unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait); + Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr, + AMDGPU::OpName::simm16, NewEnc); + Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr); + ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt); + ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt); + Wait.StoreCnt = ~0u; + Wait.DsCnt = ~0u; + + LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + ? dbgs() << "applyPreexistingWaitcnt\n" + << "New Instr at block end: " + << *CombinedStoreDsCntInstr << '\n' + : dbgs() << "applyPreexistingWaitcnt\n" + << "Old Instr: " << *It << "New Instr: " + << *CombinedStoreDsCntInstr << '\n'); + } else { + CombinedStoreDsCntInstr->eraseFromParent(); + Modified = true; + } + } + + // Look for an opportunity to convert existing S_WAIT_LOADCNT, + // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT + // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing + // instructions so that createNewWaitcnt() will create new combined + // instructions to replace them. + + if (Wait.DsCnt != ~0u) { + // This is a vector of addresses in WaitInstrs pointing to instructions + // that should be removed if they are present. + SmallVector<MachineInstr **, 2> WaitsToErase; + + // If it's known that both DScnt and either LOADcnt or STOREcnt (but not + // both) need to be waited for, ensure that there are no existing + // individual wait count instructions for these. + + if (Wait.LoadCnt != ~0u) { + WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]); + WaitsToErase.push_back(&WaitInstrs[DS_CNT]); + } else if (Wait.StoreCnt != ~0u) { + WaitsToErase.push_back(&WaitInstrs[STORE_CNT]); + WaitsToErase.push_back(&WaitInstrs[DS_CNT]); + } + + for (MachineInstr **WI : WaitsToErase) { + if (!*WI) + continue; + + (*WI)->eraseFromParent(); + *WI = nullptr; + Modified = true; + } + } + + for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { + if (!WaitInstrs[CT]) + continue; + + unsigned NewCnt = getWait(Wait, CT); + if (NewCnt != ~0u) { + Modified |= updateOperandIfDifferent(*WaitInstrs[CT], + AMDGPU::OpName::simm16, NewCnt); + Modified |= promoteSoftWaitCnt(WaitInstrs[CT]); + + ScoreBrackets.applyWaitcnt(CT, NewCnt); + setNoWait(Wait, CT); + + LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + ? 
dbgs() << "applyPreexistingWaitcnt\n" + << "New Instr at block end: " << *WaitInstrs[CT] + << '\n' + : dbgs() << "applyPreexistingWaitcnt\n" + << "Old Instr: " << *It + << "New Instr: " << *WaitInstrs[CT] << '\n'); + } else { + WaitInstrs[CT]->eraseFromParent(); + Modified = true; + } + } + + return Modified; +} + +/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait +bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( + MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, + AMDGPU::Waitcnt Wait) { + assert(ST); + assert(!isNormalMode(MaxCounter)); + + bool Modified = false; + const DebugLoc &DL = Block.findDebugLoc(It); + + // Check for opportunities to use combined wait instructions. + if (Wait.DsCnt != ~0u) { + MachineInstr *SWaitInst = nullptr; + + if (Wait.LoadCnt != ~0u) { + unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait); + + SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) + .addImm(Enc); + + Wait.LoadCnt = ~0u; + Wait.DsCnt = ~0u; + } else if (Wait.StoreCnt != ~0u) { + unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait); + + SWaitInst = + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT)) + .addImm(Enc); + + Wait.StoreCnt = ~0u; + Wait.DsCnt = ~0u; + } + + if (SWaitInst) { + Modified = true; + + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } + } + + // Generate an instruction for any remaining counter that needs + // waiting for. + + for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { + unsigned Count = getWait(Wait, CT); + if (Count == ~0u) + continue; + + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) + .addImm(Count); + + Modified = true; + + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } + + return Modified; +} + static bool readsVCCZ(const MachineInstr &MI) { unsigned Opc = MI.getOpcode(); return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && @@ -1027,7 +1603,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL || MI.getOpcode() == AMDGPU::BUFFER_GL0_INV || MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) { - Wait.VmCnt = 0; + Wait.LoadCnt = 0; } // All waits must be resolved at call return. @@ -1037,16 +1613,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, MI.getOpcode() == AMDGPU::SI_RETURN || MI.getOpcode() == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { - Wait = Wait.combined(AMDGPU::Waitcnt::allZeroExceptVsCnt()); + Wait = Wait.combined( + AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts())); } // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM // stores. In this case it can be useful to send a message to explicitly // release all VGPRs before the stores have completed, but it is only safe to - // do this if there are no outstanding scratch stores. 
+ // do this if: + // * there are no outstanding scratch stores + // * we are not in Dynamic VGPR mode else if (MI.getOpcode() == AMDGPU::S_ENDPGM || MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone && - ScoreBrackets.getScoreRange(VS_CNT) != 0 && + ScoreBrackets.getScoreRange(STORE_CNT) != 0 && !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)) ReleaseVGPRInsts.insert(&MI); } @@ -1056,7 +1635,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ST->hasLegacyGeometry() && ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { - Wait.VmCnt = 0; + Wait.LoadCnt = 0; } #if 0 // TODO: the following blocks of logic when we have fence. else if (MI.getOpcode() == SC_FENCE) { @@ -1073,12 +1652,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, case SCMEM_LDS: if (group_is_multi_wave || context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) { - EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, - ScoreBrackets->getScoreUB(LGKM_CNT)); - // LDS may have to wait for VM_CNT after buffer load to LDS + EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT, + ScoreBrackets->getScoreUB(DS_CNT)); + // LDS may have to wait for VMcnt after buffer load to LDS if (target_info->HasBufferLoadToLDS()) { - EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, - ScoreBrackets->getScoreUB(VM_CNT)); + EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT, + ScoreBrackets->getScoreUB(LOAD_CNT)); } } break; @@ -1087,8 +1666,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (group_is_multi_wave || fence_is_global) { EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, - ScoreBrackets->getScoreUB(LGKM_CNT)); + EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT, + ScoreBrackets->getScoreUB(DS_CNT)); } break; @@ -1099,8 +1678,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (group_is_multi_wave || fence_is_global) { EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, - ScoreBrackets->getScoreUB(VM_CNT)); + EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT, + ScoreBrackets->getScoreUB(LOAD_CNT)); } break; @@ -1143,7 +1722,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, for (int RegNo = CallAddrOpInterval.first; RegNo < CallAddrOpInterval.second; ++RegNo) - ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); + ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait); int RtnAddrOpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); @@ -1153,7 +1732,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, for (int RegNo = RtnAddrOpInterval.first; RegNo < RtnAddrOpInterval.second; ++RegNo) - ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); + ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait); } } } else { @@ -1170,10 +1749,11 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // instruction to guarantee the right WAW order. // 2) If a destination operand that was used by a recent export/store ins, // add s_waitcnt on exp_cnt to guarantee the WAR order. 
+ for (const MachineMemOperand *Memop : MI.memoperands()) { const Value *Ptr = Memop->getValue(); if (Memop->isStore() && SLoadAddresses.count(Ptr)) { - addWait(Wait, LGKM_CNT, 0); + addWait(Wait, SmemAccessCounter, 0); if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second)) SLoadAddresses.erase(Ptr); } @@ -1183,9 +1763,27 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // No need to wait before load from VMEM to LDS. if (TII->mayWriteLDSThroughDMA(MI)) continue; + + // LOAD_CNT is only relevant to vgpr or LDS. unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; - // VM_CNT is only relevant to vgpr or LDS. - ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); + bool FoundAliasingStore = false; + // Only objects with alias scope info were added to LDSDMAScopes array. + // In the absense of the scope info we will not be able to disambiguate + // aliasing here. There is no need to try searching for a corresponding + // store slot. This is conservatively correct because in that case we + // will produce a wait using the first (general) LDS DMA wait slot which + // will wait on all of them anyway. + if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) { + const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); + for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { + if (MI.mayAlias(AA, *LDSDMAStores[I], true)) { + FoundAliasingStore = true; + ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait); + } + } + } + if (!FoundAliasingStore) + ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); if (Memop->isStore()) { ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); } @@ -1213,14 +1811,18 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (Op.isUse() || !updateVMCntOnly(MI) || ScoreBrackets.hasOtherPendingVmemTypes(RegNo, getVmemType(MI))) { - ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); + ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); + ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait); + ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait); ScoreBrackets.clearVgprVmemTypes(RegNo); } if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); } + ScoreBrackets.determineWait(DS_CNT, RegNo, Wait); + } else { + ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait); } - ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait); } } } @@ -1232,7 +1834,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here. if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { - Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); + Wait = Wait.combined( + AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt())); } // TODO: Remove this work-around, enable the assert for Bug 457939 @@ -1240,7 +1843,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // independent of target. 
if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { - Wait.LgkmCnt = 0; + Wait.DsCnt = 0; } } @@ -1248,35 +1851,54 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ScoreBrackets.simplifyWaitcnt(Wait); if (ForceEmitZeroWaitcnts) - Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(); + Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()); - if (ForceEmitWaitcnt[VM_CNT]) - Wait.VmCnt = 0; + if (ForceEmitWaitcnt[LOAD_CNT]) + Wait.LoadCnt = 0; if (ForceEmitWaitcnt[EXP_CNT]) Wait.ExpCnt = 0; - if (ForceEmitWaitcnt[LGKM_CNT]) - Wait.LgkmCnt = 0; + if (ForceEmitWaitcnt[DS_CNT]) + Wait.DsCnt = 0; + if (ForceEmitWaitcnt[SAMPLE_CNT]) + Wait.SampleCnt = 0; + if (ForceEmitWaitcnt[BVH_CNT]) + Wait.BvhCnt = 0; + if (ForceEmitWaitcnt[KM_CNT]) + Wait.KmCnt = 0; if (FlushVmCnt) { - if (ScoreBrackets.hasPendingEvent(VM_CNT)) - Wait.VmCnt = 0; + if (ScoreBrackets.hasPendingEvent(LOAD_CNT)) + Wait.LoadCnt = 0; + if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT)) + Wait.SampleCnt = 0; + if (ScoreBrackets.hasPendingEvent(BVH_CNT)) + Wait.BvhCnt = 0; } return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets, OldWaitcntInstr); } -// Add a waitcnt to flush the vmcnt counter at the end of the given block if -// needed. +// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the +// end of the given block if needed. bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, MachineInstr *OldWaitcntInstr) { AMDGPU::Waitcnt Wait; - if (!ScoreBrackets.hasPendingEvent(VM_CNT)) + unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT); + unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT); + unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT); + + if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0) return false; - Wait.VmCnt = 0; + if (LoadCntPending != 0) + Wait.LoadCnt = 0; + if (SampleCntPending != 0) + Wait.SampleCnt = 0; + if (BvhCntPending != 0) + Wait.BvhCnt = 0; return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets, OldWaitcntInstr); @@ -1288,15 +1910,16 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, WaitcntBrackets &ScoreBrackets, MachineInstr *OldWaitcntInstr) { bool Modified = false; - const DebugLoc &DL = Block.findDebugLoc(It); if (OldWaitcntInstr) // Try to merge the required wait with preexisting waitcnt instructions. // Also erase redundant waitcnt. Modified = - applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); - else - ScoreBrackets.applyWaitcnt(Wait); + WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); + + // Any counts that could have been applied to any existing waitcnt + // instructions will have been done so, now deal with any remaining. + ScoreBrackets.applyWaitcnt(Wait); // ExpCnt can be merged into VINTERP. if (Wait.ExpCnt != ~0u && It != Block.instr_end() && @@ -1309,35 +1932,13 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, } Wait.ExpCnt = ~0u; - LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" + LLVM_DEBUG(dbgs() << "generateWaitcnt\n" << "Update Instr: " << *It); } - // Build new waitcnt instructions unless no wait is needed or the old waitcnt - // instruction was modified to handle the required wait. 
- if (Wait.hasWaitExceptVsCnt()) { - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + if (WCG->createNewWaitcnt(Block, It, Wait)) Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); - } - - if (Wait.hasWaitVsCnt()) { - assert(ST->hasVscnt()); - - [[maybe_unused]] auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(Wait.VsCnt); - Modified = true; - - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); - } return Modified; } @@ -1435,7 +2036,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, // Now look at the instruction opcode. If it is a memory access // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. - // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. + // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere. + if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { if (TII->isAlwaysGDS(Inst.getOpcode()) || TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { @@ -1486,7 +2088,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } else if (Inst.isCall()) { if (callWaitsOnFunctionReturn(Inst)) { // Act as a wait on everything - ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt()); + ScoreBrackets->applyWaitcnt( + AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts())); ScoreBrackets->setStateOnFunctionEntryOrReturn(); } else { // May need to way wait for anything. @@ -1546,7 +2149,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { VgprUB = std::max(VgprUB, Other.VgprUB); SgprUB = std::max(SgprUB, Other.SgprUB); - for (auto T : inst_counter_types()) { + for (auto T : inst_counter_types(MaxCounter)) { // Merge event flags for this counter const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; @@ -1574,7 +2177,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { for (int J = 0; J <= VgprUB; J++) StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); - if (T == LGKM_CNT) { + if (T == SmemAccessCounter) { for (int J = 0; J <= SgprUB; J++) StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]); } @@ -1590,10 +2193,13 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { } static bool isWaitInstr(MachineInstr &Inst) { - auto Opcode = Inst.getOpcode(); - return SIInstrInfo::isWaitcnt(Opcode) || - (SIInstrInfo::isWaitcntVsCnt(Opcode) && Inst.getOperand(0).isReg() && - Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL); + unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode()); + return Opcode == AMDGPU::S_WAITCNT || + (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() && + Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) || + Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || + Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || + counterTypeForInstr(Opcode).has_value(); } // Generate s_waitcnt instructions where needed. 
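Note on the WaitcntGenerator split introduced above: the pass now selects one generator per function (see the runOnMachineFunction() hunk further below) based on GCNSubtarget::hasExtendedWaitCounts(), and the insertion logic only talks to the WaitcntGenerator interface. The following standalone C++ sketch shows that dispatch shape in miniature; the class names, the emitWait() signature and the printed mnemonics are illustrative only and are not the LLVM API, which edits MachineInstrs in place rather than printing text.

#include <cstdio>
#include <memory>

// Convention mirrored from the pass: ~0u means "no wait needed on this
// counter" (as in the AMDGPU::Waitcnt fields).
struct WaitGen {
  virtual ~WaitGen() = default;
  // Returns true if a wait instruction would have been emitted.
  virtual bool emitWait(unsigned LoadCnt, unsigned DsCnt) = 0;
};

// Pre-gfx12 shape: one combined s_waitcnt immediate covers both counters.
struct PreGFX12Gen : WaitGen {
  bool emitWait(unsigned LoadCnt, unsigned DsCnt) override {
    if (LoadCnt == ~0u && DsCnt == ~0u)
      return false;
    std::printf("s_waitcnt");
    if (LoadCnt != ~0u) std::printf(" vmcnt(%u)", LoadCnt);
    if (DsCnt != ~0u) std::printf(" lgkmcnt(%u)", DsCnt);
    std::printf("\n");
    return true;
  }
};

// gfx12 shape: separate counters, with a combined load/DS form when both
// need to be waited for.
struct GFX12Gen : WaitGen {
  bool emitWait(unsigned LoadCnt, unsigned DsCnt) override {
    if (LoadCnt != ~0u && DsCnt != ~0u) {
      std::printf("s_wait_loadcnt_dscnt loadcnt(%u) dscnt(%u)\n", LoadCnt, DsCnt);
      return true;
    }
    if (LoadCnt != ~0u) { std::printf("s_wait_loadcnt %u\n", LoadCnt); return true; }
    if (DsCnt != ~0u) { std::printf("s_wait_dscnt %u\n", DsCnt); return true; }
    return false;
  }
};

int main() {
  bool HasExtendedWaitCounts = true; // in the pass this comes from the subtarget
  std::unique_ptr<WaitGen> WCG = HasExtendedWaitCounts
                                     ? std::unique_ptr<WaitGen>(new GFX12Gen())
                                     : std::unique_ptr<WaitGen>(new PreGFX12Gen());
  WCG->emitWait(0, 0);   // drain both counters
  WCG->emitWait(~0u, 3); // only the DS/LGKM counter needs a wait
}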
@@ -1699,8 +2305,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // an S_WAITCNT vmcnt(0) if (RequireCheckResourceType(Inst, context)) { // Force the score to as if an S_WAITCNT vmcnt(0) is emitted. - ScoreBrackets->setScoreLB(VM_CNT, - ScoreBrackets->getScoreUB(VM_CNT)); + ScoreBrackets->setScoreLB(LOAD_CNT, + ScoreBrackets->getScoreUB(LOAD_CNT)); } #endif @@ -1802,7 +2408,12 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, VgprUse.insert(RegNo); // If at least one of Op's registers is in the score brackets, the // value is likely loaded outside of the loop. - if (Brackets.getRegScore(RegNo, VM_CNT) > Brackets.getScoreLB(VM_CNT)) { + if (Brackets.getRegScore(RegNo, LOAD_CNT) > + Brackets.getScoreLB(LOAD_CNT) || + Brackets.getRegScore(RegNo, SAMPLE_CNT) > + Brackets.getScoreLB(SAMPLE_CNT) || + Brackets.getRegScore(RegNo, BVH_CNT) > + Brackets.getScoreLB(BVH_CNT)) { UsesVgprLoadedOutside = true; break; } @@ -1830,23 +2441,48 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); - IV = AMDGPU::getIsaVersion(ST->getCPU()); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); MLI = &getAnalysis<MachineLoopInfo>(); PDT = &getAnalysis<MachinePostDominatorTree>(); + if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>()) + AA = &AAR->getAAResults(); + + AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); + + if (ST->hasExtendedWaitCounts()) { + MaxCounter = NUM_EXTENDED_INST_CNTS; + WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter); + WCG = &WCGGFX12Plus; + } else { + MaxCounter = NUM_NORMAL_INST_CNTS; + WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST); + WCG = &WCGPreGFX12; + } ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; + const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask(); + + SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); + OptNone = MF.getFunction().hasOptNone() || MF.getTarget().getOptLevel() == CodeGenOptLevel::None; HardwareLimits Limits = {}; - Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV); + if (ST->hasExtendedWaitCounts()) { + Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); + Limits.DscntMax = AMDGPU::getDscntBitMask(IV); + } else { + Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV); + Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV); + } Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); - Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); - Limits.VscntMax = ST->hasVscnt() ? 63 : 0; + Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV); + Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV); + Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV); + Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV); unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(); unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); @@ -1864,6 +2500,9 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { BlockInfos.clear(); bool Modified = false; + MachineBasicBlock &EntryBB = MF.front(); + MachineBasicBlock::iterator I = EntryBB.begin(); + if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may // depend on. We can't track them and it's better to do the wait after the @@ -1871,15 +2510,28 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { // TODO: Could insert earlier and schedule more liberally with operations // that only use caller preserved registers. 
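
Aside: the HardwareLimits block above stops hard-coding VscntMax (previously 63) and instead derives each counter's maximum from its encoded width for the current ISA; the new get*BitWidth helpers near the end of this diff report, for example, 6 bits for loadcnt/dscnt on GFX12 and 0 when a counter does not exist. A minimal sketch of that relationship, outside of LLVM:

    // A counter encoded in W bits can express waits from 0 to (1 << W) - 1;
    // width 0 means the counter is absent on that generation.
    constexpr unsigned counterMax(unsigned Width) {
      return Width ? (1u << Width) - 1u : 0u;
    }

    static_assert(counterMax(6) == 63, "6-bit counter saturates at 63");
    static_assert(counterMax(0) == 0, "absent counter has no wait range");
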
- MachineBasicBlock &EntryBB = MF.front(); - MachineBasicBlock::iterator I = EntryBB.begin(); for (MachineBasicBlock::iterator E = EntryBB.end(); I != E && (I->isPHI() || I->isMetaInstruction()); ++I) ; - BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); - auto NonKernelInitialState = - std::make_unique<WaitcntBrackets>(ST, Limits, Encoding); + if (ST->hasExtendedWaitCounts()) { + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT)) + .addImm(0); + for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { + if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT) + continue; + + BuildMI(EntryBB, I, DebugLoc(), + TII->get(instrsForExtendedCounterTypes[CT])) + .addImm(0); + } + } else { + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); + } + + auto NonKernelInitialState = std::make_unique<WaitcntBrackets>( + ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst, + SmemAccessCounter); NonKernelInitialState->setStateOnFunctionEntryOrReturn(); BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState); @@ -1910,9 +2562,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { *Brackets = *BI.Incoming; } else { if (!Brackets) - Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding); + Brackets = std::make_unique<WaitcntBrackets>( + ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst, + SmemAccessCounter); else - *Brackets = WaitcntBrackets(ST, Limits, Encoding); + *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding, + WaitEventMaskForInst, SmemAccessCounter); } Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index e50f5f28e030..f4ca27808a30 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2410,13 +2410,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // the encoding of $symbol starts 12 bytes after the start of the s_add_u32 // instruction. 
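
Aside: for non-entry functions, the hunk above now emits one zeroing wait per extended counter on targets with extended wait counts (a combined s_wait_loadcnt_dscnt plus the remaining individual counters, with STORE_CNT deliberately skipped), where older targets keep the single s_waitcnt 0. A loose sketch of that selection; the mnemonic strings below are illustrative only:

    #include <string>
    #include <vector>

    // Which zero-waits to emit at the top of a non-entry function.
    std::vector<std::string> entryWaits(bool HasExtendedWaitCounts) {
      if (!HasExtendedWaitCounts)
        return {"s_waitcnt 0"};                      // legacy combined wait
      return {"s_wait_loadcnt_dscnt 0",              // covers LOAD_CNT and DS_CNT
              "s_wait_expcnt 0", "s_wait_samplecnt 0",
              "s_wait_bvhcnt 0", "s_wait_kmcnt 0"};  // STORE_CNT skipped, as above
    }
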
+ int64_t Adjust = 0; + if (ST.hasGetPCZeroExtension()) { + // Fix up hardware that does not sign-extend the 48-bit PC value by + // inserting: s_sext_i32_i16 reghi, reghi + Bundler.append( + BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi)); + Adjust += 4; + } + if (OpLo.isGlobal()) - OpLo.setOffset(OpLo.getOffset() + 4); + OpLo.setOffset(OpLo.getOffset() + Adjust + 4); Bundler.append( BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo)); if (OpHi.isGlobal()) - OpHi.setOffset(OpHi.getOffset() + 12); + OpHi.setOffset(OpHi.getOffset() + Adjust + 12); Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) .addReg(RegHi) .add(OpHi)); @@ -2480,6 +2489,19 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::S_MUL_I64_I32_PSEUDO: MI.setDesc(get(AMDGPU::S_MUL_U64)); break; + + case AMDGPU::S_GETPC_B64_pseudo: + MI.setDesc(get(AMDGPU::S_GETPC_B64)); + if (ST.hasGetPCZeroExtension()) { + Register Dst = MI.getOperand(0).getReg(); + Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + // Fix up hardware that does not sign-extend the 48-bit PC value by + // inserting: s_sext_i32_i16 dsthi, dsthi + BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16), + DstHi) + .addReg(DstHi); + } + break; } return true; } @@ -5280,7 +5302,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64 : AMDGPU::V_CEIL_F16_fake16_e64; case AMDGPU::S_FLOOR_F16: - return AMDGPU::V_FLOOR_F16_fake16_e64; + return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64 + : AMDGPU::V_FLOOR_F16_fake16_e64; case AMDGPU::S_TRUNC_F16: return AMDGPU::V_TRUNC_F16_fake16_e64; case AMDGPU::S_RNDNE_F16: @@ -8756,6 +8779,7 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const { static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = { {MONoClobber, "amdgpu-noclobber"}, + {MOLastUse, "amdgpu-last-use"}, }; return ArrayRef(TargetFlags); @@ -8944,8 +8968,9 @@ bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, // Depending on the used address space and instructions, some immediate offsets // are allowed and some are not. -// In general, flat instruction offsets can only be non-negative, global and -// scratch instruction offsets can also be negative. +// Pre-GFX12, flat instruction offsets can only be non-negative, global and +// scratch instruction offsets can also be negative. On GFX12, offsets can be +// negative for all variants. // // There are several bugs related to these offsets: // On gfx10.1, flat instructions that go into the global address space cannot @@ -9076,8 +9101,7 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const { } int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { - if (SIInstrInfo::isSoftWaitcnt(Opcode)) - Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode); + Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode); unsigned Gen = subtargetEncodingFamily(ST); @@ -9113,12 +9137,6 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); - // TODO-GFX12: Remove this. - // Hack to allow some GFX12 codegen tests to run before all the encodings are - // implemented. - if (MCOp == (uint16_t)-1 && Gen == SIEncodingFamily::GFX12) - MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX11); - // -1 means that Opcode is already a native instruction. 
if (MCOp == -1) return Opcode; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 37ee159362a2..fc85b089aa47 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -41,6 +41,10 @@ class ScheduleHazardRecognizer; static const MachineMemOperand::Flags MONoClobber = MachineMemOperand::MOTargetFlag1; +/// Mark the MMO of a load as the last use. +static const MachineMemOperand::Flags MOLastUse = + MachineMemOperand::MOTargetFlag2; + /// Utility to store machine instructions worklist. struct SIInstrWorklist { SIInstrWorklist() = default; @@ -905,29 +909,24 @@ public: } static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) { - if (isWaitcnt(Opcode)) + switch (Opcode) { + case AMDGPU::S_WAITCNT_soft: return AMDGPU::S_WAITCNT; - - if (isWaitcntVsCnt(Opcode)) + case AMDGPU::S_WAITCNT_VSCNT_soft: return AMDGPU::S_WAITCNT_VSCNT; - - llvm_unreachable("Expected opcode S_WAITCNT/S_WAITCNT_VSCNT"); - } - - static bool isWaitcnt(unsigned Opcode) { - return Opcode == AMDGPU::S_WAITCNT || Opcode == AMDGPU::S_WAITCNT_soft; - } - - static bool isWaitcntVsCnt(unsigned Opcode) { - return Opcode == AMDGPU::S_WAITCNT_VSCNT || - Opcode == AMDGPU::S_WAITCNT_VSCNT_soft; - } - - // "Soft" waitcnt instructions can be relaxed/optimized out by - // SIInsertWaitcnts. - static bool isSoftWaitcnt(unsigned Opcode) { - return Opcode == AMDGPU::S_WAITCNT_soft || - Opcode == AMDGPU::S_WAITCNT_VSCNT_soft; + case AMDGPU::S_WAIT_LOADCNT_soft: + return AMDGPU::S_WAIT_LOADCNT; + case AMDGPU::S_WAIT_STORECNT_soft: + return AMDGPU::S_WAIT_STORECNT; + case AMDGPU::S_WAIT_SAMPLECNT_soft: + return AMDGPU::S_WAIT_SAMPLECNT; + case AMDGPU::S_WAIT_BVHCNT_soft: + return AMDGPU::S_WAIT_BVHCNT; + case AMDGPU::S_WAIT_DSCNT_soft: + return AMDGPU::S_WAIT_DSCNT; + default: + return Opcode; + } } bool isVGPRCopy(const MachineInstr &MI) const { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 04c92155f5aa..a6820544f4b4 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -41,10 +41,29 @@ def SIEncodingFamily { def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; -def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD", - SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>, - [SDNPMayLoad, SDNPMemOperand] ->; +def SDTSBufferLoad : SDTypeProfile<1, 3, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // offset(imm) + SDTCisVT<3, i32>]>; // cachepolicy + +def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD", SDTSBufferLoad, + [SDNPMayLoad, SDNPMemOperand]>; + +def SIsbuffer_load_byte : SDNode<"AMDGPUISD::SBUFFER_LOAD_BYTE", SDTSBufferLoad, + [SDNPMayLoad, SDNPMemOperand]>; + +def SIsbuffer_load_ubyte + : SDNode<"AMDGPUISD::SBUFFER_LOAD_UBYTE", SDTSBufferLoad, + [SDNPMayLoad, SDNPMemOperand]>; + +def SIsbuffer_load_short + : SDNode<"AMDGPUISD::SBUFFER_LOAD_SHORT", SDTSBufferLoad, + [SDNPMayLoad, SDNPMemOperand]>; + +def SIsbuffer_load_ushort + : SDNode<"AMDGPUISD::SBUFFER_LOAD_USHORT", SDTSBufferLoad, + [SDNPMayLoad, SDNPMemOperand]>; def SIds_ordered_count : SDNode<"AMDGPUISD::DS_ORDERED_COUNT", SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i16>]>, @@ -195,8 +214,10 @@ defm SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">; 
defm SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; defm SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">; defm SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; +defm SIbuffer_atomic_fadd_bf16 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD_BF16">; defm SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; defm SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; +defm SIbuffer_atomic_cond_sub_u32 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32">; def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, @@ -281,49 +302,10 @@ def SIfptrunc_round_downward : SDNode<"AMDGPUISD::FPTRUNC_ROUND_DOWNWARD", // ValueType helpers //===----------------------------------------------------------------------===// -// Returns 1 if the source arguments have modifiers, 0 if they do not. -class isFloatType<ValueType SrcVT> { - bit ret = !or(!eq(SrcVT.Value, f16.Value), - !eq(SrcVT.Value, f32.Value), - !eq(SrcVT.Value, f64.Value), - !eq(SrcVT.Value, v2f16.Value), - !eq(SrcVT.Value, v4f16.Value), - !eq(SrcVT.Value, v8f16.Value), - !eq(SrcVT.Value, v16f16.Value), - !eq(SrcVT.Value, v2f32.Value), - !eq(SrcVT.Value, v4f32.Value), - !eq(SrcVT.Value, v8f32.Value), - !eq(SrcVT.Value, v2f64.Value), - !eq(SrcVT.Value, v4f64.Value)); -} - -// XXX - do v2i16 instructions? class isIntType<ValueType SrcVT> { - bit ret = !or(!eq(SrcVT.Value, i8.Value), - !eq(SrcVT.Value, i16.Value), - !eq(SrcVT.Value, i32.Value), - !eq(SrcVT.Value, i64.Value), - !eq(SrcVT.Value, v4i16.Value), - !eq(SrcVT.Value, v8i16.Value), - !eq(SrcVT.Value, v16i16.Value), - !eq(SrcVT.Value, v2i32.Value), - !eq(SrcVT.Value, v4i32.Value), - !eq(SrcVT.Value, v8i32.Value)); + bit ret = !and(SrcVT.isInteger, !ne(SrcVT.Value, i1.Value)); } -class isPackedType<ValueType SrcVT> { - bit ret = !or(!eq(SrcVT.Value, v2i16.Value), - !eq(SrcVT.Value, v2f16.Value), - !eq(SrcVT.Value, v4f16.Value), - !eq(SrcVT.Value, v2i32.Value), - !eq(SrcVT.Value, v2f32.Value), - !eq(SrcVT.Value, v4i32.Value), - !eq(SrcVT.Value, v4f32.Value), - !eq(SrcVT.Value, v8i32.Value), - !eq(SrcVT.Value, v8f32.Value)); -} - - //===----------------------------------------------------------------------===// // PatFrags for global memory operations //===----------------------------------------------------------------------===// @@ -806,12 +788,9 @@ class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(Bit, SDLoc(N), MVT::i1); }]>; -def SIMM16bit : ImmLeaf <i32, - [{return isInt<16>(Imm);}] ->; - -def UIMM16bit : ImmLeaf <i32, - [{return isUInt<16>(Imm);}] +def SIMM16bit : TImmLeaf <i32, + [{return isInt<16>(Imm) || isUInt<16>(Imm);}], + as_i16timm >; def i64imm_32bit : ImmLeaf<i64, [{ @@ -885,8 +864,11 @@ def extract_swz : SDNodeXForm<timm, [{ return CurDAG->getTargetConstant(Swizzle, SDLoc(N), MVT::i8); }]>; -def set_glc : SDNodeXForm<timm, [{ - return CurDAG->getTargetConstant(N->getZExtValue() | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8); +def extract_cpol_set_glc : SDNodeXForm<timm, [{ + const uint32_t cpol = N->getZExtValue() & (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12 + ? 
AMDGPU::CPol::ALL + : AMDGPU::CPol::ALL_pregfx12); + return CurDAG->getTargetConstant(cpol | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8); }]>; //===----------------------------------------------------------------------===// @@ -993,7 +975,7 @@ def ExpSrc3 : RegisterOperand<VGPR_32> { class SDWASrc<ValueType vt> : RegisterOperand<VS_32> { let OperandNamespace = "AMDGPU"; - string Type = !if(isFloatType<vt>.ret, "FP", "INT"); + string Type = !if(vt.isFP, "FP", "INT"); let OperandType = "OPERAND_REG_INLINE_C_"#Type#vt.Size; let DecoderMethod = "decodeSDWASrc"#vt.Size; let EncoderMethod = "getSDWASrcEncoding"; @@ -1241,17 +1223,20 @@ def FPVRegInputModsMatchClass : AsmOperandClass { let PredicateMethod = "isVRegWithInputMods"; } -def FPT16VRegInputModsMatchClass : AsmOperandClass { - let Name = "T16VRegWithFPInputMods"; +class FPT16VRegInputModsMatchClass<bit IsFake16> : AsmOperandClass { + let Name = !if(IsFake16, "Fake16VRegWithFPInputMods", + "T16VRegWithFPInputMods"); let ParserMethod = "parseRegWithFPInputMods"; - let PredicateMethod = "isT16VRegWithInputMods"; + let PredicateMethod = "isT16VRegWithInputMods<" # + !if(IsFake16, "true", "false") # ">"; } def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { let PrintMethod = "printOperandAndFPInputMods"; } -def FPT16VRegInputMods : InputMods <FPT16VRegInputModsMatchClass> { +class FPT16VRegInputMods<bit IsFake16> + : InputMods <FPT16VRegInputModsMatchClass<IsFake16>> { let PrintMethod = "printOperandAndFPInputMods"; } @@ -1283,13 +1268,16 @@ def IntVRegInputModsMatchClass : AsmOperandClass { let PredicateMethod = "isVRegWithInputMods"; } -def IntT16VRegInputModsMatchClass : AsmOperandClass { - let Name = "T16VRegWithIntInputMods"; +class IntT16VRegInputModsMatchClass<bit IsFake16> : AsmOperandClass { + let Name = !if(IsFake16, "Fake16VRegWithIntInputMods", + "T16VRegWithIntInputMods"); let ParserMethod = "parseRegWithIntInputMods"; - let PredicateMethod = "isT16VRegWithInputMods"; + let PredicateMethod = "isT16VRegWithInputMods<" # + !if(IsFake16, "true", "false") # ">"; } -def IntT16VRegInputMods : InputMods <IntT16VRegInputModsMatchClass> { +class IntT16VRegInputMods<bit IsFake16> + : InputMods <IntT16VRegInputModsMatchClass<IsFake16>> { let PrintMethod = "printOperandAndIntInputMods"; } @@ -1353,7 +1341,7 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">; def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">; def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">; -def DotIUVOP3PMods : ComplexPattern<untyped, 1, "SelectDotIUVOP3PMods">; +def VOP3PModsNeg : ComplexPattern<untyped, 1, "SelectVOP3PModsNeg">; def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">; def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">; @@ -1489,20 +1477,18 @@ class getSDWADstForVT<ValueType VT> { // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. 
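
Aside: extract_cpol_set_glc, introduced a little earlier in these hunks, replaces set_glc with a masked read-modify-write: keep only the cache-policy bits that are defined for the generation (CPol::ALL on GFX12+, CPol::ALL_pregfx12 otherwise), then force GLC on. In plain C++, with the bit values treated as placeholders for the real AMDGPU::CPol constants:

    #include <cstdint>

    constexpr uint32_t GLC = 1u << 0; // placeholder; the real value is AMDGPU::CPol::GLC

    uint32_t setGlc(uint32_t RawCPol, uint32_t ValidBitsForGen /* ALL or ALL_pregfx12 */) {
      return (RawCPol & ValidBitsForGen) | GLC; // drop undefined bits, set GLC
    }
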
class getVOPSrc0ForVT<ValueType VT, bit IsTrue16, bit IsFake16 = 1> { - bit isFP = isFloatType<VT>.ret; - RegisterOperand ret = - !if(isFP, + !if(VT.isFP, !if(!eq(VT.Size, 64), VSrc_f64, - !if(!eq(VT.Value, f16.Value), + !if(!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), !if(IsTrue16, !if(IsFake16, VSrcFake16_f16_Lo128, VSrcT_f16_Lo128), VSrc_f16 ), - !if(!eq(VT.Value, v2f16.Value), + !if(!or(!eq(VT.Value, v2f16.Value), !eq(VT.Value, v2bf16.Value)), VSrc_v2f16, - !if(!eq(VT.Value, v4f16.Value), + !if(!or(!eq(VT.Value, v4f16.Value), !eq(VT.Value, v4bf16.Value)), AVSrc_64, VSrc_f32 ) @@ -1530,43 +1516,33 @@ class getSOPSrcForVT<ValueType VT> { } // Returns the vreg register class to use for source operand given VT -class getVregSrcForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128, - !if(!eq(VT.Size, 96), VReg_96, - !if(!eq(VT.Size, 64), VReg_64, - !if(!eq(VT.Size, 48), VReg_64, - VGPR_32)))); -} - -class getVregSrcForVT_t16<ValueType VT, bit IsFake16 = 1> { - RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128, - !if(!eq(VT.Size, 96), VReg_96, - !if(!eq(VT.Size, 64), VReg_64, - !if(!eq(VT.Size, 48), VReg_64, - !if(!eq(VT.Size, 16), - !if(IsFake16, VGPR_32_Lo128, VGPR_16_Lo128), - VGPR_32))))); - - RegisterOperand op = !if (!and(!eq(VT.Size, 16), !not(IsFake16)), - VGPRSrc_16_Lo128, RegisterOperand<ret>); +class getVregSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> { + RegisterOperand ret = + !if (!eq(VT.Size, 128), RegisterOperand<VReg_128>, + !if (!eq(VT.Size, 96), RegisterOperand<VReg_96>, + !if (!eq(VT.Size, 64), RegisterOperand<VReg_64>, + !if (!eq(VT.Size, 48), RegisterOperand<VReg_64>, + !if (!eq(VT.Size, 16), + !if (IsTrue16, + !if (IsFake16, VGPRSrc_32_Lo128, VGPRSrc_16_Lo128), + RegisterOperand<VGPR_32>), + RegisterOperand<VGPR_32>))))); } class getSDWASrcForVT <ValueType VT> { - bit isFP = isFloatType<VT>.ret; RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32); RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32); - RegisterOperand ret = !if(isFP, retFlt, retInt); + RegisterOperand ret = !if(VT.isFP, retFlt, retInt); } // Returns the register class to use for sources of VOP3 instructions for the // given VT. 
class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> { - bit isFP = isFloatType<VT>.ret; RegisterOperand ret = !if(!eq(VT.Size, 128), VRegSrc_128, !if(!eq(VT.Size, 64), - !if(isFP, + !if(VT.isFP, !if(!eq(VT.Value, v2f32.Value), VSrc_v2f32, VSrc_f64), @@ -1575,12 +1551,12 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> { VSrc_b64)), !if(!eq(VT.Value, i1.Value), SSrc_i1, - !if(isFP, - !if(!eq(VT.Value, f16.Value), + !if(VT.isFP, + !if(!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), !if(IsTrue16, VSrcT_f16, VSrc_f16), - !if(!eq(VT.Value, v2f16.Value), + !if(!or(!eq(VT.Value, v2f16.Value), !eq(VT.Value, v2bf16.Value)), VSrc_v2f16, - !if(!eq(VT.Value, v4f16.Value), + !if(!or(!eq(VT.Value, v4f16.Value), !eq(VT.Value, v4bf16.Value)), AVSrc_64, VSrc_f32 ) @@ -1601,12 +1577,11 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> { // Src2 of VOP3 DPP instructions cannot be a literal class getVOP3DPPSrcForVT<ValueType VT> { - bit isFP = isFloatType<VT>.ret; RegisterOperand ret = !if (!eq(VT.Value, i1.Value), SSrc_i1, - !if (isFP, - !if (!eq(VT.Value, f16.Value), VCSrc_f16, - !if (!eq(VT.Value, v2f16.Value), VCSrc_v2f16, VCSrc_f32)), + !if (VT.isFP, + !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), VCSrc_f16, + !if (!or(!eq(VT.Value, v2f16.Value), !eq(VT.Value, v2bf16.Value)), VCSrc_v2f16, VCSrc_f32)), !if (!eq(VT.Value, i16.Value), VCSrc_b16, !if (!eq(VT.Value, v2i16.Value), VCSrc_v2b16, VCSrc_b32)))); @@ -1615,64 +1590,64 @@ class getVOP3DPPSrcForVT<ValueType VT> { // Float or packed int class isModifierType<ValueType SrcVT> { bit ret = !or(!eq(SrcVT.Value, f16.Value), + !eq(SrcVT.Value, bf16.Value), !eq(SrcVT.Value, f32.Value), !eq(SrcVT.Value, f64.Value), !eq(SrcVT.Value, v2f16.Value), !eq(SrcVT.Value, v2i16.Value), + !eq(SrcVT.Value, v2bf16.Value), !eq(SrcVT.Value, v2f32.Value), !eq(SrcVT.Value, v2i32.Value), !eq(SrcVT.Value, v4f16.Value), !eq(SrcVT.Value, v4i16.Value), + !eq(SrcVT.Value, v4bf16.Value), !eq(SrcVT.Value, v4f32.Value), !eq(SrcVT.Value, v4i32.Value), !eq(SrcVT.Value, v8f16.Value), !eq(SrcVT.Value, v8i16.Value), + !eq(SrcVT.Value, v8bf16.Value), !eq(SrcVT.Value, v8f32.Value), !eq(SrcVT.Value, v8i32.Value), !eq(SrcVT.Value, v16f16.Value), - !eq(SrcVT.Value, v16i16.Value)); + !eq(SrcVT.Value, v16i16.Value), + !eq(SrcVT.Value, v16bf16.Value)); } // Return type of input modifiers operand for specified input operand class getSrcMod <ValueType VT, bit IsTrue16 = 0> { - bit isFP = isFloatType<VT>.ret; - bit isPacked = isPackedType<VT>.ret; Operand ret = !if(!eq(VT.Size, 64), - !if(isFP, FP64InputMods, Int64InputMods), + !if(VT.isFP, FP64InputMods, Int64InputMods), !if(!eq(VT.Size, 16), - !if(isFP, !if(IsTrue16, FPT16InputMods, FP16InputMods), - !if(IsTrue16, IntT16InputMods, IntOpSelMods)), - !if(isFP, FP32InputMods, Int32InputMods))); + !if(VT.isFP, !if(IsTrue16, FPT16InputMods, FP16InputMods), + !if(IsTrue16, IntT16InputMods, IntOpSelMods)), + !if(VT.isFP, FP32InputMods, Int32InputMods))); } class getOpSelMod <ValueType VT> { - Operand ret = !if(!eq(VT.Value, f16.Value), FP16InputMods, IntOpSelMods); + Operand ret = !if(!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + FP16InputMods, IntOpSelMods); } // Return type of input modifiers operand specified input operand for DPP class getSrcModDPP <ValueType VT> { - bit isFP = isFloatType<VT>.ret; - Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); + Operand ret = !if(VT.isFP, FPVRegInputMods, IntVRegInputMods); } -class getSrcModDPP_t16 <ValueType VT> { - bit isFP = 
isFloatType<VT>.ret; +class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> { Operand ret = - !if (isFP, - !if (!eq(VT.Value, f16.Value), FPT16VRegInputMods, - FPVRegInputMods), - !if (!eq(VT.Value, i16.Value), IntT16VRegInputMods, - IntVRegInputMods)); + !if (VT.isFP, + !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + FPT16VRegInputMods<IsFake16>, FPVRegInputMods), + !if (!eq(VT.Value, i16.Value), + IntT16VRegInputMods<IsFake16>, IntVRegInputMods)); } // Return type of input modifiers operand for specified input operand for DPP class getSrcModVOP3DPP <ValueType VT> { - bit isFP = isFloatType<VT>.ret; - bit isPacked = isPackedType<VT>.ret; Operand ret = - !if (isFP, - !if (!eq(VT.Value, f16.Value), FP16VCSrcInputMods, - FP32VCSrcInputMods), + !if (VT.isFP, + !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), + FP16VCSrcInputMods, FP32VCSrcInputMods), Int32VCSrcInputMods); } @@ -1681,7 +1656,8 @@ class getSrcModSDWA <ValueType VT> { Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods, !if(!eq(VT.Value, f32.Value), FP32SDWAInputMods, !if(!eq(VT.Value, i16.Value), Int16SDWAInputMods, - Int32SDWAInputMods))); + !if(!eq(VT.Value, bf16.Value), FP16SDWAInputMods, + Int32SDWAInputMods)))); } // Returns the input arguments for VOP[12C] instructions for the given SrcVT. @@ -1806,10 +1782,9 @@ class getInsVOP3OpSel <RegisterOperand Src0RC, RegisterOperand Src1RC, Src0Mod, Src1Mod, Src2Mod, /*HasOpSel=*/1>.ret; } -class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, - RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, - Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld> { - +class getInsDPPBase <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand Src1RC, + RegisterOperand Src2RC, int NumSrcArgs, bit HasModifiers, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld> { dag ret = !if(!eq(NumSrcArgs, 0), // VOP1 without input operands (V_NOP) (ins ), @@ -1849,8 +1824,8 @@ class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass ); } -class getInsDPP <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, - RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, +class getInsDPP <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand Src1RC, + RegisterOperand Src2RC, int NumSrcArgs, bit HasModifiers, Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> { dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs, HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret, @@ -1858,17 +1833,17 @@ class getInsDPP <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1 bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)); } -class getInsDPP16 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, - RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, - Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> { +class getInsDPP16 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand Src1RC, + RegisterOperand Src2RC, int NumSrcArgs, bit HasModifiers, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> { dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs, HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret, (ins FI:$fi)); } -class getInsDPP8 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, - RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, - Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> { +class getInsDPP8 
<RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand Src1RC, + RegisterOperand Src2RC, int NumSrcArgs, bit HasModifiers, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> { dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs, HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret, (ins dpp8:$dpp8, FI:$fi)); @@ -2273,13 +2248,13 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { field RegisterOperand DstRCVOP3DPP = DstRC64; field RegisterOperand DstRCSDWA = getSDWADstForVT<DstVT>.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT, IsTrue16>.ret; - field RegisterOperand Src1RC32 = RegisterOperand<getVregSrcForVT<Src1VT>.ret>; + field RegisterOperand Src1RC32 = getVregSrcForVT<Src1VT>.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret; field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; - field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret; - field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret; - field RegisterClass Src2DPP = getVregSrcForVT<Src2VT>.ret; + field RegisterOperand Src0DPP = getVregSrcForVT<Src0VT>.ret; + field RegisterOperand Src1DPP = getVregSrcForVT<Src1VT>.ret; + field RegisterOperand Src2DPP = getVregSrcForVT<Src2VT>.ret; field RegisterOperand Src0VOP3DPP = VGPRSrc_32; field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret; field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret; @@ -2313,9 +2288,9 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { field bit HasSrc1 = !ne(Src1VT.Value, untyped.Value); field bit HasSrc2 = !ne(Src2VT.Value, untyped.Value); - field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret; - field bit HasSrc1FloatMods = isFloatType<Src1VT>.ret; - field bit HasSrc2FloatMods = isFloatType<Src2VT>.ret; + field bit HasSrc0FloatMods = Src0VT.isFP; + field bit HasSrc1FloatMods = Src1VT.isFP; + field bit HasSrc2FloatMods = Src2VT.isFP; field bit HasSrc0IntMods = isIntType<Src0VT>.ret; field bit HasSrc1IntMods = isIntType<Src1VT>.ret; @@ -2323,16 +2298,16 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { field bit HasClamp = !or(isModifierType<Src0VT>.ret, EnableClamp); field bit HasSDWAClamp = EmitDst; - field bit HasFPClamp = !and(isFloatType<DstVT>.ret, HasClamp); - field bit HasIntClamp = !if(isFloatType<DstVT>.ret, 0, HasClamp); + field bit HasFPClamp = !and(DstVT.isFP, HasClamp); + field bit HasIntClamp = !if(DstVT.isFP, 0, HasClamp); field bit HasClampLo = HasClamp; - field bit HasClampHi = !and(isPackedType<DstVT>.ret, HasClamp); + field bit HasClampHi = !and(DstVT.isVector, HasClamp); field bit HasHigh = 0; - field bit IsPacked = isPackedType<Src0VT>.ret; + field bit IsPacked = Src0VT.isVector; field bit HasOpSel = IsPacked; - field bit HasOMod = !if(IsVOP3P, 0, isFloatType<DstVT>.ret); - field bit HasSDWAOMod = isFloatType<DstVT>.ret; + field bit HasOMod = !if(IsVOP3P, 0, DstVT.isFP); + field bit HasSDWAOMod = DstVT.isFP; field bit HasModifiers = !or(isModifierType<Src0VT>.ret, isModifierType<Src1VT>.ret, @@ -2465,13 +2440,13 @@ class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> { let DstRC = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 0 /*IsVOP3Encoding*/>.ret; let DstRC64 = getVALUDstForVT<DstVT>.ret; let Src0RC32 = getVOPSrc0ForVT<Src0VT, 1 /*IsTrue16*/, 0 /*IsFake16*/>.ret; - let Src1RC32 = getVregSrcForVT_t16<Src1VT, 0 /*IsFake16*/>.op; - let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret; - let Src1DPP = 
getVregSrcForVT_t16<Src1VT>.ret; - let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret; - let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret; - let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret; - let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret; + let Src1RC32 = getVregSrcForVT<Src1VT, 1 /*IsTrue16*/, 0 /*IsFake16*/>.ret; + let Src0DPP = getVregSrcForVT<Src0VT, 1 /*IsTrue16*/, 0 /*IsFake16*/>.ret; + let Src1DPP = getVregSrcForVT<Src1VT, 1 /*IsTrue16*/, 0 /*IsFake16*/>.ret; + let Src2DPP = getVregSrcForVT<Src2VT, 1 /*IsTrue16*/, 0 /*IsFake16*/>.ret; + let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0 /*IsFake16*/>.ret; + let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0 /*IsFake16*/>.ret; + let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0 /*IsFake16*/>.ret; let DstRC64 = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret; let Src0RC64 = getVOP3SrcForVT<Src0VT, 1 /*IsTrue16*/>.ret; @@ -2487,10 +2462,10 @@ class VOPProfile_Fake16<VOPProfile P> : VOPProfile<P.ArgVT> { // Most DstVT are 16-bit, but not all let DstRC = getVALUDstForVT_fake16<DstVT>.ret; let DstRC64 = getVALUDstForVT<DstVT>.ret; - let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>; - let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret; - let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret; - let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret; + let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret; let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret; let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td index b4bd46d33c1f..788e3162fb37 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1862,7 +1862,10 @@ class ClampPat<Instruction inst, ValueType vt> : GCNPat < >; def : ClampPat<V_MAX_F32_e64, f32>; +let SubtargetPredicate = isNotGFX12Plus in def : ClampPat<V_MAX_F64_e64, f64>; +let SubtargetPredicate = isGFX12Plus in +def : ClampPat<V_MAX_NUM_F64_e64, f64>; let SubtargetPredicate = NotHasTrue16BitInsts in def : ClampPat<V_MAX_F16_e64, f16>; let SubtargetPredicate = UseRealTrue16Insts in @@ -2990,10 +2993,12 @@ def : GCNPat< } // TODO: Handle fneg like other types. 
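
Aside: the ClampPat additions above are purely a retargeting; on GFX12+ the f64 clamp pattern selects V_MAX_NUM_F64_e64 where older targets keep V_MAX_F64_e64, and what is being matched (AMDGPUclamp) does not change. As a reminder, the float clamp output modifier saturates to [0.0, 1.0]; a rough scalar model, noting that NaN behaviour is defined by the ISA and not captured here:

    #include <algorithm>

    // Ordinary-case model of the float "clamp" output modifier.
    double clamp01(double X) { return std::min(std::max(X, 0.0), 1.0); }
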
+let SubtargetPredicate = isNotGFX12Plus in { def : GCNPat< (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), (V_MUL_F64_e64 0, CONST.FP64_ONE, $src_mods, $src) >; +} } // End AddedComplexity = -5 multiclass SelectCanonicalizeAsMax< @@ -3009,7 +3014,13 @@ multiclass SelectCanonicalizeAsMax< def : GCNPat< (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), (V_MAX_F64_e64 $src_mods, $src, $src_mods, $src)> { - let OtherPredicates = f64_preds; + let OtherPredicates = !listconcat(f64_preds, [isNotGFX12Plus]); + } + + def : GCNPat< + (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), + (V_MAX_NUM_F64_e64 $src_mods, $src, $src_mods, $src)> { + let OtherPredicates = !listconcat(f64_preds, [isGFX12Plus]); } def : GCNPat< @@ -3856,11 +3867,13 @@ def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_FADD_BF16 : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction; @@ -3877,7 +3890,8 @@ def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction { // Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as // a workaround for the intrinsic being defined as readnone, but // really needs a memory operand. -def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction { + +class SBufferLoadInstruction : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy); let hasSideEffects = 0; @@ -3885,6 +3899,12 @@ def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction { let mayStore = 0; } +def G_AMDGPU_S_BUFFER_LOAD : SBufferLoadInstruction; +def G_AMDGPU_S_BUFFER_LOAD_SBYTE : SBufferLoadInstruction; +def G_AMDGPU_S_BUFFER_LOAD_UBYTE : SBufferLoadInstruction; +def G_AMDGPU_S_BUFFER_LOAD_SSHORT : SBufferLoadInstruction; +def G_AMDGPU_S_BUFFER_LOAD_USHORT : SBufferLoadInstruction; + def G_AMDGPU_S_MUL_U64_U32 : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$src0, type0:$src1); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 0ba7792ac436..4b13825040eb 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -332,7 +332,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { } bool MadeChange = false; - bool NewReservedRegs = false; bool SpilledToVirtVGPRLanes = false; // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be @@ -369,8 +368,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // regalloc aware CFI generation to insert new CFIs along with the // intermediate spills is implemented. There is no such support // currently exist in the LLVM compiler. 
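
Aside: the SILowerSGPRSpills and SIMachineFunctionInfo hunks beginning above and continuing below rename the flag passed to allocateSGPRSpillToVGPRLane to SpillToPhysVGPRLane and thread IsPrologEpilog through separately, but the lane packing itself is unchanged: each spilled 32-bit SGPR takes one lane of a VGPR, so a single VGPR holds a wave-size worth of spill lanes before another is needed (the NumSpillLanes % WaveSize computation shown further down). A standalone sketch of that arithmetic, with invented names:

    #include <utility>

    // Given how many spill lanes have been handed out so far and the wave size
    // (32 or 64), return which spill VGPR and which lane within it the next
    // spilled SGPR occupies.
    std::pair<unsigned, unsigned> nextSpillSlot(unsigned NumSpillLanes, unsigned WaveSize) {
      return {NumSpillLanes / WaveSize,  // index into the list of spill VGPRs
              NumSpillLanes % WaveSize}; // lane within that VGPR
    }
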
- if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) { - NewReservedRegs = true; + if (FuncInfo->allocateSGPRSpillToVGPRLane( + MF, FI, /*SpillToPhysVGPRLane=*/true)) { bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( MI, FI, nullptr, Indexes, LIS, true); if (!Spilled) @@ -442,12 +441,5 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { SaveBlocks.clear(); RestoreBlocks.clear(); - // Updated the reserved registers with any physical VGPRs added for SGPR - // spills. - if (NewReservedRegs) { - for (Register Reg : FuncInfo->getWWMReservedRegs()) - MRI.reserveReg(Reg, TRI); - } - return MadeChange; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index e8142244b7db..b94d143a75e5 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -312,6 +312,33 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs, return false; } +void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange( + MachineFunction &MF) { + const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (unsigned I = 0, E = SpillPhysVGPRs.size(); I < E; ++I) { + Register Reg = SpillPhysVGPRs[I]; + Register NewReg = + TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + if (!NewReg || NewReg >= Reg) + break; + + MRI.replaceRegWith(Reg, NewReg); + + // Update various tables with the new VGPR. + SpillPhysVGPRs[I] = NewReg; + WWMReservedRegs.remove(Reg); + WWMReservedRegs.insert(NewReg); + WWMSpills.insert(std::make_pair(NewReg, WWMSpills[Reg])); + WWMSpills.erase(Reg); + + for (MachineBasicBlock &MBB : MF) { + MBB.removeLiveIn(Reg); + MBB.sortUniqueLiveIns(); + } + } +} + bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills( MachineFunction &MF, int FI, unsigned LaneIndex) { MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -329,13 +356,17 @@ bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills( } bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills( - MachineFunction &MF, int FI, unsigned LaneIndex) { + MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); Register LaneVGPR; if (!LaneIndex) { - LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + // Find the highest available register if called before RA to ensure the + // lowest registers are available for allocation. The LaneVGPR, in that + // case, will be shifted back to the lowest range after VGPR allocation. + LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF, + !IsPrologEpilog); if (LaneVGPR == AMDGPU::NoRegister) { // We have no VGPRs left for spilling SGPRs. Reset because we will not // partially spill the SGPR to VGPRs. @@ -359,12 +390,12 @@ bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills( return true; } -bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF, - int FI, - bool IsPrologEpilog) { +bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane( + MachineFunction &MF, int FI, bool SpillToPhysVGPRLane, + bool IsPrologEpilog) { std::vector<SIRegisterInfo::SpilledReg> &SpillLanes = - IsPrologEpilog ? 
SGPRSpillsToPhysicalVGPRLanes[FI] - : SGPRSpillsToVirtualVGPRLanes[FI]; + SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI] + : SGPRSpillsToVirtualVGPRLanes[FI]; // This has already been allocated. if (!SpillLanes.empty()) @@ -384,14 +415,15 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF, assert(ST.getRegisterInfo()->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs"); - unsigned &NumSpillLanes = - IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes; + unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes + : NumVirtualVGPRSpillLanes; for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) { unsigned LaneIndex = (NumSpillLanes % WaveSize); - bool Allocated = IsPrologEpilog - ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex) + bool Allocated = SpillToPhysVGPRLane + ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex, + IsPrologEpilog) : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex); if (!Allocated) { NumSpillLanes -= I; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index dc63ae44c528..9ff66a094f99 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -548,7 +548,8 @@ private: bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI, unsigned LaneIndex); bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI, - unsigned LaneIndex); + unsigned LaneIndex, + bool IsPrologEpilog); public: Register getVGPRForAGPRCopy() const { @@ -588,6 +589,7 @@ public: } ArrayRef<Register> getSGPRSpillVGPRs() const { return SpillVGPRs; } + const WWMSpillsMap &getWWMSpills() const { return WWMSpills; } const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; } @@ -702,7 +704,12 @@ public: I->second.IsDead = true; } + // To bring the Physical VGPRs in the highest range allocated for CSR SGPR + // spilling into the lowest available range. + void shiftSpillPhysVGPRsToLowestRange(MachineFunction &MF); + bool allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI, + bool SpillToPhysVGPRLane = false, bool IsPrologEpilog = false); bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); @@ -1041,22 +1048,6 @@ public: return WavesPerEU.second; } - /// \returns SGPR used for \p Dim's work group ID. - Register getWorkGroupIDSGPR(unsigned Dim) const { - switch (Dim) { - case 0: - assert(hasWorkGroupIDX()); - return ArgInfo.WorkGroupIDX.getRegister(); - case 1: - assert(hasWorkGroupIDY()); - return ArgInfo.WorkGroupIDY.getRegister(); - case 2: - assert(hasWorkGroupIDZ()); - return ArgInfo.WorkGroupIDZ.getRegister(); - } - llvm_unreachable("unexpected dimension"); - } - const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM) { return &GWSResourcePSV; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 6d749ad1ad24..84b9330ef963 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -579,11 +579,30 @@ public: }; class SIGfx12CacheControl : public SIGfx11CacheControl { +protected: + // Sets TH policy to \p Value if CPol operand is present in instruction \p MI. + // \returns Returns true if \p MI is modified, false otherwise. 
+ bool setTH(const MachineBasicBlock::iterator MI, + AMDGPU::CPol::CPol Value) const; + // Sets Scope policy to \p Value if CPol operand is present in instruction \p + // MI. \returns Returns true if \p MI is modified, false otherwise. + bool setScope(const MachineBasicBlock::iterator MI, + AMDGPU::CPol::CPol Value) const; + public: SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} + bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsCrossAddrSpaceOrdering, Position Pos) const override; + bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; + + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, + bool IsNonTemporal) const override; }; class SIMemoryLegalizer final : public MachineFunctionPass { @@ -2142,6 +2161,132 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( return Changed; } +bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, + AMDGPU::CPol::CPol Value) const { + MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); + if (!CPol) + return false; + + uint64_t NewTH = Value & AMDGPU::CPol::TH; + if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) { + CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH); + return true; + } + + return false; +} + +bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI, + AMDGPU::CPol::CPol Value) const { + MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); + if (!CPol) + return false; + + uint64_t NewScope = Value & AMDGPU::CPol::SCOPE; + if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) { + CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope); + return true; + } + + return false; +} + +bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + bool LOADCnt = false; + bool DSCnt = false; + bool STORECnt = false; + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != + SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) + LOADCnt |= true; + if ((Op & SIMemOp::STORE) != SIMemOp::NONE) + STORECnt |= true; + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore need to wait for operations to complete to ensure + // they are visible to waves in the other CU as the L0 is per CU. + // Otherwise in CU mode and all waves of a work-group are on the same CU + // which shares the same L0. + if (!ST.isCuModeEnabled()) { + if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) + LOADCnt |= true; + if ((Op & SIMemOp::STORE) != SIMemOp::NONE) + STORECnt |= true; + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The L0 cache keeps all memory operations in order for + // work-items in the same wavefront. 
+ break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is + // not needed as LDS operations for all waves are executed in a total + // global ordering as observed by all waves. Required if also + // synchronizing with global/GDS memory as LDS operations could be + // reordered with respect to later global/GDS memory operations of the + // same wave. + DSCnt |= IsCrossAddrSpaceOrdering; + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The LDS keeps all memory operations in order for + // the same wavefront. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if (LOADCnt) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0); + Changed = true; + } + + if (STORECnt) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0); + Changed = true; + } + + if (DSCnt) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0); + Changed = true; + } + + if (Pos == Position::AFTER) + --MI; + + return Changed; +} + bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, @@ -2198,6 +2343,41 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, return true; } +bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( + MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, bool IsNonTemporal) const { + + // Only handle load and store, not atomic read-modify-write instructions. + assert(MI->mayLoad() ^ MI->mayStore()); + + // Only update load and store, not LLVM IR atomic read-modify-write + // instructions. The latter are always marked as volatile so cannot sensibly + // handle it as do not want to pessimize all atomics. Also they do not support + // the nontemporal attribute. + assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); + + bool Changed = false; + + if (IsVolatile) { + Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); + + // Ensure operation has completed at system scope to cause all volatile + // operations to be visible outside the program in a global order. Do not + // request cross address space as only the global address space can be + // observable outside the program, so no need to cause a waitcnt for LDS + // address space operations. + Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, + Position::AFTER); + } + + if (IsNonTemporal) { + // Set non-temporal hint for all cache levels. 
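
Aside: setTH and setScope above are both masked read-modify-writes on the instruction's cache-policy immediate — clear the TH (temporal hint) or SCOPE field, OR in the requested value, and report whether anything changed — and the volatile/nontemporal handling here (continuing just below with TH_NT) is built from those two helpers plus insertWait. A field-update sketch with placeholder masks standing in for AMDGPU::CPol::TH and AMDGPU::CPol::SCOPE:

    #include <cstdint>

    constexpr uint64_t TH_MASK    = 0x7u;      // placeholder field layout
    constexpr uint64_t SCOPE_MASK = 0x3u << 3; // placeholder field layout

    // Returns true iff the immediate actually changed, mirroring setTH/setScope.
    bool setField(uint64_t &CPol, uint64_t FieldMask, uint64_t Value) {
      uint64_t New = (CPol & ~FieldMask) | (Value & FieldMask);
      if (New == CPol)
        return false;
      CPol = New;
      return true;
    }
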
+ Changed |= setTH(MI, AMDGPU::CPol::TH_NT); + } + + return Changed; +} + bool SIMemoryLegalizer::removeAtomicPseudoMIs() { if (AtomicPseudoMIs.empty()) return false; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegister.cpp index be395d53c34e..e62ad026dc5c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegister.cpp @@ -285,7 +285,7 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB, 1; unsigned Offset = (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_; - unsigned Mask = ((1 << Width) - 1) << Offset; + unsigned Mask = maskTrailingOnes<unsigned>(Width) << Offset; // If an InsertionPoint is set we will insert a setreg there. if (InsertionPoint) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index a93cf5cad411..a2cacb5cbaa3 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1657,8 +1657,12 @@ void SIRegisterInfo::buildSpillLoadStore( } else { MIB.addReg(SOffset, SOffsetRegState); } - MIB.addImm(Offset + RegOffset) - .addImm(0); // cpol + + MIB.addImm(Offset + RegOffset); + + bool LastUse = MMO->getFlags() & MOLastUse; + MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol + if (!IsFlat) MIB.addImm(0); // swz MIB.addMemOperand(NewMMO); @@ -2241,6 +2245,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), RS->isRegUsed(AMDGPU::SCC)); } + buildSpillLoadStore( *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td index fc29ce8d71f2..9a27d22d585e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -810,7 +810,7 @@ def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>; // Scalar Memory Patterns //===----------------------------------------------------------------------===// -def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]> { +class SMRDLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{ return isUniformLoad(N);}]> { let GISelPredicateCode = [{ if (!MI.hasOneMemOperand()) return false; @@ -827,6 +827,14 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL }]; } +def smrd_load : SMRDLoadPat<load>; +def smrd_extloadi8 : SMRDLoadPat<extloadi8>; +def smrd_zextloadi8 : SMRDLoadPat<zextloadi8>; +def smrd_sextloadi8 : SMRDLoadPat<sextloadi8>; +def smrd_extloadi16 : SMRDLoadPat<extloadi16>; +def smrd_zextloadi16 : SMRDLoadPat<zextloadi16>; +def smrd_sextloadi16 : SMRDLoadPat<sextloadi16>; + def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), (prefetch node:$ptr, node:$rw, node:$loc, node:$type), [{ return !N->getOperand(1)->isDivergent();}]> { @@ -923,11 +931,78 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> { } } +multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, ValueType vt> { + // 1. 
IMM offset + def : GCNPat < + (node (SMRDImm i64:$sbase, i32:$offset)), + (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))>{ + let OtherPredicates = [isGFX12Plus]; + } + + // 2. SGPR offset + def : GCNPat < + (node (SMRDSgpr i64:$sbase, i32:$soffset)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{ + let OtherPredicates = [isGFX12Plus]; + } + + // 3. SGPR+IMM offset + def : GCNPat < + (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{ + let OtherPredicates = [isGFX12Plus]; + } + + // 4. No offset + def : GCNPat < + (vt (node (i64 SReg_64:$sbase))), + (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))>{ + let OtherPredicates = [isGFX12Plus]; + } +} + +multiclass ScalarBufferLoadIntrinsicPat <SDPatternOperator name, string Instr> { + + // 1. Offset as an immediate + def : GCNPat < + (name v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy), + (i32 (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_cpol $cachepolicy)))> { + let OtherPredicates = [isGFX12Plus]; + } + + // 2. Offset as an 32-bit SGPR + def : GCNPat < + (name v4i32:$sbase, i32:$soffset, timm:$cachepolicy), + (i32 (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, 0, (extract_cpol $cachepolicy)))> { + let OtherPredicates = [isGFX12Plus]; + } + + // 3. Offset as an 32-bit SGPR + immediate + def : GCNPat < + (name v4i32:$sbase, (SMRDBufferSgprImm i32:$soffset, i32:$offset), + timm:$cachepolicy), + (i32 (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, i32imm:$offset, + (extract_cpol $cachepolicy)))> { + let OtherPredicates = [isGFX12Plus]; + } +} + // Global and constant loads can be selected to either MUBUF or SMRD // instructions, but SMRD instructions are faster so we want the instruction // selector to prefer those. let AddedComplexity = 100 in { +defm : ScalarLoadWithExtensionPat <"S_LOAD_U8", smrd_extloadi8, i32>; +defm : ScalarLoadWithExtensionPat <"S_LOAD_U8", smrd_zextloadi8, i32>; +defm : ScalarLoadWithExtensionPat <"S_LOAD_I8", smrd_sextloadi8, i32>; +defm : ScalarLoadWithExtensionPat <"S_LOAD_U16", smrd_extloadi16, i32>; +defm : ScalarLoadWithExtensionPat <"S_LOAD_U16", smrd_zextloadi16, i32>; +defm : ScalarLoadWithExtensionPat <"S_LOAD_I16", smrd_sextloadi16, i32>; +defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_byte, "S_BUFFER_LOAD_I8">; +defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_ubyte, "S_BUFFER_LOAD_U8">; +defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_short, "S_BUFFER_LOAD_I16">; +defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_ushort, "S_BUFFER_LOAD_U16">; + foreach vt = Reg32Types.types in { defm : SMRD_Pattern <"S_LOAD_DWORD", vt>; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td index 46fa3d57a21c..ae5ef0541929 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -292,8 +292,11 @@ def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64", [], 1>; def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32", [], 1>; def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64", [], 1>; +def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">; +// PSEUDO includes a workaround for a hardware anomaly where some ASICs +// zero-extend the result from 48 bits instead of sign-extending. 
let isReMaterializable = 1 in -def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64", +def S_GETPC_B64_pseudo : SOP1_64_0 <"s_getpc_b64", [(set i64:$sdst, (int_amdgcn_s_getpc))] >; @@ -502,8 +505,6 @@ def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs), (ins SplitBarrier:$src0), "$src0", []>{ let SchedRW = [WriteBarrier]; let isConvergent = 1; - - } } // End has_sdst = 0 @@ -1124,7 +1125,7 @@ class S_SETREG_B32_Pseudo <list<dag> pattern=[]> : SOPK_Pseudo < pattern>; def S_SETREG_B32 : S_SETREG_B32_Pseudo < - [(int_amdgcn_s_setreg (i32 timm:$simm16), i32:$sdst)]> { + [(int_amdgcn_s_setreg (i32 SIMM16bit:$simm16), i32:$sdst)]> { // Use custom inserter to optimize some cases to // S_DENORM_MODE/S_ROUND_MODE/S_SETREG_B32_mode. let usesCustomInserter = 1; @@ -1597,6 +1598,13 @@ def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16", // that doesn't access memory. def S_WAITCNT_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">; def S_WAITCNT_VSCNT_soft : SOPK_WAITCNT<"s_soft_waitcnt_vscnt">; +let SubtargetPredicate = isGFX12Plus in { + def S_WAIT_LOADCNT_soft : SOPP_Pseudo <"s_soft_wait_loadcnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_STORECNT_soft : SOPP_Pseudo <"s_soft_wait_storecnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_SAMPLECNT_soft : SOPP_Pseudo <"s_soft_wait_samplecnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_BVHCNT_soft : SOPP_Pseudo <"s_soft_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">; +} def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; @@ -1712,23 +1720,30 @@ let SubtargetPredicate = HasVGPRSingleUseHintInsts in { let SubtargetPredicate = isGFX12Plus, hasSideEffects = 1 in { def S_WAIT_LOADCNT : - SOPP_Pseudo<"s_wait_loadcnt", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_wait_loadcnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_loadcnt timm:$simm16)]>; def S_WAIT_LOADCNT_DSCNT : SOPP_Pseudo<"s_wait_loadcnt_dscnt", (ins s16imm:$simm16), "$simm16">; def S_WAIT_STORECNT : - SOPP_Pseudo<"s_wait_storecnt", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_wait_storecnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_storecnt timm:$simm16)]>; def S_WAIT_STORECNT_DSCNT : SOPP_Pseudo<"s_wait_storecnt_dscnt", (ins s16imm:$simm16), "$simm16">; def S_WAIT_SAMPLECNT : - SOPP_Pseudo<"s_wait_samplecnt", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_wait_samplecnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_samplecnt timm:$simm16)]>; def S_WAIT_BVHCNT : - SOPP_Pseudo<"s_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_wait_bvhcnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_bvhcnt timm:$simm16)]>; def S_WAIT_EXPCNT : - SOPP_Pseudo<"s_wait_expcnt", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_wait_expcnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_expcnt timm:$simm16)]>; def S_WAIT_DSCNT : - SOPP_Pseudo<"s_wait_dscnt", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_wait_dscnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_dscnt timm:$simm16)]>; def S_WAIT_KMCNT : - SOPP_Pseudo<"s_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_wait_kmcnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_kmcnt timm:$simm16)]>; } // End SubtargetPredicate = isGFX12Plus, hasSideEffects = 1 
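
Note (illustrative, not part of the patch): the definitions above hook the gfx12 wait-counter instructions up to their intrinsics and add "_soft" placeholder pseudos, which are intended to be emitted by passes that need a wait for ordering purposes and later promoted to real waits or dropped by the wait-count insertion pass. A minimal sketch of emitting one such placeholder from a machine pass follows; the helper name is hypothetical and the first include is assumed to be the in-tree header that exposes the generated AMDGPU opcode enum.

    // Sketch only: emit a soft loadcnt wait that a later pass (e.g.
    // SIInsertWaitcnts) may strengthen into a real S_WAIT_LOADCNT or delete.
    #include "MCTargetDesc/AMDGPUMCTargetDesc.h" // assumed: provides AMDGPU::* opcodes
    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineInstrBuilder.h"
    #include "llvm/CodeGen/TargetInstrInfo.h"
    #include "llvm/IR/DebugLoc.h"

    static void emitSoftLoadcntWait(llvm::MachineBasicBlock &MBB,
                                    llvm::MachineBasicBlock::iterator InsertPt,
                                    const llvm::DebugLoc &DL,
                                    const llvm::TargetInstrInfo &TII, // SIInstrInfo in practice
                                    unsigned Count) {
      llvm::BuildMI(MBB, InsertPt, DL, TII.get(llvm::AMDGPU::S_WAIT_LOADCNT_soft))
          .addImm(Count); // raw loadcnt immediate; 0 waits for all outstanding loads
    }
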
//===----------------------------------------------------------------------===// @@ -1768,10 +1783,10 @@ def : GCNPat< (S_SEXT_I32_I16 $src) >; -def : GCNPat < - (int_amdgcn_s_wait_event_export_ready), - (S_WAIT_EVENT (i16 0)) ->; +let SubtargetPredicate = isNotGFX12Plus in + def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 0))>; +let SubtargetPredicate = isGFX12Plus in + def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 1))>; // The first 10 bits of the mode register are the core FP mode on all // subtargets. @@ -2610,7 +2625,7 @@ multiclass SOPP_Real_With_Relaxation_gfx11_gfx12<bits<7>op> : defm S_SETKILL : SOPP_Real_32_gfx11_gfx12<0x001>; defm S_SETHALT : SOPP_Real_32_gfx11_gfx12<0x002>; defm S_SLEEP : SOPP_Real_32_gfx11_gfx12<0x003>; -defm S_SET_INST_PREFETCH_DISTANCE : SOPP_Real_32_Renamed_gfx11_gfx12<0x004, S_INST_PREFETCH, "s_set_inst_prefetch_distance">; +defm S_SET_INST_PREFETCH_DISTANCE : SOPP_Real_32_Renamed_gfx11<0x004, S_INST_PREFETCH, "s_set_inst_prefetch_distance">; defm S_CLAUSE : SOPP_Real_32_gfx11_gfx12<0x005>; defm S_DELAY_ALU : SOPP_Real_32_gfx11_gfx12<0x007>; defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx11<0x008>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 26ba2575ff34..0bf9452d822e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -31,10 +31,11 @@ #define GET_INSTRMAP_INFO #include "AMDGPUGenInstrInfo.inc" -static llvm::cl::opt<unsigned> - AmdhsaCodeObjectVersion("amdhsa-code-object-version", llvm::cl::Hidden, - llvm::cl::desc("AMDHSA Code Object Version"), - llvm::cl::init(4)); +static llvm::cl::opt<unsigned> DefaultAMDHSACodeObjectVersion( + "amdhsa-code-object-version", llvm::cl::Hidden, + llvm::cl::init(llvm::AMDGPU::AMDHSA_COV5), + llvm::cl::desc("Set default AMDHSA Code Object Version (module flag " + "or asm directive still take priority if present)")); namespace { @@ -94,6 +95,44 @@ unsigned getVmcntBitWidthHi(unsigned VersionMajor) { return (VersionMajor == 9 || VersionMajor == 10) ? 2 : 0; } +/// \returns Loadcnt bit width +unsigned getLoadcntBitWidth(unsigned VersionMajor) { + return VersionMajor >= 12 ? 6 : 0; +} + +/// \returns Samplecnt bit width. +unsigned getSamplecntBitWidth(unsigned VersionMajor) { + return VersionMajor >= 12 ? 6 : 0; +} + +/// \returns Bvhcnt bit width. +unsigned getBvhcntBitWidth(unsigned VersionMajor) { + return VersionMajor >= 12 ? 3 : 0; +} + +/// \returns Dscnt bit width. +unsigned getDscntBitWidth(unsigned VersionMajor) { + return VersionMajor >= 12 ? 6 : 0; +} + +/// \returns Dscnt bit shift in combined S_WAIT instructions. +unsigned getDscntBitShift(unsigned VersionMajor) { return 0; } + +/// \returns Storecnt or Vscnt bit width, depending on VersionMajor. +unsigned getStorecntBitWidth(unsigned VersionMajor) { + return VersionMajor >= 10 ? 6 : 0; +} + +/// \returns Kmcnt bit width. +unsigned getKmcntBitWidth(unsigned VersionMajor) { + return VersionMajor >= 12 ? 5 : 0; +} + +/// \returns shift for Loadcnt/Storecnt in combined S_WAIT instructions. +unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) { + return VersionMajor >= 12 ? 
8 : 0; +} + /// \returns VmVsrc bit width inline unsigned getVmVsrcBitWidth() { return 3; } @@ -123,45 +162,32 @@ bool isHsaAbi(const MCSubtargetInfo &STI) { return STI.getTargetTriple().getOS() == Triple::AMDHSA; } -std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) { - if (STI && STI->getTargetTriple().getOS() != Triple::AMDHSA) - return std::nullopt; - - switch (AmdhsaCodeObjectVersion) { - case 4: - return ELF::ELFABIVERSION_AMDGPU_HSA_V4; - case 5: - return ELF::ELFABIVERSION_AMDGPU_HSA_V5; - default: - report_fatal_error(Twine("Unsupported AMDHSA Code Object Version ") + - Twine(AmdhsaCodeObjectVersion)); +unsigned getAMDHSACodeObjectVersion(const Module &M) { + if (auto Ver = mdconst::extract_or_null<ConstantInt>( + M.getModuleFlag("amdgpu_code_object_version"))) { + return (unsigned)Ver->getZExtValue() / 100; } -} -bool isHsaAbiVersion4(const MCSubtargetInfo *STI) { - if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI)) - return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V4; - return false; + return getDefaultAMDHSACodeObjectVersion(); } -bool isHsaAbiVersion5(const MCSubtargetInfo *STI) { - if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI)) - return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V5; - return false; +unsigned getDefaultAMDHSACodeObjectVersion() { + return DefaultAMDHSACodeObjectVersion; } -unsigned getAmdhsaCodeObjectVersion() { - return AmdhsaCodeObjectVersion; -} +uint8_t getELFABIVersion(const Triple &T, unsigned CodeObjectVersion) { + if (T.getOS() != Triple::AMDHSA) + return 0; -unsigned getCodeObjectVersion(const Module &M) { - if (auto Ver = mdconst::extract_or_null<ConstantInt>( - M.getModuleFlag("amdgpu_code_object_version"))) { - return (unsigned)Ver->getZExtValue() / 100; + switch (CodeObjectVersion) { + case 4: + return ELF::ELFABIVERSION_AMDGPU_HSA_V4; + case 5: + return ELF::ELFABIVERSION_AMDGPU_HSA_V5; + default: + report_fatal_error("Unsupported AMDHSA Code Object Version " + + Twine(CodeObjectVersion)); } - - // Default code object version. - return AMDHSA_COV4; } unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) { @@ -667,7 +693,7 @@ namespace IsaInfo { AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI) : STI(STI), XnackSetting(TargetIDSetting::Any), - SramEccSetting(TargetIDSetting::Any), CodeObjectVersion(0) { + SramEccSetting(TargetIDSetting::Any) { if (!STI.getFeatureBits().test(FeatureSupportsXNACK)) XnackSetting = TargetIDSetting::Unsupported; if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC)) @@ -779,23 +805,16 @@ std::string AMDGPUTargetID::toString() const { std::string Features; if (STI.getTargetTriple().getOS() == Triple::AMDHSA) { - switch (CodeObjectVersion) { - case AMDGPU::AMDHSA_COV4: - case AMDGPU::AMDHSA_COV5: - // sramecc. - if (getSramEccSetting() == TargetIDSetting::Off) - Features += ":sramecc-"; - else if (getSramEccSetting() == TargetIDSetting::On) - Features += ":sramecc+"; - // xnack. - if (getXnackSetting() == TargetIDSetting::Off) - Features += ":xnack-"; - else if (getXnackSetting() == TargetIDSetting::On) - Features += ":xnack+"; - break; - default: - break; - } + // sramecc. + if (getSramEccSetting() == TargetIDSetting::Off) + Features += ":sramecc-"; + else if (getSramEccSetting() == TargetIDSetting::On) + Features += ":sramecc+"; + // xnack. 
+ if (getXnackSetting() == TargetIDSetting::Off) + Features += ":xnack-"; + else if (getXnackSetting() == TargetIDSetting::On) + Features += ":xnack+"; } StreamRep << Processor << Features; @@ -1229,6 +1248,18 @@ unsigned getVmcntBitMask(const IsaVersion &Version) { 1; } +unsigned getLoadcntBitMask(const IsaVersion &Version) { + return (1 << getLoadcntBitWidth(Version.Major)) - 1; +} + +unsigned getSamplecntBitMask(const IsaVersion &Version) { + return (1 << getSamplecntBitWidth(Version.Major)) - 1; +} + +unsigned getBvhcntBitMask(const IsaVersion &Version) { + return (1 << getBvhcntBitWidth(Version.Major)) - 1; +} + unsigned getExpcntBitMask(const IsaVersion &Version) { return (1 << getExpcntBitWidth(Version.Major)) - 1; } @@ -1237,6 +1268,18 @@ unsigned getLgkmcntBitMask(const IsaVersion &Version) { return (1 << getLgkmcntBitWidth(Version.Major)) - 1; } +unsigned getDscntBitMask(const IsaVersion &Version) { + return (1 << getDscntBitWidth(Version.Major)) - 1; +} + +unsigned getKmcntBitMask(const IsaVersion &Version) { + return (1 << getKmcntBitWidth(Version.Major)) - 1; +} + +unsigned getStorecntBitMask(const IsaVersion &Version) { + return (1 << getStorecntBitWidth(Version.Major)) - 1; +} + unsigned getWaitcntBitMask(const IsaVersion &Version) { unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major), getVmcntBitWidthLo(Version.Major)); @@ -1276,9 +1319,9 @@ void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded) { Waitcnt Decoded; - Decoded.VmCnt = decodeVmcnt(Version, Encoded); + Decoded.LoadCnt = decodeVmcnt(Version, Encoded); Decoded.ExpCnt = decodeExpcnt(Version, Encoded); - Decoded.LgkmCnt = decodeLgkmcnt(Version, Encoded); + Decoded.DsCnt = decodeLgkmcnt(Version, Encoded); return Decoded; } @@ -1313,7 +1356,85 @@ unsigned encodeWaitcnt(const IsaVersion &Version, } unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) { - return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt); + return encodeWaitcnt(Version, Decoded.LoadCnt, Decoded.ExpCnt, Decoded.DsCnt); +} + +static unsigned getCombinedCountBitMask(const IsaVersion &Version, + bool IsStore) { + unsigned Dscnt = getBitMask(getDscntBitShift(Version.Major), + getDscntBitWidth(Version.Major)); + if (IsStore) { + unsigned Storecnt = getBitMask(getLoadcntStorecntBitShift(Version.Major), + getStorecntBitWidth(Version.Major)); + return Dscnt | Storecnt; + } else { + unsigned Loadcnt = getBitMask(getLoadcntStorecntBitShift(Version.Major), + getLoadcntBitWidth(Version.Major)); + return Dscnt | Loadcnt; + } +} + +Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt) { + Waitcnt Decoded; + Decoded.LoadCnt = + unpackBits(LoadcntDscnt, getLoadcntStorecntBitShift(Version.Major), + getLoadcntBitWidth(Version.Major)); + Decoded.DsCnt = unpackBits(LoadcntDscnt, getDscntBitShift(Version.Major), + getDscntBitWidth(Version.Major)); + return Decoded; +} + +Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt) { + Waitcnt Decoded; + Decoded.StoreCnt = + unpackBits(StorecntDscnt, getLoadcntStorecntBitShift(Version.Major), + getStorecntBitWidth(Version.Major)); + Decoded.DsCnt = unpackBits(StorecntDscnt, getDscntBitShift(Version.Major), + getDscntBitWidth(Version.Major)); + return Decoded; +} + +static unsigned encodeLoadcnt(const IsaVersion &Version, unsigned Waitcnt, + unsigned Loadcnt) { + return packBits(Loadcnt, Waitcnt, getLoadcntStorecntBitShift(Version.Major), + 
getLoadcntBitWidth(Version.Major)); +} + +static unsigned encodeStorecnt(const IsaVersion &Version, unsigned Waitcnt, + unsigned Storecnt) { + return packBits(Storecnt, Waitcnt, getLoadcntStorecntBitShift(Version.Major), + getStorecntBitWidth(Version.Major)); +} + +static unsigned encodeDscnt(const IsaVersion &Version, unsigned Waitcnt, + unsigned Dscnt) { + return packBits(Dscnt, Waitcnt, getDscntBitShift(Version.Major), + getDscntBitWidth(Version.Major)); +} + +static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, + unsigned Dscnt) { + unsigned Waitcnt = getCombinedCountBitMask(Version, false); + Waitcnt = encodeLoadcnt(Version, Waitcnt, Loadcnt); + Waitcnt = encodeDscnt(Version, Waitcnt, Dscnt); + return Waitcnt; +} + +unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded) { + return encodeLoadcntDscnt(Version, Decoded.LoadCnt, Decoded.DsCnt); +} + +static unsigned encodeStorecntDscnt(const IsaVersion &Version, + unsigned Storecnt, unsigned Dscnt) { + unsigned Waitcnt = getCombinedCountBitMask(Version, true); + Waitcnt = encodeStorecnt(Version, Waitcnt, Storecnt); + Waitcnt = encodeDscnt(Version, Waitcnt, Dscnt); + return Waitcnt; +} + +unsigned encodeStorecntDscnt(const IsaVersion &Version, + const Waitcnt &Decoded) { + return encodeStorecntDscnt(Version, Decoded.StoreCnt, Decoded.DsCnt); } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 50c741760d71..d3f55c792017 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -46,14 +46,18 @@ enum { AMDHSA_COV4 = 4, AMDHSA_COV5 = 5 }; /// \returns True if \p STI is AMDHSA. bool isHsaAbi(const MCSubtargetInfo &STI); -/// \returns HSA OS ABI Version identification. -std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI); -/// \returns True if HSA OS ABI Version identification is 4, -/// false otherwise. -bool isHsaAbiVersion4(const MCSubtargetInfo *STI); -/// \returns True if HSA OS ABI Version identification is 5, -/// false otherwise. -bool isHsaAbiVersion5(const MCSubtargetInfo *STI); + +/// \returns Code object version from the IR module flag. +unsigned getAMDHSACodeObjectVersion(const Module &M); + +/// \returns The default HSA code object version. This should only be used when +/// we lack a more accurate CodeObjectVersion value (e.g. from the IR module +/// flag or a .amdhsa_code_object_version directive) +unsigned getDefaultAMDHSACodeObjectVersion(); + +/// \returns ABIVersion suitable for use in ELF's e_ident[ABIVERSION]. \param +/// CodeObjectVersion is a value returned by getAMDHSACodeObjectVersion(). +uint8_t getELFABIVersion(const Triple &OS, unsigned CodeObjectVersion); /// \returns The offset of the multigrid_sync_arg argument from implicitarg_ptr unsigned getMultigridSyncArgImplicitArgPosition(unsigned COV); @@ -64,12 +68,6 @@ unsigned getHostcallImplicitArgPosition(unsigned COV); unsigned getDefaultQueueImplicitArgPosition(unsigned COV); unsigned getCompletionActionImplicitArgPosition(unsigned COV); -/// \returns Code object version. -unsigned getAmdhsaCodeObjectVersion(); - -/// \returns Code object version. 
-unsigned getCodeObjectVersion(const Module &M); - struct GcnBufferFormatInfo { unsigned Format; unsigned BitsPerComp; @@ -114,7 +112,6 @@ private: const MCSubtargetInfo &STI; TargetIDSetting XnackSetting; TargetIDSetting SramEccSetting; - unsigned CodeObjectVersion; public: explicit AMDGPUTargetID(const MCSubtargetInfo &STI); @@ -144,10 +141,6 @@ public: return XnackSetting; } - void setCodeObjectVersion(unsigned COV) { - CodeObjectVersion = COV; - } - /// Sets xnack setting to \p NewXnackSetting. void setXnackSetting(TargetIDSetting NewXnackSetting) { XnackSetting = NewXnackSetting; @@ -837,39 +830,58 @@ getIntegerPairAttribute(const Function &F, StringRef Name, /// Large values (including the maximum possible integer) can be used to /// represent "don't care" waits. struct Waitcnt { - unsigned VmCnt = ~0u; + unsigned LoadCnt = ~0u; // Corresponds to Vmcnt prior to gfx12. unsigned ExpCnt = ~0u; - unsigned LgkmCnt = ~0u; - unsigned VsCnt = ~0u; + unsigned DsCnt = ~0u; // Corresponds to LGKMcnt prior to gfx12. + unsigned StoreCnt = ~0u; // Corresponds to VScnt on gfx10/gfx11. + unsigned SampleCnt = ~0u; // gfx12+ only. + unsigned BvhCnt = ~0u; // gfx12+ only. + unsigned KmCnt = ~0u; // gfx12+ only. Waitcnt() = default; + // Pre-gfx12 constructor. Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt) - : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt), VsCnt(VsCnt) {} - - static Waitcnt allZero(bool HasVscnt) { - return Waitcnt(0, 0, 0, HasVscnt ? 0 : ~0u); + : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt), + SampleCnt(~0u), BvhCnt(~0u), KmCnt(~0u) {} + + // gfx12+ constructor. + Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt, + unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt) + : LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt), + SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {} + + static Waitcnt allZero(bool Extended, bool HasStorecnt) { + return Extended ? Waitcnt(0, 0, 0, 0, 0, 0, 0) + : Waitcnt(0, 0, 0, HasStorecnt ? 0 : ~0u); } - static Waitcnt allZeroExceptVsCnt() { return Waitcnt(0, 0, 0, ~0u); } - bool hasWait() const { - return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u; + static Waitcnt allZeroExceptVsCnt(bool Extended) { + return Extended ? Waitcnt(0, 0, 0, ~0u, 0, 0, 0) : Waitcnt(0, 0, 0, ~0u); } - bool hasWaitExceptVsCnt() const { - return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u; - } + bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); } - bool hasWaitVsCnt() const { - return VsCnt != ~0u; + bool hasWaitExceptStoreCnt() const { + return LoadCnt != ~0u || ExpCnt != ~0u || DsCnt != ~0u || + SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u; } + bool hasWaitStoreCnt() const { return StoreCnt != ~0u; } + Waitcnt combined(const Waitcnt &Other) const { - return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt), - std::min(LgkmCnt, Other.LgkmCnt), - std::min(VsCnt, Other.VsCnt)); + // Does the right thing provided self and Other are either both pre-gfx12 + // or both gfx12+. + return Waitcnt( + std::min(LoadCnt, Other.LoadCnt), std::min(ExpCnt, Other.ExpCnt), + std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt), + std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt), + std::min(KmCnt, Other.KmCnt)); } }; +// The following methods are only meaningful on targets that support +// S_WAITCNT. + /// \returns Vmcnt bit mask for given isa \p Version. 
unsigned getVmcntBitMask(const IsaVersion &Version); @@ -893,17 +905,19 @@ unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt); /// Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa /// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and -/// \p Lgkmcnt respectively. +/// \p Lgkmcnt respectively. Should not be used on gfx12+, the instruction +/// which needs it is deprecated /// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows: /// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9) /// \p Vmcnt = \p Waitcnt[15:14,3:0] (gfx9,10) -/// \p Vmcnt = \p Waitcnt[15:10] (gfx11+) +/// \p Vmcnt = \p Waitcnt[15:10] (gfx11) /// \p Expcnt = \p Waitcnt[6:4] (pre-gfx11) -/// \p Expcnt = \p Waitcnt[2:0] (gfx11+) +/// \p Expcnt = \p Waitcnt[2:0] (gfx11) /// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10) /// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10) -/// \p Lgkmcnt = \p Waitcnt[9:4] (gfx11+) +/// \p Lgkmcnt = \p Waitcnt[9:4] (gfx11) +/// void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt); @@ -922,26 +936,78 @@ unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Lgkmcnt); /// Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa -/// \p Version. +/// \p Version. Should not be used on gfx12+, the instruction which needs +/// it is deprecated /// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows: /// Waitcnt[2:0] = \p Expcnt (gfx11+) /// Waitcnt[3:0] = \p Vmcnt (pre-gfx9) /// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9,10) /// Waitcnt[6:4] = \p Expcnt (pre-gfx11) -/// Waitcnt[9:4] = \p Lgkmcnt (gfx11+) +/// Waitcnt[9:4] = \p Lgkmcnt (gfx11) /// Waitcnt[11:8] = \p Lgkmcnt (pre-gfx10) /// Waitcnt[13:8] = \p Lgkmcnt (gfx10) -/// Waitcnt[15:10] = \p Vmcnt (gfx11+) +/// Waitcnt[15:10] = \p Vmcnt (gfx11) /// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9,10) /// /// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given /// isa \p Version. +/// unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt); unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded); +// The following methods are only meaningful on targets that support +// S_WAIT_*CNT, introduced with gfx12. + +/// \returns Loadcnt bit mask for given isa \p Version. +/// Returns 0 for versions that do not support LOADcnt +unsigned getLoadcntBitMask(const IsaVersion &Version); + +/// \returns Samplecnt bit mask for given isa \p Version. +/// Returns 0 for versions that do not support SAMPLEcnt +unsigned getSamplecntBitMask(const IsaVersion &Version); + +/// \returns Bvhcnt bit mask for given isa \p Version. +/// Returns 0 for versions that do not support BVHcnt +unsigned getBvhcntBitMask(const IsaVersion &Version); + +/// \returns Dscnt bit mask for given isa \p Version. +/// Returns 0 for versions that do not support DScnt +unsigned getDscntBitMask(const IsaVersion &Version); + +/// \returns Dscnt bit mask for given isa \p Version. +/// Returns 0 for versions that do not support KMcnt +unsigned getKmcntBitMask(const IsaVersion &Version); + +/// \return STOREcnt or VScnt bit mask for given isa \p Version. +/// returns 0 for versions that do not support STOREcnt or VScnt. +/// STOREcnt and VScnt are the same counter, the name used +/// depends on the ISA version. 
+unsigned getStorecntBitMask(const IsaVersion &Version); + +// The following are only meaningful on targets that support +// S_WAIT_LOADCNT_DSCNT and S_WAIT_STORECNT_DSCNT. + +/// \returns Decoded Waitcnt structure from given \p LoadcntDscnt for given +/// isa \p Version. +Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt); + +/// \returns Decoded Waitcnt structure from given \p StorecntDscnt for given +/// isa \p Version. +Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt); + +/// \returns \p Loadcnt and \p Dscnt components of \p Decoded encoded as an +/// immediate that can be used with S_WAIT_LOADCNT_DSCNT for given isa +/// \p Version. +unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded); + +/// \returns \p Storecnt and \p Dscnt components of \p Decoded encoded as an +/// immediate that can be used with S_WAIT_STORECNT_DSCNT for given isa +/// \p Version. +unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded); + namespace Hwreg { LLVM_READONLY diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 99960c94e598..95a1d8696347 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -48,7 +48,7 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On let mayStore = 0; let hasSideEffects = 0; - let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret); + let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP); let mayRaiseFPException = ReadsModeReg; @@ -585,7 +585,7 @@ class VOPProfile_Base_CVT_F32_F8<ValueType vt> : VOPProfileI2F <vt, i32> { def VOPProfileCVT_F32_F8 : VOPProfile_Base_CVT_F32_F8 <f32>; def VOPProfileCVT_PK_F32_F8 : VOPProfile_Base_CVT_F32_F8 <v2f32>; -let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0, +let SubtargetPredicate = HasFP8ConversionInsts, mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in { defm V_CVT_F32_FP8 : VOP1Inst<"v_cvt_f32_fp8", VOPProfileCVT_F32_F8>; defm V_CVT_F32_BF8 : VOP1Inst<"v_cvt_f32_bf8", VOPProfileCVT_F32_F8>; @@ -705,7 +705,6 @@ class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = let DecoderNamespace = "DPP"#Gen.DecoderNamespace; } - class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : VOP_DPP8<ps.OpName, p> { let hasSideEffects = ps.hasSideEffects; @@ -881,6 +880,7 @@ defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16" defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">; defm V_FREXP_MANT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">; defm V_FREXP_EXP_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05a, "v_frexp_exp_i16_f16">; +defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">; defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">; defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">; defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">; @@ -1357,7 +1357,7 @@ defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>; -let OtherPredicates = [HasFP8Insts] in { +let OtherPredicates = [HasFP8ConversionInsts] in { defm V_CVT_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x54>; defm V_CVT_F32_BF8 : 
VOP1_Real_NoDstSel_SDWA_gfx9<0x55>; defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 48d4e259bc1c..27eec64f59a6 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -69,7 +69,7 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf let mayStore = 0; let hasSideEffects = 0; - let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret); + let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP); let mayRaiseFPException = ReadsModeReg; @@ -418,15 +418,11 @@ def VOP_MADMK_F16_t16 : VOP_MADMK <f16> { } def VOP_MADMK_F32 : VOP_MADMK <f32>; -class getRegisterOperandForVT<ValueType VT> { - RegisterOperand ret = RegisterOperand<getVregSrcForVT<VT>.ret>; -} - // FIXME: Remove src2_modifiers. It isn't used, so is wasting memory // and processing time but it makes it easier to convert to mad. class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT>.ret:$src2); - let Ins64 = getIns64<Src0RC64, Src1RC64, getRegisterOperandForVT<Src2VT>.ret, 3, + let Ins64 = getIns64<Src0RC64, Src1RC64, getVregSrcForVT<Src2VT>.ret, 3, 0, HasModifiers, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, @@ -489,21 +485,21 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> { let DstRC = VOPDstOperand<VGPR_32_Lo128>; let DstRC64 = VOPDstOperand<VGPR_32>; let Src1RC32 = VGPRSrc_32_Lo128; - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT_t16<Src2VT>.ret:$src2); - let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret; - let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret; - let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret; + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2); + let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret; let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret; let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, - getVregSrcForVT_t16<Src2VT>.ret:$src2, // stub argument + getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, - getVregSrcForVT_t16<Src2VT>.ret:$src2, // stub argument + getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument dpp8:$dpp8, FI:$fi); let Src2Mod = FP32InputMods; // dummy unused modifiers let Src2RC64 = VGPRSrc_32; // stub argument @@ -535,7 +531,7 @@ def VOP_DOT_ACC_I32_I32 : VOP_DOT_ACC<i32, i32> { let Src0Mod = Int32InputMods; let Src1Mod = Int32InputMods; - let Ins64 = getIns64<Src0RC64, Src1RC64, getRegisterOperandForVT<Src2VT>.ret, + let Ins64 = getIns64<Src0RC64, Src1RC64, getVregSrcForVT<Src2VT>.ret, 3 /*NumSrcArgs*/, HasClamp, 1 /*HasModifiers*/, 1 /*HasSrc2Mods*/, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; @@ -898,8 +894,8 
@@ def LDEXP_F16_VOPProfile : VOPProfile <[f16, f16, f16, untyped]> { } def LDEXP_F16_VOPProfile_True16 : VOPProfile_Fake16<VOP_F16_F16_F16> { let Src1RC32 = RegisterOperand<VGPR_32_Lo128>; - let Src1DPP = VGPR_32_Lo128; - let Src1ModDPP = IntT16VRegInputMods; + let Src1DPP = RegisterOperand<VGPR_32_Lo128>; + let Src1ModDPP = IntT16VRegInputMods</* IsFake16= */ 1>; } let isReMaterializable = 1 in { @@ -2512,6 +2508,7 @@ defm V_FMAAK_F32 : VOP2_Real_MADK_gfx940 <0x18>; } multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : Base_VOP2_Real_e32e64_vi<op> { + let SubtargetPredicate = isGFX9Only in def _dpp_vi : VOP2_DPP<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>; } @@ -2520,22 +2517,28 @@ multiclass VOP2_Real_DOT_ACC_gfx10<bits<6> op> : VOP2_Real_dpp_gfx10<op>, VOP2_Real_dpp8_gfx10<op>; -let SubtargetPredicate = HasDot5Insts in { +multiclass VOP2Only_Real_DOT_ACC_gfx10<bits<6> op> : VOP2_Real_dpp_gfx10<op>, + VOP2_Real_dpp8_gfx10<op> { + let IsSingle = 1 in + defm NAME : VOP2_Real_e32_gfx10<op>; +} + +let OtherPredicates = [HasDot5Insts] in { defm V_DOT2C_F32_F16 : VOP2_Real_DOT_ACC_gfx9<0x37>; // NB: Opcode conflicts with V_DOT8C_I32_I4 // This opcode exists in gfx 10.1* only - defm V_DOT2C_F32_F16 : VOP2_Real_DOT_ACC_gfx10<0x02>; + defm V_DOT2C_F32_F16 : VOP2Only_Real_DOT_ACC_gfx10<0x02>; } -let SubtargetPredicate = HasDot6Insts in { +let OtherPredicates = [HasDot6Insts] in { defm V_DOT4C_I32_I8 : VOP2_Real_DOT_ACC_gfx9<0x39>; - defm V_DOT4C_I32_I8 : VOP2_Real_DOT_ACC_gfx10<0x0d>; + defm V_DOT4C_I32_I8 : VOP2Only_Real_DOT_ACC_gfx10<0x0d>; } -let SubtargetPredicate = HasDot4Insts in { +let OtherPredicates = [HasDot4Insts] in { defm V_DOT2C_I32_I16 : VOP2_Real_DOT_ACC_gfx9<0x38>; } -let SubtargetPredicate = HasDot3Insts in { +let OtherPredicates = [HasDot3Insts] in { defm V_DOT8C_I32_I4 : VOP2_Real_DOT_ACC_gfx9<0x3a>; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td index eebd323210f9..713b4712d563 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -600,7 +600,7 @@ defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32 let SubtargetPredicate = isGFX940Plus in defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I64>>; -let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0, +let SubtargetPredicate = HasFP8ConversionInsts, mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in { let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in { defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>; @@ -1611,7 +1611,7 @@ defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>; defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>; -let OtherPredicates = [HasFP8Insts] in { +let OtherPredicates = [HasFP8ConversionInsts] in { defm V_CVT_PK_FP8_F32 : VOP3OpSel_Real_gfx9 <0x2a2>; defm V_CVT_PK_BF8_F32 : VOP3OpSel_Real_gfx9 <0x2a3>; defm V_CVT_SR_FP8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a4>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index e9d6f67aee16..0c7a08cd4bc9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -415,8 +415,8 @@ multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> { null_frag, 1>; // Dot-iu 
instructions consider input as signed if imod neg bits are set. Thus // Dot-iu Intrinsics have extra operands and require separate codegen pattern. - def : GCNPat < (intrinsic_node (DotIUVOP3PMods i32:$src0_mods), i32:$src0, - (DotIUVOP3PMods i32:$src1_mods), i32:$src1, + def : GCNPat < (intrinsic_node (VOP3PModsNeg i32:$src0_mods), i32:$src0, + (VOP3PModsNeg i32:$src1_mods), i32:$src1, i32:$src2, (i1 timm:$clamp)), (!cast<Instruction>(NAME) $src0_mods, i32:$src0, $src1_mods, i32:$src1, @@ -443,6 +443,48 @@ def : GCNPat < (int_amdgcn_sdot4 i32:$src0, >; } // End SubtargetPredicate = HasDot8Insts +// Does not use opsel, no src_modifiers on src0 and src1. +// src_modifiers on src2(f32) are f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]). +def VOP3P_DOTF8_Profile : VOP3P_Profile<VOPProfile <[f32, i32, i32, f32]>, + VOP3_PACKED, 1> { + let HasClamp = 0; + let HasOpSel = 0; + let HasOMod = 0; + let IsDOT = 1; + let HasSrc0Mods = 0; + let HasSrc1Mods = 0; + let HasSrc2Mods = 1; + + let InsVOP3P = (ins VSrc_b32:$src0, VSrc_b32:$src1, + PackedF16InputMods:$src2_modifiers, VSrc_f32:$src2, + neg_lo0:$neg_lo, neg_hi0:$neg_hi); + + let InsVOP3DPP8 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1, + PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2, + neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, FI:$fi); + + let InsVOP3DPP16 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1, + PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2, + neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp_ctrl:$dpp_ctrl, + row_mask:$row_mask, bank_mask:$bank_mask, + bound_ctrl:$bound_ctrl, FI:$fi); +} + +multiclass VOP3PDOTF8Inst <string OpName, SDPatternOperator intrinsic_node> { + defm NAME : VOP3PInst<OpName, VOP3P_DOTF8_Profile, null_frag, 1>; + + let SubtargetPredicate = isGFX12Plus in + def : GCNPat <(intrinsic_node i32:$src0, i32:$src1, + (VOP3Mods f32:$src2, i32:$src2_modifiers)), + (!cast<Instruction>(NAME) i32:$src0, i32:$src1, + i32:$src2_modifiers, f32:$src2)>; +} + +defm V_DOT4_F32_FP8_BF8 : VOP3PDOTF8Inst<"v_dot4_f32_fp8_bf8", int_amdgcn_dot4_f32_fp8_bf8>; +defm V_DOT4_F32_BF8_FP8 : VOP3PDOTF8Inst<"v_dot4_f32_bf8_fp8", int_amdgcn_dot4_f32_bf8_fp8>; +defm V_DOT4_F32_FP8_FP8 : VOP3PDOTF8Inst<"v_dot4_f32_fp8_fp8", int_amdgcn_dot4_f32_fp8_fp8>; +defm V_DOT4_F32_BF8_BF8 : VOP3PDOTF8Inst<"v_dot4_f32_bf8_bf8", int_amdgcn_dot4_f32_bf8_bf8>; + def : UDot2Pat<V_DOT2_U32_U16>; def : SDot2Pat<V_DOT2_I32_I16>; @@ -593,13 +635,29 @@ class MAIFrag<SDPatternOperator Op, code pred> : PatFrag < pred >; -let GISelPredicateCode = [{ return MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }] in -class AgprMAIFrag<SDPatternOperator Op> : - MAIFrag<Op, [{ return MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }]>; +defvar MayNeedAGPRs = [{ + return MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); +}]; + +defvar MayNeedAGPRs_gisel = [{ + return MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); +}]; + +defvar MayNotNeedAGPRs = [{ + return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); +}]; + +defvar MayNotNeedAGPRs_gisel = [{ + return !MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); +}]; -let GISelPredicateCode = [{ return !MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }] in -class VgprMAIFrag<SDPatternOperator Op> : - MAIFrag<Op, [{ return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }]>; +class AgprMAIFrag<SDPatternOperator Op> : MAIFrag<Op, MayNeedAGPRs> { + let GISelPredicateCode = MayNeedAGPRs_gisel; +} + +class VgprMAIFrag<SDPatternOperator Op> : MAIFrag<Op, MayNotNeedAGPRs> { + let 
GISelPredicateCode = MayNotNeedAGPRs_gisel; +} let SubtargetPredicate = HasMAIInsts in { @@ -812,8 +870,8 @@ class WMMAOpSelPat<Instruction Inst, SDPatternOperator node, VOPProfile P> : class WMMAUIClampPat<Instruction Inst, SDPatternOperator node, VOPProfile P> : GCNPat < (P.DstVT (node - (DotIUVOP3PMods i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0), - (DotIUVOP3PMods i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1), + (VOP3PModsNeg i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0), + (VOP3PModsNeg i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1), (P.Src2VT P.Src2VT:$src2), (i1 timm:$clamp) )), (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp)) @@ -1003,6 +1061,11 @@ defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_m defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>; defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>; +defm V_DOT4_F32_FP8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x24>; +defm V_DOT4_F32_BF8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x25>; +defm V_DOT4_F32_FP8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x26>; +defm V_DOT4_F32_BF8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x27>; + //===----------------------------------------------------------------------===// // GFX11 //===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td index e5b801048e6d..3ca97f0291e0 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -61,13 +61,13 @@ class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt let AsmDPP16 = AsmDPP#"$fi"; // VOPC DPP Instructions do not need an old operand let TieRegDPP = ""; - let InsDPP = getInsDPP<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP, + let InsDPP = getInsDPP<VOPDstOperand<Src0DPP.RegClass>, Src0DPP, Src1DPP, Src2DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, Src2ModDPP, 0/*HasOld*/>.ret; - let InsDPP16 = getInsDPP16<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP, + let InsDPP16 = getInsDPP16<VOPDstOperand<Src0DPP.RegClass>, Src0DPP, Src1DPP, Src2DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, Src2ModDPP, 0/*HasOld*/>.ret; - let InsDPP8 = getInsDPP8<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP, + let InsDPP8 = getInsDPP8<VOPDstOperand<Src0DPP.RegClass>, Src0DPP, Src1DPP, Src2DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, Src2ModDPP, 0/*HasOld*/>.ret; @@ -88,10 +88,10 @@ multiclass VOPC_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, ValueType def NAME : VOPC_Profile<sched, vt0, vt1>; def _t16 : VOPC_Profile<sched, vt0, vt1> { let IsTrue16 = 1; - let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>; - let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret; - let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret; - let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret; + let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret; let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret; let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret; @@ -108,8 +108,8 @@ class VOPC_NoSdst_Profile<list<SchedReadWrite> sched, ValueType vt0, 
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, src0_sel:$src0_sel, src1_sel:$src1_sel); - let AsmVOP3Base = !if(isFloatType<Src0VT>.ret, "$src0_modifiers, $src1_modifiers$clamp", - "$src0, $src1"); + let AsmVOP3Base = !if(Src0VT.isFP, "$src0_modifiers, $src1_modifiers$clamp", + "$src0, $src1"); let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; let EmitDst = 0; } @@ -118,10 +118,10 @@ multiclass VOPC_NoSdst_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, Va def NAME : VOPC_NoSdst_Profile<sched, vt0, vt1>; def _t16 : VOPC_NoSdst_Profile<sched, vt0, vt1> { let IsTrue16 = 1; - let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>; - let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret; - let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret; - let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret; + let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret; let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret; let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret; @@ -146,7 +146,7 @@ class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[], let mayStore = 0; let hasSideEffects = 0; - let ReadsModeReg = isFloatType<P.Src0VT>.ret; + let ReadsModeReg = P.Src0VT.isFP; let VALU = 1; let VOPC = 1; @@ -789,11 +789,11 @@ multiclass VOPC_Class_Profile_t16<list<SchedReadWrite> sched> { def NAME : VOPC_Class_Profile<sched, f16>; def _t16 : VOPC_Class_Profile<sched, f16, i16> { let IsTrue16 = 1; - let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>; + let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; let Src1RC64 = VSrc_b32; - let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret; - let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret; - let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret; + let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret; let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret; let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret; @@ -816,11 +816,11 @@ multiclass VOPC_Class_NoSdst_Profile_t16<list<SchedReadWrite> sched> { def NAME : VOPC_Class_NoSdst_Profile<sched, f16>; def _t16 : VOPC_Class_NoSdst_Profile<sched, f16, i16> { let IsTrue16 = 1; - let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>; + let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; let Src1RC64 = VSrc_b32; - let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret; - let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret; - let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret; + let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; + let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret; let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret; let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret; let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td index c4b9e7063093..df505c3365cb 100644 --- 
a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -152,7 +152,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [], let ClampLo = P.HasClampLo; let ClampHi = P.HasClampHi; - let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret); + let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP); let mayRaiseFPException = ReadsModeReg; let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]); @@ -169,6 +169,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [], class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> : VOP3_Pseudo<opName, P, pattern, 1> { let VOP3P = 1; + let IsDOT = P.IsDOT; } class VOP_Real<VOP_Pseudo ps> { @@ -387,7 +388,7 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 { let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) - let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2) + let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(2) let Inst{15} = !if(P.HasClamp, clamp{0}, 0); @@ -396,8 +397,8 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 { let Inst{40-32} = !if(P.HasSrc0, src0, 0); let Inst{49-41} = !if(P.HasSrc1, src1, 0); let Inst{58-50} = !if(P.HasSrc2, src2, 0); - let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0) - let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1) + let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(0) + let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(1) let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo) let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) @@ -599,7 +600,7 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : let VALU = 1; let SDWA = 1; - let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret); + let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP); let mayRaiseFPException = ReadsModeReg; let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]); @@ -772,12 +773,12 @@ class VOP3P_DPPe_Common_Base<bits<7> op, VOPProfile P> : Enc96 { let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) - let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2) + let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(2) let Inst{15} = !if(P.HasClamp, clamp{0}, 0); let Inst{22-16} = op; let Inst{31-23} = 0x198; // encoding - let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0) - let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1) + let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(0) + let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(1) let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) let Inst{62} = !if(P.HasSrc1Mods, 
src1_modifiers{0}, 0); // neg (lo) let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) @@ -811,7 +812,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], let DPP = 1; let Size = 8; - let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret); + let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP); let mayRaiseFPException = ReadsModeReg; let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
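
Worked illustration of the combined wait-count encoding added in AMDGPUBaseInfo.cpp earlier in this diff (encodeLoadcntDscnt / decodeLoadcntDscnt): per the shift/width helpers in the patch, gfx12 places DScnt in bits [5:0] and LOADcnt (or STOREcnt) in bits [13:8] of the S_WAIT_LOADCNT_DSCNT / S_WAIT_STORECNT_DSCNT immediate. The standalone sketch below reproduces that packing with plain bit operations; the constants mirror the helpers above and the function name is local to the example.

    // Field parameters for gfx12, taken from the shift/width helpers above.
    constexpr unsigned DscntShift = 0, DscntWidth = 6;
    constexpr unsigned LoadcntShift = 8, LoadcntWidth = 6;

    constexpr unsigned encodeLoadcntDscntExample(unsigned Loadcnt, unsigned Dscnt) {
      return ((Loadcnt & ((1u << LoadcntWidth) - 1)) << LoadcntShift) |
             ((Dscnt & ((1u << DscntWidth) - 1)) << DscntShift);
    }

    // loadcnt 0 / dscnt 3: wait for all loads, allow up to three pending DS ops.
    static_assert(encodeLoadcntDscntExample(0, 3) == 0x003, "dscnt occupies bits [5:0]");
    // loadcnt 2 / dscnt 0: the loadcnt field lands in bits [13:8] (2 << 8).
    static_assert(encodeLoadcntDscntExample(2, 0) == 0x200, "loadcnt occupies bits [13:8]");
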

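A small note on the SIModeRegister change near the top of this section: the likely motivation for switching from (1 << Width) - 1 to maskTrailingOnes<unsigned>(Width) is that a setreg field can be a full 32 bits wide, and shifting a 32-bit value by 32 is undefined behaviour in C++, whereas maskTrailingOnes is well defined for widths from 0 to 32. An illustrative standalone check (not part of the patch):

    #include "llvm/Support/MathExtras.h"
    #include <cassert>

    int main() {
      assert(llvm::maskTrailingOnes<unsigned>(4) == 0xFu);         // low four bits set
      assert(llvm::maskTrailingOnes<unsigned>(32) == 0xFFFFFFFFu); // full width, no UB
      assert(llvm::maskTrailingOnes<unsigned>(0) == 0u);           // empty mask
      return 0;
    }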