| author | Dimitry Andric <dim@FreeBSD.org> | 2023-12-18 20:30:12 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2024-04-06 20:11:55 +0000 |
| commit | 5f757f3ff9144b609b3c433dfd370cc6bdc191ad (patch) | |
| tree | 1b4e980b866cd26a00af34c0a653eb640bd09caf /contrib/llvm-project/llvm/lib/Target/AMDGPU | |
| parent | 3e1c8a35f741a5d114d0ba670b15191355711fe9 (diff) | |
| parent | 312c0ed19cc5276a17bacf2120097bec4515b0f1 (diff) | |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU')
161 files changed, 17573 insertions, 7907 deletions
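
Among the changes in the patch below, the address-space alias matrix in AMDGPU.h grows from 9x9 to 10x10 to cover the new buffer strided pointer address space. The following standalone sketch is not part of the patch; the real query is llvm::AMDGPU::addrspacesMayAlias in AMDGPU.h, and the helper name here is illustrative only. It shows how the extended table is indexed:

```cpp
// Standalone illustration of the 10x10 address-space alias table added in
// this patch. The numbering mirrors AMDGPUAS (0 = flat ... 9 = buffer
// strided pointer).
constexpr unsigned MaxAMDGPUAddress = 9;

inline bool mayAlias(unsigned AS1, unsigned AS2) {
  // Anything outside the modelled range is conservatively assumed to alias.
  if (AS1 > MaxAMDGPUAddress || AS2 > MaxAMDGPUAddress)
    return true;
  // Row/column order: Flat, Global, Region, Group, Constant, Private,
  // Const32, BufFatPtr, BufRsrc, BufStrdPtr (same order as in the patch).
  static const bool ASAliasRules[10][10] = {
      {true,  true,  false, true,  true,  true,  true,  true,  true,  true},
      {true,  true,  false, false, true,  false, true,  true,  true,  true},
      {false, false, true,  false, false, false, false, false, false, false},
      {true,  false, false, true,  false, false, false, false, false, false},
      {true,  true,  false, false, false, false, true,  true,  true,  true},
      {true,  false, false, false, false, true,  false, false, false, false},
      {true,  true,  false, false, true,  false, false, true,  true,  true},
      {true,  true,  false, false, true,  false, true,  true,  true,  true},
      {true,  true,  false, false, true,  false, true,  true,  true,  true},
      {true,  true,  false, false, true,  false, true,  true,  true,  true},
  };
  return ASAliasRules[AS1][AS2];
}
```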
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h index c25194c02f72..35d33cb60bc4 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -12,10 +12,12 @@ #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" +#include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Support/CodeGen.h" namespace llvm { +class AMDGPUTargetMachine; class TargetMachine; // GlobalISel passes @@ -34,6 +36,7 @@ FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSIFoldOperandsPass(); FunctionPass *createSIPeepholeSDWAPass(); FunctionPass *createSILowerI1CopiesPass(); +FunctionPass *createAMDGPUGlobalISelDivergenceLoweringPass(); FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(); FunctionPass *createSIWholeQuadModePass(); @@ -41,25 +44,32 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIOptimizeExecMaskingPreRAPass(); FunctionPass *createSIOptimizeVGPRLiveRangePass(); FunctionPass *createSIFixSGPRCopiesPass(); +FunctionPass *createLowerWWMCopiesPass(); FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createSIPreAllocateWWMRegsPass(); FunctionPass *createSIFormMemoryClausesPass(); FunctionPass *createSIPostRABundlerPass(); -FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *); -FunctionPass *createAMDGPUUseNativeCallsPass(); +FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *); ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *); FunctionPass *createAMDGPUCodeGenPreparePass(); FunctionPass *createAMDGPULateCodeGenPreparePass(); FunctionPass *createAMDGPUMachineCFGStructurizerPass(); FunctionPass *createAMDGPURewriteOutArgumentsPass(); -ModulePass *createAMDGPULowerModuleLDSPass(); +ModulePass * +createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr); FunctionPass *createSIModeRegisterPass(); FunctionPass *createGCNPreRAOptimizationsPass(); struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> { - AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {} + AMDGPUSimplifyLibCallsPass() {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +struct AMDGPUImageIntrinsicOptimizerPass + : PassInfoMixin<AMDGPUImageIntrinsicOptimizerPass> { + AMDGPUImageIntrinsicOptimizerPass(TargetMachine &TM) : TM(TM) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); private: @@ -78,8 +88,8 @@ extern char &AMDGPUMachineCFGStructurizerID; void initializeAMDGPUAlwaysInlinePass(PassRegistry&); Pass *createAMDGPUAnnotateKernelFeaturesPass(); -Pass *createAMDGPUAttributorPass(); -void initializeAMDGPUAttributorPass(PassRegistry &); +Pass *createAMDGPUAttributorLegacyPass(); +void initializeAMDGPUAttributorLegacyPass(PassRegistry &); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; @@ -116,10 +126,13 @@ struct AMDGPULowerKernelAttributesPass PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; -void initializeAMDGPULowerModuleLDSPass(PassRegistry &); -extern char &AMDGPULowerModuleLDSID; +void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &); +extern char &AMDGPULowerModuleLDSLegacyPassID; struct AMDGPULowerModuleLDSPass : PassInfoMixin<AMDGPULowerModuleLDSPass> { + const AMDGPUTargetMachine &TM; + 
AMDGPULowerModuleLDSPass(const AMDGPUTargetMachine &TM_) : TM(TM_) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; @@ -144,9 +157,15 @@ extern char &SIFixSGPRCopiesID; void initializeSIFixVGPRCopiesPass(PassRegistry &); extern char &SIFixVGPRCopiesID; +void initializeSILowerWWMCopiesPass(PassRegistry &); +extern char &SILowerWWMCopiesID; + void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; +void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &); +extern char &AMDGPUGlobalISelDivergenceLoweringID; + void initializeSILowerSGPRSpillsPass(PassRegistry &); extern char &SILowerSGPRSpillsID; @@ -171,15 +190,15 @@ extern char &SIOptimizeExecMaskingID; void initializeSIPreAllocateWWMRegsPass(PassRegistry &); extern char &SIPreAllocateWWMRegsID; -void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &); -extern char &AMDGPUSimplifyLibCallsID; - -void initializeAMDGPUUseNativeCallsPass(PassRegistry &); -extern char &AMDGPUUseNativeCallsID; +void initializeAMDGPUImageIntrinsicOptimizerPass(PassRegistry &); +extern char &AMDGPUImageIntrinsicOptimizerID; void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &); extern char &AMDGPUPerfHintAnalysisID; +void initializeGCNRegPressurePrinterPass(PassRegistry &); +extern char &GCNRegPressurePrinterID; + // Passes common to R600 and SI FunctionPass *createAMDGPUPromoteAlloca(); void initializeAMDGPUPromoteAllocaPass(PassRegistry&); @@ -217,8 +236,7 @@ private: }; Pass *createAMDGPUStructurizeCFGPass(); -FunctionPass *createAMDGPUISelDag(TargetMachine &TM, - CodeGenOpt::Level OptLevel); +FunctionPass *createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel); ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true); struct AMDGPUAlwaysInlinePass : PassInfoMixin<AMDGPUAlwaysInlinePass> { @@ -239,6 +257,25 @@ public: PreservedAnalyses run(Function &, FunctionAnalysisManager &); }; +class AMDGPULowerKernelArgumentsPass + : public PassInfoMixin<AMDGPULowerKernelArgumentsPass> { +private: + TargetMachine &TM; + +public: + AMDGPULowerKernelArgumentsPass(TargetMachine &TM) : TM(TM){}; + PreservedAnalyses run(Function &, FunctionAnalysisManager &); +}; + +class AMDGPUAttributorPass : public PassInfoMixin<AMDGPUAttributorPass> { +private: + TargetMachine &TM; + +public: + AMDGPUAttributorPass(TargetMachine &TM) : TM(TM){}; + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + FunctionPass *createAMDGPUAnnotateUniformValues(); ModulePass *createAMDGPUPrintfRuntimeBinding(); @@ -279,9 +316,16 @@ extern char &AMDGPURemoveIncompatibleFunctionsID; void initializeAMDGPULateCodeGenPreparePass(PassRegistry &); extern char &AMDGPULateCodeGenPrepareID; -FunctionPass *createAMDGPURewriteUndefForPHIPass(); -void initializeAMDGPURewriteUndefForPHIPass(PassRegistry &); -extern char &AMDGPURewriteUndefForPHIPassID; +FunctionPass *createAMDGPURewriteUndefForPHILegacyPass(); +void initializeAMDGPURewriteUndefForPHILegacyPass(PassRegistry &); +extern char &AMDGPURewriteUndefForPHILegacyPassID; + +class AMDGPURewriteUndefForPHIPass + : public PassInfoMixin<AMDGPURewriteUndefForPHIPass> { +public: + AMDGPURewriteUndefForPHIPass() = default; + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; void initializeSIAnnotateControlFlowPass(PassRegistry&); extern char &SIAnnotateControlFlowPassID; @@ -295,6 +339,9 @@ extern char &SIModeRegisterID; void initializeAMDGPUInsertDelayAluPass(PassRegistry &); extern char &AMDGPUInsertDelayAluID; +void 
initializeAMDGPUInsertSingleUseVDSTPass(PassRegistry &); +extern char &AMDGPUInsertSingleUseVDSTID; + void initializeSIInsertHardClausesPass(PassRegistry &); extern char &SIInsertHardClausesID; @@ -347,72 +394,6 @@ enum TargetIndex { TI_SCRATCH_RSRC_DWORD2, TI_SCRATCH_RSRC_DWORD3 }; -} - -/// OpenCL uses address spaces to differentiate between -/// various memory regions on the hardware. On the CPU -/// all of the address spaces point to the same memory, -/// however on the GPU, each address space points to -/// a separate piece of memory that is unique from other -/// memory locations. -namespace AMDGPUAS { -enum : unsigned { - // The maximum value for flat, generic, local, private, constant and region. - MAX_AMDGPU_ADDRESS = 8, - - FLAT_ADDRESS = 0, ///< Address space for flat memory. - GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - REGION_ADDRESS = 2, ///< Address space for region memory. (GDS) - - CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2). - LOCAL_ADDRESS = 3, ///< Address space for local memory. - PRIVATE_ADDRESS = 5, ///< Address space for private memory. - - CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory. - - BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers. - ///< Not used in backend. - - BUFFER_RESOURCE = 8, ///< Address space for 128-bit buffer resources. - - /// Internal address spaces. Can be freely renumbered. - STREAMOUT_REGISTER = 128, ///< Address space for GS NGG Streamout registers. - /// end Internal address spaces. - - /// Address space for direct addressable parameter memory (CONST0). - PARAM_D_ADDRESS = 6, - /// Address space for indirect addressable parameter memory (VTX1). - PARAM_I_ADDRESS = 7, - - // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on - // this order to be able to dynamically index a constant buffer, for - // example: - // - // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx - - CONSTANT_BUFFER_0 = 8, - CONSTANT_BUFFER_1 = 9, - CONSTANT_BUFFER_2 = 10, - CONSTANT_BUFFER_3 = 11, - CONSTANT_BUFFER_4 = 12, - CONSTANT_BUFFER_5 = 13, - CONSTANT_BUFFER_6 = 14, - CONSTANT_BUFFER_7 = 15, - CONSTANT_BUFFER_8 = 16, - CONSTANT_BUFFER_9 = 17, - CONSTANT_BUFFER_10 = 18, - CONSTANT_BUFFER_11 = 19, - CONSTANT_BUFFER_12 = 20, - CONSTANT_BUFFER_13 = 21, - CONSTANT_BUFFER_14 = 22, - CONSTANT_BUFFER_15 = 23, - - // Some places use this if the address space can't be determined. - UNKNOWN_ADDRESS_SPACE = ~0u, -}; -} - -namespace AMDGPU { // FIXME: Missing constant_32bit inline bool isFlatGlobalAddrSpace(unsigned AS) { @@ -429,24 +410,25 @@ inline bool isExtendedGlobalAddrSpace(unsigned AS) { } static inline bool addrspacesMayAlias(unsigned AS1, unsigned AS2) { - static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 8, "Addr space out of range"); + static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 9, "Addr space out of range"); if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS) return true; - // This array is indexed by address space value enum elements 0 ... to 8 + // This array is indexed by address space value enum elements 0 ... 
to 9 // clang-format off - static const bool ASAliasRules[9][9] = { - /* Flat Global Region Group Constant Private Const32 BufFatPtr BufRsrc */ - /* Flat */ {true, true, false, true, true, true, true, true, true}, - /* Global */ {true, true, false, false, true, false, true, true, true}, - /* Region */ {false, false, true, false, false, false, false, false, false}, - /* Group */ {true, false, false, true, false, false, false, false, false}, - /* Constant */ {true, true, false, false, false, false, true, true, true}, - /* Private */ {true, false, false, false, false, true, false, false, false}, - /* Constant 32-bit */ {true, true, false, false, true, false, false, true, true}, - /* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true}, - /* Buffer Resource */ {true, true, false, false, true, false, true, true, true}, + static const bool ASAliasRules[10][10] = { + /* Flat Global Region Group Constant Private Const32 BufFatPtr BufRsrc BufStrdPtr */ + /* Flat */ {true, true, false, true, true, true, true, true, true, true}, + /* Global */ {true, true, false, false, true, false, true, true, true, true}, + /* Region */ {false, false, true, false, false, false, false, false, false, false}, + /* Group */ {true, false, false, true, false, false, false, false, false, false}, + /* Constant */ {true, true, false, false, false, false, true, true, true, true}, + /* Private */ {true, false, false, false, false, true, false, false, false, false}, + /* Constant 32-bit */ {true, true, false, false, true, false, false, true, true, true}, + /* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true, true}, + /* Buffer Resource */ {true, true, false, false, true, false, true, true, true, true}, + /* Buffer Strided Ptr */ {true, true, false, false, true, false, true, true, true, true}, }; // clang-format on diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td index b178623a319d..060fb66d38f7 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -281,6 +281,12 @@ def FeatureMADIntraFwdBug : SubtargetFeature<"mad-intra-fwd-bug", "MAD_U64/I64 intra instruction forwarding bug" >; +def FeatureMSAALoadDstSelBug : SubtargetFeature<"msaa-load-dst-sel-bug", + "HasMSAALoadDstSelBug", + "true", + "MSAA loads not honoring dst_sel bug" +>; + class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < "ldsbankcount"#Value, "LDSBankCount", @@ -339,6 +345,12 @@ def FeatureGFX11Insts : SubtargetFeature<"gfx11-insts", "Additional instructions for GFX11+" >; +def FeatureGFX12Insts : SubtargetFeature<"gfx12-insts", + "GFX12Insts", + "true", + "Additional instructions for GFX12+" +>; + def FeatureGFX10_3Insts : SubtargetFeature<"gfx10-3-insts", "GFX10_3Insts", "true", @@ -375,6 +387,12 @@ def FeatureTrue16BitInsts : SubtargetFeature<"true16", "True 16-bit operand instructions" >; +def FeatureRealTrue16Insts : SubtargetFeature<"real-true16", + "EnableRealTrue16Insts", + "true", + "Use true 16-bit registers" +>; + def FeatureVOP3P : SubtargetFeature<"vop3p", "HasVOP3PInsts", "true", @@ -393,6 +411,12 @@ def FeatureVGPRIndexMode : SubtargetFeature<"vgpr-index-mode", "Has VGPR mode register indexing" >; +def FeatureScalarDwordx3Loads : SubtargetFeature<"scalar-dwordx3-loads", + "HasScalarDwordx3Loads", + "true", + "Has 96-bit scalar load instructions" +>; + def FeatureScalarStores : SubtargetFeature<"scalar-stores", "HasScalarStores", "true", @@ 
-454,10 +478,16 @@ def FeatureDPP8 : SubtargetFeature<"dpp8", "Support DPP8 (Data Parallel Primitives) extension" >; -def Feature64BitDPP : SubtargetFeature<"dpp-64bit", - "Has64BitDPP", +def FeatureDPALU_DPP : SubtargetFeature<"dpp-64bit", + "HasDPALU_DPP", "true", - "Support DPP (Data Parallel Primitives) extension" + "Support DPP (Data Parallel Primitives) extension in DP ALU" +>; + +def FeatureDPPSrc1SGPR : SubtargetFeature<"dpp-src1-sgpr", + "HasDPPSrc1SGPR", + "true", + "Support SGPR for Src1 of DPP instructions" >; def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops", @@ -675,6 +705,13 @@ def FeatureAtomicGlobalPkAddBF16Inst : SubtargetFeature<"atomic-global-pk-add-bf [FeatureFlatGlobalInsts] >; +def FeatureAtomicCSubNoRtnInsts : SubtargetFeature<"atomic-csub-no-rtn-insts", + "HasAtomicCSubNoRtnInsts", + "true", + "Has buffer_atomic_csub and global_atomic_csub instructions that don't " + "return original value" +>; + def FeatureFlatAtomicFaddF32Inst : SubtargetFeature<"flat-atomic-fadd-f32-inst", "HasFlatAtomicFaddF32Inst", @@ -773,6 +810,30 @@ def FeatureForceStoreSC0SC1 : SubtargetFeature<"force-store-sc0-sc1", "Has SC0 and SC1 on stores" >; +def FeatureSALUFloatInsts : SubtargetFeature<"salu-float", + "HasSALUFloatInsts", + "true", + "Has SALU floating point instructions" +>; + +def FeatureVGPRSingleUseHintInsts : SubtargetFeature<"vgpr-singleuse-hint", + "HasVGPRSingleUseHintInsts", + "true", + "Has single-use VGPR hint instructions" +>; + +def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans", + "HasPseudoScalarTrans", + "true", + "Has Pseudo Scalar Transcendental instructions" +>; + +def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset", + "HasRestrictedSOffset", + "true", + "Has restricted SOffset (immediate not supported)." +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -872,6 +933,12 @@ def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range", "Requires use of fract on arguments to trig instructions" >; +def FeatureKernargPreload : SubtargetFeature <"kernarg-preload", + "KernargPreload", + "true", + "Hardware supports preloading of kernel arguments in user SGPRs." +>; + // Alignment enforcement is controlled by a configuration register: // SH_MEM_CONFIG.alignment_mode def FeatureUnalignedAccessMode : SubtargetFeature<"unaligned-access-mode", @@ -899,6 +966,18 @@ def FeatureArchitectedSGPRs : SubtargetFeature<"architected-sgprs", "Enable the architected SGPRs" >; +def FeatureGDS : SubtargetFeature<"gds", + "HasGDS", + "true", + "Has Global Data Share" +>; + +def FeatureGWS : SubtargetFeature<"gws", + "HasGWS", + "true", + "Has Global Wave Sync" +>; + // Dummy feature used to disable assembler instructions. 
def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", @@ -917,7 +996,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, - FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts + FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts, + FeatureGDS, FeatureGWS ] >; @@ -928,7 +1008,7 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess, - FeatureImageInsts + FeatureImageInsts, FeatureGDS, FeatureGWS ] >; @@ -943,7 +1023,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32, - FeatureUnalignedBufferAccess, FeatureImageInsts + FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS ] >; @@ -961,7 +1041,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureNegativeScratchOffsetBug + FeatureNegativeScratchOffsetBug, FeatureGWS ] >; @@ -980,7 +1060,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts + FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts, + FeatureGDS, FeatureGWS ] >; @@ -999,7 +1080,28 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess + FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS, + FeatureGWS + ] +>; + +def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", + "gfx12", + [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, + FeatureFlatAddressSpace, Feature16BitInsts, + FeatureInv2PiInlineImm, FeatureApertureRegs, + FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts, + FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts, + FeatureGFX11Insts, FeatureGFX12Insts, FeatureVOP3P, FeatureVOPD, + FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, + FeatureAddNoCarryInsts, FeatureFmaMixInsts, + FeatureNoSdstCMPX, FeatureVscnt, + FeatureVOP3Literal, FeatureDPP8, + FeatureNoDataDepHazard, FeaturePkFmacF16Inst, + FeatureA16, FeatureFastDenormalF32, FeatureG16, + FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS, + FeatureGWS, FeatureTrue16BitInsts ] >; @@ -1104,28 +1206,32 @@ def FeatureISAVersion9_0_MI_Common : FeatureSet< def FeatureISAVersion9_0_0 : FeatureSet< 
!listconcat(FeatureISAVersion9_0_Common.Features, - [FeatureMadMixInsts, + [FeatureGDS, + FeatureMadMixInsts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureImageGather4D16Bug])>; def FeatureISAVersion9_0_2 : FeatureSet< !listconcat(FeatureISAVersion9_0_Common.Features, - [FeatureMadMixInsts, + [FeatureGDS, + FeatureMadMixInsts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureImageGather4D16Bug])>; def FeatureISAVersion9_0_4 : FeatureSet< !listconcat(FeatureISAVersion9_0_Common.Features, - [FeatureDsSrc2Insts, + [FeatureGDS, + FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFmaMixInsts, FeatureImageGather4D16Bug])>; def FeatureISAVersion9_0_6 : FeatureSet< !listconcat(FeatureISAVersion9_0_Common.Features, - [HalfRate64Ops, + [FeatureGDS, + HalfRate64Ops, FeatureFmaMixInsts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, @@ -1139,7 +1245,8 @@ def FeatureISAVersion9_0_6 : FeatureSet< def FeatureISAVersion9_0_8 : FeatureSet< !listconcat(FeatureISAVersion9_0_MI_Common.Features, - [HalfRate64Ops, + [FeatureGDS, + HalfRate64Ops, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureAtomicBufferGlobalPkAddF16NoRtnInsts, @@ -1148,7 +1255,8 @@ def FeatureISAVersion9_0_8 : FeatureSet< def FeatureISAVersion9_0_9 : FeatureSet< !listconcat(FeatureISAVersion9_0_Common.Features, - [FeatureMadMixInsts, + [FeatureGDS, + FeatureMadMixInsts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureImageInsts, @@ -1158,17 +1266,19 @@ def FeatureISAVersion9_0_A : FeatureSet< !listconcat(FeatureISAVersion9_0_MI_Common.Features, [FeatureGFX90AInsts, FeatureFmacF64Inst, - Feature64BitDPP, + FeatureDPALU_DPP, FeaturePackedFP32Ops, FeatureAtomicFaddRtnInsts, FeatureAtomicBufferGlobalPkAddF16Insts, FeaturePackedTID, FullRate64Ops, - FeatureBackOffBarrier])>; + FeatureBackOffBarrier, + FeatureKernargPreload])>; def FeatureISAVersion9_0_C : FeatureSet< !listconcat(FeatureISAVersion9_0_Common.Features, - [FeatureMadMixInsts, + [FeatureGDS, + FeatureMadMixInsts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureImageGather4D16Bug])>; @@ -1191,7 +1301,7 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeatureDot10Insts, FeatureAtomicDsPkAdd16Insts, FeatureAtomicFlatPkAdd16Insts, - Feature64BitDPP, + FeatureDPALU_DPP, FeaturePackedFP32Ops, FeatureMAIInsts, FeatureFP8Insts, @@ -1205,7 +1315,8 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeaturePackedTID, FeatureArchitectedFlatScratch, FullRate64Ops, - FeatureBackOffBarrier]>; + FeatureBackOffBarrier, + FeatureKernargPreload]>; def FeatureISAVersion9_4_0 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, @@ -1313,7 +1424,8 @@ def FeatureISAVersion11_Common : FeatureSet< def FeatureISAVersion11_0_Common : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, - [FeatureVALUTransUseHazard])>; + [FeatureMSAALoadDstSelBug, + FeatureVALUTransUseHazard])>; def FeatureISAVersion11_0_0 : FeatureSet< !listconcat(FeatureISAVersion11_0_Common.Features, @@ -1334,11 +1446,44 @@ def FeatureISAVersion11_0_3 : FeatureSet< def FeatureISAVersion11_5_0 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, - [])>; + [FeatureSALUFloatInsts, + FeatureDPPSrc1SGPR, + FeatureVGPRSingleUseHintInsts])>; def FeatureISAVersion11_5_1 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, - [FeatureGFX11FullVGPRs])>; + [FeatureSALUFloatInsts, + FeatureDPPSrc1SGPR, + FeatureVGPRSingleUseHintInsts, + FeatureGFX11FullVGPRs])>; + +def FeatureISAVersion12 : FeatureSet< + [FeatureGFX12, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot5Insts, + 
FeatureDot7Insts, + FeatureDot8Insts, + FeatureDot9Insts, + FeatureDot10Insts, + FeatureNSAEncoding, + FeaturePartialNSAEncoding, + FeatureWavefrontSize32, + FeatureShaderCyclesRegister, + FeatureArchitectedFlatScratch, + FeatureAtomicFaddRtnInsts, + FeatureAtomicFaddNoRtnInsts, + FeatureFlatAtomicFaddF32Inst, + FeatureImageInsts, + FeatureExtendedImageInsts, + FeaturePackedTID, + FeatureVcmpxPermlaneHazard, + FeatureSALUFloatInsts, + FeaturePseudoScalarTrans, + FeatureHasRestrictedSOffset, + FeatureVGPRSingleUseHintInsts, + FeatureMADIntraFwdBug, + FeatureScalarDwordx3Loads]>; //===----------------------------------------------------------------------===// @@ -1494,6 +1639,10 @@ def isGFX6GFX7GFX8GFX9GFX10 : "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, AssemblerPredicate<(all_of (not FeatureGFX11Insts))>; +def isNotGFX12Plus : + Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::GFX11">, + AssemblerPredicate<(all_of (not FeatureGFX12Insts))>; + def isGFX7GFX8GFX9GFX10 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" @@ -1501,6 +1650,13 @@ def isGFX7GFX8GFX9GFX10 : "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, AssemblerPredicate<(all_of FeatureCIInsts, (not FeatureGFX11Insts))>; +def isGFX8GFX9GFX10GFX11 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9 ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10 ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">, + AssemblerPredicate<(all_of FeatureGFX8Insts, (not FeatureGFX12Insts))>; + def isGFX7Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<(all_of FeatureCIInsts)>; @@ -1573,6 +1729,11 @@ def isGFX10Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">, AssemblerPredicate<(all_of FeatureGFX10Insts)>; +def isGFX10GFX11 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10 ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">, + AssemblerPredicate<(all_of FeatureGFX10Insts, (not FeatureGFX12Insts))>; + def isGFX10Before1030 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10 &&" "!Subtarget->hasGFX10_3Insts()">, @@ -1591,12 +1752,20 @@ def isGFX8GFX9GFX10 : def isGFX11Only : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">, - AssemblerPredicate<(all_of FeatureGFX11Insts)>; + AssemblerPredicate<(all_of FeatureGFX11Insts, (not FeatureGFX12Insts))>; def isGFX11Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11">, AssemblerPredicate<(all_of FeatureGFX11Insts)>; +def isGFX12Only : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12">, + AssemblerPredicate<(all_of FeatureGFX12Insts)>; + +def isGFX12Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">, + AssemblerPredicate<(all_of FeatureGFX12Insts)>; + def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, AssemblerPredicate<(all_of FeatureFlatAddressSpace)>; @@ -1625,6 +1794,11 @@ def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">, def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">, AssemblerPredicate<(all_of (not FeatureUnpackedD16VMem))>; +def HasRestrictedSOffset : Predicate<"Subtarget->hasRestrictedSOffset()">, + AssemblerPredicate<(all_of FeatureHasRestrictedSOffset)>; +def HasUnrestrictedSOffset : 
Predicate<"!Subtarget->hasRestrictedSOffset()">, + AssemblerPredicate<(all_of (not FeatureHasRestrictedSOffset))>; + def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">, AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureSRAMECC))>; @@ -1650,6 +1824,15 @@ def HasTrue16BitInsts : Predicate<"Subtarget->hasTrue16BitInsts()">, AssemblerPredicate<(all_of FeatureTrue16BitInsts)>; def NotHasTrue16BitInsts : Predicate<"!Subtarget->hasTrue16BitInsts()">; +// Control use of True16 instructions. The real True16 instructions are +// True16 instructions as they are defined in the ISA. Fake True16 +// instructions have the same encoding as real ones but syntactically +// only allow 32-bit registers in operands and use low halves thereof. +def UseRealTrue16Insts : Predicate<"Subtarget->useRealTrue16Insts()">, + AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts)>; +def UseFakeTrue16Insts : Predicate<"Subtarget->hasTrue16BitInsts() && " + "!Subtarget->useRealTrue16Insts()">; + def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<(all_of FeatureVOP3P)>; @@ -1677,12 +1860,15 @@ def HasDPP : Predicate<"Subtarget->hasDPP()">, def HasDPP8 : Predicate<"Subtarget->hasDPP8()">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP8)>; -def Has64BitDPP : Predicate<"Subtarget->has64BitDPP()">, - AssemblerPredicate<(all_of Feature64BitDPP)>; +def HasDPALU_DPP : Predicate<"Subtarget->hasDPALU_DPP()">, + AssemblerPredicate<(all_of FeatureDPALU_DPP)>; def HasPackedFP32Ops : Predicate<"Subtarget->hasPackedFP32Ops()">, AssemblerPredicate<(all_of FeaturePackedFP32Ops)>; +def HasPkMovB32 : Predicate<"Subtarget->hasPkMovB32()">, + AssemblerPredicate<(all_of FeatureGFX90AInsts)>; + def HasFmaakFmamkF32Insts : Predicate<"Subtarget->hasFmaakFmamkF32Insts()">, AssemblerPredicate<(any_of FeatureGFX10Insts, FeatureGFX940Insts)>; @@ -1836,6 +2022,26 @@ def HasMADIntraFwdBug : Predicate<"Subtarget->hasMADIntraFwdBug()">; def HasNotMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">; +def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">, + AssemblerPredicate<(all_of FeatureSALUFloatInsts)>; + +def HasVGPRSingleUseHintInsts : Predicate<"Subtarget->hasVGPRSingleUseHintInsts()">, + AssemblerPredicate<(all_of FeatureVGPRSingleUseHintInsts)>; + +def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">, + AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>; + +def HasGDS : Predicate<"Subtarget->hasGDS()">; + +def HasGWS : Predicate<"Subtarget->hasGWS()">; + +def HasCvtFP8VOP1Bug : Predicate<"Subtarget->hasCvtFP8VOP1Bug()">; +def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">; + +def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">; + +def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">; + // Include AMDGPU TD files include "SISchedule.td" include "GCNProcessors.td" diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index 63942414bf3c..8d3eac686831 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -93,8 +93,7 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, } } - // Forward the query to the next alias analysis. 
- return AAResultBase::alias(LocA, LocB, AAQI, nullptr); + return AliasResult::MayAlias; } ModRefInfo AMDGPUAAResult::getModRefInfoMask(const MemoryLocation &Loc, @@ -111,5 +110,5 @@ ModRefInfo AMDGPUAAResult::getModRefInfoMask(const MemoryLocation &Loc, AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) return ModRefInfo::NoModRef; - return AAResultBase::getModRefInfoMask(Loc, AAQI, IgnoreLocals); + return ModRefInfo::ModRef; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index 1c18cbd855fc..de25f9241a50 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -60,6 +60,7 @@ bool AMDGPUArgumentUsageInfo::doFinalization(Module &M) { return false; } +// TODO: Print preload kernargs? void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { for (const auto &FI : ArgInfoMap) { OS << "Arguments for " << FI.first->getName() << '\n' @@ -148,7 +149,7 @@ AMDGPUFunctionArgInfo::getPreloadedValue( llvm_unreachable("unexpected preloaded value type"); } -constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() { +AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() { AMDGPUFunctionArgInfo AI; AI.PrivateSegmentBuffer = ArgDescriptor::createRegister(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index f595e469f998..42b33c50d9f8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H +#include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/Register.h" #include "llvm/Pass.h" @@ -37,22 +38,19 @@ private: bool IsSet : 1; public: - constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, - bool IsStack = false, bool IsSet = false) - : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} + ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, bool IsStack = false, + bool IsSet = false) + : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} - static constexpr ArgDescriptor createRegister(Register Reg, - unsigned Mask = ~0u) { + static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) { return ArgDescriptor(Reg, Mask, false, true); } - static constexpr ArgDescriptor createStack(unsigned Offset, - unsigned Mask = ~0u) { + static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) { return ArgDescriptor(Offset, Mask, true, true); } - static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg, - unsigned Mask) { + static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); } @@ -94,7 +92,13 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { return OS; } +struct KernArgPreloadDescriptor : public ArgDescriptor { + KernArgPreloadDescriptor() {} + SmallVector<MCRegister> Regs; +}; + struct AMDGPUFunctionArgInfo { + // clang-format off enum PreloadedValue { // SGPRS: PRIVATE_SEGMENT_BUFFER = 0, @@ -117,6 +121,7 @@ struct AMDGPUFunctionArgInfo { WORKITEM_ID_Z = 19, FIRST_VGPR_VALUE = WORKITEM_ID_X }; + // clang-format on // Kernel input registers setup for 
the HSA ABI in allocation order. @@ -151,10 +156,13 @@ struct AMDGPUFunctionArgInfo { ArgDescriptor WorkItemIDY; ArgDescriptor WorkItemIDZ; + // Map the index of preloaded kernel arguments to its descriptor. + SmallDenseMap<int, KernArgPreloadDescriptor> PreloadKernArgs{}; + std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT> getPreloadedValue(PreloadedValue Value) const; - static constexpr AMDGPUFunctionArgInfo fixedABILayout(); + static AMDGPUFunctionArgInfo fixedABILayout(); }; class AMDGPUArgumentUsageInfo : public ImmutablePass { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 7cd8e53e6521..d317a733d433 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -121,26 +121,13 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { TM.getTargetTriple().getOS() != Triple::AMDPAL) return; - if (CodeObjectVersion >= AMDGPU::AMDHSA_COV3) - getTargetStreamer()->EmitDirectiveAMDGCNTarget(); + getTargetStreamer()->EmitDirectiveAMDGCNTarget(); if (TM.getTargetTriple().getOS() == Triple::AMDHSA) HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID()); if (TM.getTargetTriple().getOS() == Triple::AMDPAL) getTargetStreamer()->getPALMetadata()->readFromIR(M); - - if (CodeObjectVersion >= AMDGPU::AMDHSA_COV3) - return; - - // HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2. - if (TM.getTargetTriple().getOS() == Triple::AMDHSA) - getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1); - - // HSA and PAL emit NT_AMD_HSA_ISA_VERSION for code objects v2. - IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU()); - getTargetStreamer()->EmitDirectiveHSACodeObjectISAV2( - Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU"); } void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { @@ -148,8 +135,7 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { if (!IsTargetStreamerInitialized) initTargetStreamer(M); - if (TM.getTargetTriple().getOS() != Triple::AMDHSA || - CodeObjectVersion == AMDGPU::AMDHSA_COV2) + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) getTargetStreamer()->EmitISAVersion(); // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). @@ -162,20 +148,6 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { } } -bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( - const MachineBasicBlock *MBB) const { - if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB)) - return false; - - if (MBB->empty()) - return true; - - // If this is a block implementing a long branch, an expression relative to - // the start of the block is needed. to the start of the block. - // XXX - Is there a smarter way to check this? 
- return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64); -} - void AMDGPUAsmPrinter::emitFunctionBodyStart() { const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); @@ -209,7 +181,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() { if (!MFI.isEntryFunction()) return; - if ((STM.isMesaKernel(F) || CodeObjectVersion == AMDGPU::AMDHSA_COV2) && + if (STM.isMesaKernel(F) && (F.getCallingConv() == CallingConv::AMDGPU_KERNEL || F.getCallingConv() == CallingConv::SPIR_KERNEL)) { amd_kernel_code_t KernelCode; @@ -219,6 +191,11 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() { if (STM.isAmdHsaOS()) HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo); + + if (MFI.getNumKernargPreloadedSGPRs() > 0) { + assert(AMDGPU::hasKernargPreload(STM)); + getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI()); + } } void AMDGPUAsmPrinter::emitFunctionBodyEnd() { @@ -226,8 +203,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { if (!MFI.isEntryFunction()) return; - if (TM.getTargetTriple().getOS() != Triple::AMDHSA || - CodeObjectVersion == AMDGPU::AMDHSA_COV2) + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) return; auto &Streamer = getTargetStreamer()->getStreamer(); @@ -260,9 +236,23 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { Streamer.popSection(); } +void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const { + Register RegNo = MI->getOperand(0).getReg(); + + SmallString<128> Str; + raw_svector_ostream OS(Str); + OS << "implicit-def: " + << printReg(RegNo, MF->getSubtarget().getRegisterInfo()); + + if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL) + OS << " : SGPR spill to VGPR lane"; + + OutStreamer->AddComment(OS.str()); + OutStreamer->addBlankLine(); +} + void AMDGPUAsmPrinter::emitFunctionEntryLabel() { - if (TM.getTargetTriple().getOS() == Triple::AMDHSA && - CodeObjectVersion >= AMDGPU::AMDHSA_COV3) { + if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { AsmPrinter::emitFunctionEntryLabel(); return; } @@ -337,12 +327,6 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) { if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { switch (CodeObjectVersion) { - case AMDGPU::AMDHSA_COV2: - HSAMetadataStream.reset(new HSAMD::MetadataStreamerYamlV2()); - break; - case AMDGPU::AMDHSA_COV3: - HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3()); - break; case AMDGPU::AMDHSA_COV4: HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4()); break; @@ -393,28 +377,29 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( const MachineFunction &MF) const { const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); uint16_t KernelCodeProperties = 0; + const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); - if (MFI.hasPrivateSegmentBuffer()) { + if (UserSGPRInfo.hasPrivateSegmentBuffer()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; } - if (MFI.hasDispatchPtr()) { + if (UserSGPRInfo.hasDispatchPtr()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; } - if (MFI.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) { + if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; } - if (MFI.hasKernargSegmentPtr()) { + if (UserSGPRInfo.hasKernargSegmentPtr()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; } - if (MFI.hasDispatchID()) { + if 
(UserSGPRInfo.hasDispatchID()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; } - if (MFI.hasFlatScratchInit()) { + if (UserSGPRInfo.hasFlatScratchInit()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; } @@ -435,12 +420,13 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( const SIProgramInfo &PI) const { const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const Function &F = MF.getFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); amdhsa::kernel_descriptor_t KernelDescriptor; memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor)); assert(isUInt<32>(PI.ScratchSize)); - assert(isUInt<32>(PI.getComputePGMRSrc1())); + assert(isUInt<32>(PI.getComputePGMRSrc1(STM))); assert(isUInt<32>(PI.getComputePGMRSrc2())); KernelDescriptor.group_segment_fixed_size = PI.LDSSize; @@ -449,7 +435,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( Align MaxKernArgAlign; KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); - KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(); + KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM); KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(); KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); @@ -458,6 +444,10 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A; + if (AMDGPU::hasKernargPreload(STM)) + KernelDescriptor.kernarg_preload = + static_cast<uint16_t>(Info->getNumKernargPreloadedSGPRs()); + return KernelDescriptor; } @@ -949,6 +939,17 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize, ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU); + const auto [MinWEU, MaxWEU] = + AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true); + if (ProgInfo.Occupancy < MinWEU) { + DiagnosticInfoOptimizationFailure Diag( + F, F.getSubprogram(), + "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in " + "'" + + F.getName() + "': desired occupancy was " + Twine(MinWEU) + + ", final occupancy is " + Twine(ProgInfo.Occupancy)); + F.getContext().diagnose(Diag); + } } static unsigned getRsrcReg(CallingConv::ID CallConv) { @@ -973,7 +974,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1); - OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1()); + OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1(STM)); OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2); OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2()); @@ -1037,7 +1038,7 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU); if (MD->getPALMajorVersion() < 3) { - MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC)); + MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM)); if (AMDGPU::isCompute(CC)) { MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2()); } else { @@ -1113,17 +1114,19 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) { auto *MD = getTargetStreamer()->getPALMetadata(); const MachineFrameInfo &MFI = 
MF.getFrameInfo(); - MD->setFunctionScratchSize(MF, MFI.getStackSize()); + StringRef FnName = MF.getFunction().getName(); + MD->setFunctionScratchSize(FnName, MFI.getStackSize()); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); // Set compute registers MD->setRsrc1(CallingConv::AMDGPU_CS, - CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS)); + CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST)); MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2()); // Set optional info - MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize); - MD->setFunctionNumUsedVgprs(MF, CurrentProgramInfo.NumVGPRsForWavesPerEU); - MD->setFunctionNumUsedSgprs(MF, CurrentProgramInfo.NumSGPRsForWavesPerEU); + MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize); + MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU); + MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU); } // This is supposed to be log2(Size) @@ -1153,7 +1156,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, AMDGPU::initDefaultAMDKernelCodeT(Out, &STM); Out.compute_pgm_resource_registers = - CurrentProgramInfo.getComputePGMRSrc1() | + CurrentProgramInfo.getComputePGMRSrc1(STM) | (CurrentProgramInfo.getComputePGMRSrc2() << 32); Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; @@ -1164,27 +1167,28 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, getElementByteSizeValue(STM.getMaxPrivateElementSize(true))); - if (MFI->hasPrivateSegmentBuffer()) { + const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo(); + if (UserSGPRInfo.hasPrivateSegmentBuffer()) { Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; } - if (MFI->hasDispatchPtr()) + if (UserSGPRInfo.hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; - if (MFI->hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) + if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; - if (MFI->hasKernargSegmentPtr()) + if (UserSGPRInfo.hasKernargSegmentPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; - if (MFI->hasDispatchID()) + if (UserSGPRInfo.hasDispatchID()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; - if (MFI->hasFlatScratchInit()) + if (UserSGPRInfo.hasFlatScratchInit()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; - if (MFI->hasDispatchPtr()) + if (UserSGPRInfo.hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; if (STM.isXNACKEnabled()) @@ -1293,6 +1297,9 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks( EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR); EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]", CurrentProgramInfo.ScratchSize); + StringRef DynamicStackStr = + CurrentProgramInfo.DynamicCallStack ? 
"True" : "False"; + EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr); EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]", CurrentProgramInfo.Occupancy); EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill", diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index d490209ce35e..79326cd3d328 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -116,6 +116,8 @@ public: void emitFunctionBodyEnd() override; + void emitImplicitDef(const MachineInstr *MI) const override; + void emitFunctionEntryLabel() override; void emitBasicBlockStart(const MachineBasicBlock &MBB) override; @@ -126,9 +128,6 @@ public: void emitEndOfAsmFile(Module &M) override; - bool isBlockOnlyReachableByFallthrough( - const MachineBasicBlock *MBB) const override; - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 9795928094f4..9ba74a23e8af 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -202,9 +202,18 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) { case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: + case AtomicRMWInst::FAdd: + case AtomicRMWInst::FSub: + case AtomicRMWInst::FMax: + case AtomicRMWInst::FMin: break; } + // Only 32-bit floating point atomic ops are supported. + if (AtomicRMWInst::isFPOperation(Op) && !I.getType()->isFloatTy()) { + return; + } + const unsigned PtrIdx = 0; const unsigned ValIdx = 1; @@ -344,8 +353,12 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op, llvm_unreachable("Unhandled atomic op"); case AtomicRMWInst::Add: return B.CreateBinOp(Instruction::Add, LHS, RHS); + case AtomicRMWInst::FAdd: + return B.CreateFAdd(LHS, RHS); case AtomicRMWInst::Sub: return B.CreateBinOp(Instruction::Sub, LHS, RHS); + case AtomicRMWInst::FSub: + return B.CreateFSub(LHS, RHS); case AtomicRMWInst::And: return B.CreateBinOp(Instruction::And, LHS, RHS); case AtomicRMWInst::Or: @@ -365,6 +378,10 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op, case AtomicRMWInst::UMin: Pred = CmpInst::ICMP_ULT; break; + case AtomicRMWInst::FMax: + return B.CreateMaxNum(LHS, RHS); + case AtomicRMWInst::FMin: + return B.CreateMinNum(LHS, RHS); } Value *Cond = B.CreateICmp(Pred, LHS, RHS); return B.CreateSelect(Cond, LHS, RHS); @@ -376,10 +393,11 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, Value *const Identity) const { - Type *const Ty = V->getType(); + Type *AtomicTy = V->getType(); + Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits()); Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); // Reduce within each row of 16 lanes. for (unsigned Idx = 0; Idx < 4; Idx++) { @@ -392,39 +410,47 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, // Reduce within each pair of rows (i.e. 32 lanes). 
assert(ST->hasPermLaneX16()); - V = buildNonAtomicBinOp( - B, Op, V, - B.CreateIntrinsic( - Intrinsic::amdgcn_permlanex16, {}, - {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()})); - - if (ST->isWave32()) + V = B.CreateBitCast(V, IntNTy); + Value *Permlanex16Call = B.CreateIntrinsic( + Intrinsic::amdgcn_permlanex16, {}, + {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); + V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), + B.CreateBitCast(Permlanex16Call, AtomicTy)); + if (ST->isWave32()) { return V; + } if (ST->hasPermLane64()) { // Reduce across the upper and lower 32 lanes. - return buildNonAtomicBinOp( - B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V)); + V = B.CreateBitCast(V, IntNTy); + Value *Permlane64Call = + B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V); + return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), + B.CreateBitCast(Permlane64Call, AtomicTy)); } // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and // combine them with a scalar operation. Function *ReadLane = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); - Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)}); - Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)}); - return buildNonAtomicBinOp(B, Op, Lane0, Lane32); + V = B.CreateBitCast(V, IntNTy); + Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)}); + Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)}); + return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy), + B.CreateBitCast(Lane32, AtomicTy)); } // Use the builder to create an inclusive scan of V across the wavefront, with // all lanes active. Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, - Value *const Identity) const { - Type *const Ty = V->getType(); + Value *Identity) const { + Type *AtomicTy = V->getType(); + Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits()); + Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); for (unsigned Idx = 0; Idx < 4; Idx++) { V = buildNonAtomicBinOp( @@ -452,23 +478,29 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes // 48..63). assert(ST->hasPermLaneX16()); - Value *const PermX = B.CreateIntrinsic( + V = B.CreateBitCast(V, IntNTy); + Value *PermX = B.CreateIntrinsic( Intrinsic::amdgcn_permlanex16, {}, {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}); - V = buildNonAtomicBinOp( - B, Op, V, - B.CreateCall(UpdateDPP, - {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID), - B.getInt32(0xa), B.getInt32(0xf), B.getFalse()})); + + Value *UpdateDPPCall = + B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy), + B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa), + B.getInt32(0xf), B.getFalse()}); + V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall); + if (!ST->isWave32()) { // Combine lane 31 into lanes 32..63. 
+ V = B.CreateBitCast(V, IntNTy); Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, B.getInt32(31)}); - V = buildNonAtomicBinOp( - B, Op, V, - B.CreateCall(UpdateDPP, - {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID), - B.getInt32(0xc), B.getInt32(0xf), B.getFalse()})); + + Value *UpdateDPPCall = B.CreateCall( + UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID), + B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}); + + V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), + UpdateDPPCall); } } return V; @@ -477,12 +509,13 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, // Use the builder to create a shift right of V across the wavefront, with all // lanes active, to turn an inclusive scan into an exclusive scan. Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, - Value *const Identity) const { - Type *const Ty = V->getType(); + Value *Identity) const { + Type *AtomicTy = V->getType(); + Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits()); + Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); - + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); if (ST->hasDPPWavefrontShifts()) { // GFX9 has DPP wavefront shift operations. V = B.CreateCall(UpdateDPP, @@ -502,19 +535,24 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); // Copy the old lane 15 to the new lane 16. - V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}), - B.getInt32(16), V}); - + V = B.CreateCall( + WriteLane, + {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}), + B.getInt32(16), B.CreateBitCast(V, IntNTy)}); + V = B.CreateBitCast(V, AtomicTy); if (!ST->isWave32()) { // Copy the old lane 31 to the new lane 32. - V = B.CreateCall( - WriteLane, - {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V}); + V = B.CreateBitCast(V, IntNTy); + V = B.CreateCall(WriteLane, + {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), + B.getInt32(31)}), + B.getInt32(32), V}); // Copy the old lane 47 to the new lane 48. V = B.CreateCall( WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V}); + V = B.CreateBitCast(V, AtomicTy); } } @@ -529,7 +567,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively( IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V, Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const { - auto *Ty = I.getType(); auto *WaveTy = B.getIntNTy(ST->getWavefrontSize()); auto *EntryBB = I.getParent(); @@ -554,18 +591,25 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively( // Use llvm.cttz instrinsic to find the lowest remaining active lane. 
auto *FF1 = B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()}); - auto *LaneIdxInt = B.CreateTrunc(FF1, Ty); + + Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits()); + auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy); // Get the value required for atomic operation - auto *LaneValue = + V = B.CreateBitCast(V, IntNTy); + Value *LaneValue = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt}); + LaneValue = B.CreateBitCast(LaneValue, Ty); // Perform writelane if intermediate scan results are required later in the // kernel computations Value *OldValue = nullptr; if (NeedResult) { - OldValue = B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {}, - {Accumulator, LaneIdxInt, OldValuePhi}); + OldValue = + B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {}, + {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt, + B.CreateBitCast(OldValuePhi, IntNTy)}); + OldValue = B.CreateBitCast(OldValue, Ty); OldValuePhi->addIncoming(OldValue, ComputeLoop); } @@ -590,8 +634,10 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively( return {OldValue, NewAccumulator}; } -static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op, - unsigned BitWidth) { +static Constant *getIdentityValueForAtomicOp(Type *const Ty, + AtomicRMWInst::BinOp Op) { + LLVMContext &C = Ty->getContext(); + const unsigned BitWidth = Ty->getPrimitiveSizeInBits(); switch (Op) { default: llvm_unreachable("Unhandled atomic op"); @@ -600,14 +646,22 @@ static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op, case AtomicRMWInst::Or: case AtomicRMWInst::Xor: case AtomicRMWInst::UMax: - return APInt::getMinValue(BitWidth); + return ConstantInt::get(C, APInt::getMinValue(BitWidth)); case AtomicRMWInst::And: case AtomicRMWInst::UMin: - return APInt::getMaxValue(BitWidth); + return ConstantInt::get(C, APInt::getMaxValue(BitWidth)); case AtomicRMWInst::Max: - return APInt::getSignedMinValue(BitWidth); + return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth)); case AtomicRMWInst::Min: - return APInt::getSignedMaxValue(BitWidth); + return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth)); + case AtomicRMWInst::FAdd: + return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true)); + case AtomicRMWInst::FSub: + return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false)); + case AtomicRMWInst::FMin: + return ConstantFP::get(C, APFloat::getInf(Ty->getFltSemantics(), false)); + case AtomicRMWInst::FMax: + return ConstantFP::get(C, APFloat::getInf(Ty->getFltSemantics(), true)); } } @@ -623,6 +677,10 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // Start building just before the instruction. IRBuilder<> B(&I); + if (AtomicRMWInst::isFPOperation(Op)) { + B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP)); + } + // If we are in a pixel shader, because of how we have to mask out helper // lane invocations, we need to record the entry and exit BB's. 
BasicBlock *PixelEntryBB = nullptr; @@ -649,12 +707,15 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, } Type *const Ty = I.getType(); + Type *Int32Ty = B.getInt32Ty(); + Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits()); + bool isAtomicFloatingPointTy = Ty->isFloatingPointTy(); const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty); - auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2); + auto *const VecTy = FixedVectorType::get(Int32Ty, 2); // This is the value in the atomic operation we need to combine in order to // reduce the number of atomic operations. - Value *const V = I.getOperand(ValIdx); + Value *V = I.getOperand(ValIdx); // We need to know how many lanes are active within the wavefront, and we do // this by doing a ballot of active lanes. @@ -671,39 +732,47 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {Ballot, B.getInt32(0)}); } else { - Value *const BitCast = B.CreateBitCast(Ballot, VecTy); - Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0)); - Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1)); + Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty); + Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty); Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)}); Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt}); } - Mbcnt = B.CreateIntCast(Mbcnt, Ty, false); - Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth)); + Function *F = I.getFunction(); + LLVMContext &C = F->getContext(); + + // For atomic sub, perform scan with add operation and allow one lane to + // subtract the reduced value later. + AtomicRMWInst::BinOp ScanOp = Op; + if (Op == AtomicRMWInst::Sub) { + ScanOp = AtomicRMWInst::Add; + } else if (Op == AtomicRMWInst::FSub) { + ScanOp = AtomicRMWInst::FAdd; + } + Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp); Value *ExclScan = nullptr; Value *NewV = nullptr; const bool NeedResult = !I.use_empty(); - Function *F = I.getFunction(); - LLVMContext &C = F->getContext(); BasicBlock *ComputeLoop = nullptr; BasicBlock *ComputeEnd = nullptr; // If we have a divergent value in each lane, we need to combine the value // using DPP. if (ValDivergent) { - const AtomicRMWInst::BinOp ScanOp = - Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; if (ScanImpl == ScanOptions::DPP) { // First we need to set all inactive invocations to the identity value, so // that they can correctly contribute to the final result. - NewV = - B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - const AtomicRMWInst::BinOp ScanOp = - Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; + V = B.CreateBitCast(V, IntNTy); + Identity = B.CreateBitCast(Identity, IntNTy); + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy, + {V, Identity}); + NewV = B.CreateBitCast(NewV, Ty); + V = B.CreateBitCast(V, Ty); + Identity = B.CreateBitCast(Identity, Ty); if (!NeedResult && ST->hasPermLaneX16()) { // On GFX10 the permlanex16 instruction helps us build a reduction // without too many readlanes and writelanes, which are generally bad @@ -718,8 +787,10 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // which we will provide to the atomic operation. 
Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); assert(TyBitWidth == 32); + NewV = B.CreateBitCast(NewV, IntNTy); NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {NewV, LastLaneIdx}); + NewV = B.CreateBitCast(NewV, Ty); } // Finally mark the readlanes in the WWM section. NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); @@ -746,13 +817,22 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, NewV = buildMul(B, V, Ctpop); break; } - + case AtomicRMWInst::FAdd: + case AtomicRMWInst::FSub: { + Value *const Ctpop = B.CreateIntCast( + B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false); + Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty); + NewV = B.CreateFMul(V, CtpopFP); + break; + } case AtomicRMWInst::And: case AtomicRMWInst::Or: case AtomicRMWInst::Max: case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: + case AtomicRMWInst::FMin: + case AtomicRMWInst::FMax: // These operations with a uniform value are idempotent: doing the atomic // operation multiple times has the same effect as doing it once. NewV = V; @@ -771,7 +851,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // We only want a single lane to enter our new control flow, and we do this // by checking if there are any active lanes below us. Only one lane will // have 0 active lanes below us, so that will be the only one to progress. - Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0)); + Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0)); // Store I's original basic block before we split the block. BasicBlock *const EntryBB = I.getParent(); @@ -840,9 +920,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, Value *BroadcastI = nullptr; if (TyBitWidth == 64) { - Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty()); - Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty()); + Value *const ExtractLo = B.CreateTrunc(PHI, Int32Ty); + Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(PHI, 32), Int32Ty); CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo); CallInst *const ReadFirstLaneHi = @@ -853,8 +932,11 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1)); BroadcastI = B.CreateBitCast(Insert, Ty); } else if (TyBitWidth == 32) { + Value *CastedPhi = B.CreateBitCast(PHI, IntNTy); + BroadcastI = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, CastedPhi); + BroadcastI = B.CreateBitCast(BroadcastI, Ty); - BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); } else { llvm_unreachable("Unhandled atomic bit width"); } @@ -874,6 +956,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, llvm_unreachable("Atomic Optimzer is disabled for None strategy"); } } else { + Mbcnt = isAtomicFloatingPointTy ? 
B.CreateUIToFP(Mbcnt, Ty) + : B.CreateIntCast(Mbcnt, Ty, false); switch (Op) { default: llvm_unreachable("Unhandled atomic op"); @@ -887,18 +971,25 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: + case AtomicRMWInst::FMin: + case AtomicRMWInst::FMax: LaneOffset = B.CreateSelect(Cond, Identity, V); break; case AtomicRMWInst::Xor: LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1)); break; + case AtomicRMWInst::FAdd: + case AtomicRMWInst::FSub: { + LaneOffset = B.CreateFMul(V, Mbcnt); + break; + } } } Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset); if (IsPixelShader) { // Need a final PHI to reconverge to above the helper lane branch mask. - B.SetInsertPoint(PixelExitBB->getFirstNonPHI()); + B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt()); PHINode *const PHI = B.CreatePHI(Ty, 2); PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 57c873f00a4a..5fd9e571282d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -28,6 +28,10 @@ void initializeCycleInfoWrapperPassPass(PassRegistry &); using namespace llvm; +static cl::opt<unsigned> KernargPreloadCount( + "amdgpu-kernarg-preload-count", + cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0)); + #define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS, enum ImplicitArgumentPositions { @@ -914,9 +918,68 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP, llvm_unreachable("AAAMDWavesPerEU is only valid for function position"); } -class AMDGPUAttributor : public ModulePass { +static void addPreloadKernArgHint(Function &F, TargetMachine &TM) { + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); + for (unsigned I = 0; + I < F.arg_size() && + I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs()); + ++I) { + Argument &Arg = *F.getArg(I); + // Check for incompatible attributes. 
+ if (Arg.hasByRefAttr() || Arg.hasNestAttr()) + break; + + Arg.addAttr(Attribute::InReg); + } +} + +static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) { + SetVector<Function *> Functions; + for (Function &F : M) { + if (!F.isIntrinsic()) + Functions.insert(&F); + } + + CallGraphUpdater CGUpdater; + BumpPtrAllocator Allocator; + AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM); + DenseSet<const char *> Allowed( + {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID, + &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, + &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID, + &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID}); + + AttributorConfig AC(CGUpdater); + AC.Allowed = &Allowed; + AC.IsModulePass = true; + AC.DefaultInitializeLiveInternals = false; + AC.IPOAmendableCB = [](const Function &F) { + return F.getCallingConv() == CallingConv::AMDGPU_KERNEL; + }; + + Attributor A(Functions, InfoCache, AC); + + for (Function &F : M) { + if (!F.isIntrinsic()) { + A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F)); + A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F)); + CallingConv::ID CC = F.getCallingConv(); + if (!AMDGPU::isEntryFunctionCC(CC)) { + A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F)); + A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F)); + } else if (CC == CallingConv::AMDGPU_KERNEL) { + addPreloadKernArgHint(F, TM); + } + } + } + + ChangeStatus Change = A.run(); + return Change == ChangeStatus::CHANGED; +} + +class AMDGPUAttributorLegacy : public ModulePass { public: - AMDGPUAttributor() : ModulePass(ID) {} + AMDGPUAttributorLegacy() : ModulePass(ID) {} /// doInitialization - Virtual method overridden by subclasses to do /// any necessary initialization before any pass is run. 
@@ -930,45 +993,8 @@ public: } bool runOnModule(Module &M) override { - SetVector<Function *> Functions; AnalysisGetter AG(this); - for (Function &F : M) { - if (!F.isIntrinsic()) - Functions.insert(&F); - } - - CallGraphUpdater CGUpdater; - BumpPtrAllocator Allocator; - AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM); - DenseSet<const char *> Allowed( - {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID, - &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, - &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID, - &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID}); - - AttributorConfig AC(CGUpdater); - AC.Allowed = &Allowed; - AC.IsModulePass = true; - AC.DefaultInitializeLiveInternals = false; - AC.IPOAmendableCB = [](const Function &F) { - return F.getCallingConv() == CallingConv::AMDGPU_KERNEL; - }; - - Attributor A(Functions, InfoCache, AC); - - for (Function &F : M) { - if (!F.isIntrinsic()) { - A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F)); - A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F)); - if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) { - A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F)); - A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F)); - } - } - } - - ChangeStatus Change = A.run(); - return Change == ChangeStatus::CHANGED; + return runImpl(M, AG, *TM); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -981,11 +1007,25 @@ public: }; } // namespace -char AMDGPUAttributor::ID = 0; +PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M, + ModuleAnalysisManager &AM) { + + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + AnalysisGetter AG(FAM); + + // TODO: Probably preserves CFG + return runImpl(M, AG, TM) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); +} + +char AMDGPUAttributorLegacy::ID = 0; -Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); } -INITIALIZE_PASS_BEGIN(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, - false) +Pass *llvm::createAMDGPUAttributorLegacyPass() { + return new AMDGPUAttributorLegacy(); +} +INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor", + false, false) INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass); -INITIALIZE_PASS_END(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, - false) +INITIALIZE_PASS_END(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor", + false, false) diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 9ba5ea8fb73f..cf2896f80f19 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -32,7 +32,7 @@ namespace { /// Wrapper around extendRegister to ensure we extend to a full 32-bit register. static Register extendRegisterMin32(CallLowering::ValueHandler &Handler, - Register ValVReg, CCValAssign &VA) { + Register ValVReg, const CCValAssign &VA) { if (VA.getLocVT().getSizeInBits() < 32) { // 16-bit types are reported as legal for 32-bit registers. We need to // extend and do a 32-bit copy to avoid the verifier complaining about it. 
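A minimal sketch of the idea behind extendRegisterMin32 in the hunk above, written with toy types rather than the LLVM CallLowering/MIR API: a value narrower than 32 bits is widened first, so the copy into the ABI register is always a full 32-bit copy and the machine verifier sees matching sizes. The helper name, the signed/unsigned flag, and the extension flavor below are illustrative assumptions only.

    #include <cstdint>

    // Hypothetical helper: widen a 16-bit argument to the 32-bit register width
    // before it is copied into a physical register. The real code builds MIR
    // extensions; this only shows the widening rule in ordinary C++.
    uint32_t widenToAbiRegister(uint16_t Val, bool IsSigned) {
      if (IsSigned)
        return static_cast<uint32_t>(
            static_cast<int32_t>(static_cast<int16_t>(Val))); // sign-extend
      return static_cast<uint32_t>(Val);                      // zero-extend
    }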
@@ -56,12 +56,13 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler { } void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy, - MachinePointerInfo &MPO, CCValAssign &VA) override { + const MachinePointerInfo &MPO, + const CCValAssign &VA) override { llvm_unreachable("not implemented"); } void assignValueToReg(Register ValVReg, Register PhysReg, - CCValAssign VA) override { + const CCValAssign &VA) override { Register ExtReg = extendRegisterMin32(*this, ValVReg, VA); // If this is a scalar return, insert a readfirstlane just in case the value @@ -82,9 +83,10 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler { ExtReg = MIRBuilder.buildBitcast(S32, ExtReg).getReg(0); } - auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, - {MRI.getType(ExtReg)}, false) - .addReg(ExtReg); + auto ToSGPR = MIRBuilder + .buildIntrinsic(Intrinsic::amdgcn_readfirstlane, + {MRI.getType(ExtReg)}) + .addReg(ExtReg); ExtReg = ToSGPR.getReg(0); } @@ -116,7 +118,7 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler { } void assignValueToReg(Register ValVReg, Register PhysReg, - CCValAssign VA) override { + const CCValAssign &VA) override { markPhysRegUsed(PhysReg); if (VA.getLocVT().getSizeInBits() < 32) { @@ -136,7 +138,8 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler { } void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy, - MachinePointerInfo &MPO, CCValAssign &VA) override { + const MachinePointerInfo &MPO, + const CCValAssign &VA) override { MachineFunction &MF = MIRBuilder.getMF(); auto MMO = MF.getMachineMemOperand( @@ -228,14 +231,15 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler { } void assignValueToReg(Register ValVReg, Register PhysReg, - CCValAssign VA) override { + const CCValAssign &VA) override { MIB.addUse(PhysReg, RegState::Implicit); Register ExtReg = extendRegisterMin32(*this, ValVReg, VA); MIRBuilder.buildCopy(PhysReg, ExtReg); } void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy, - MachinePointerInfo &MPO, CCValAssign &VA) override { + const MachinePointerInfo &MPO, + const CCValAssign &VA) override { MachineFunction &MF = MIRBuilder.getMF(); uint64_t LocMemOffset = VA.getLocMemOffset(); const auto &ST = MF.getSubtarget<GCNSubtarget>(); @@ -248,7 +252,8 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler { void assignValueToAddress(const CallLowering::ArgInfo &Arg, unsigned ValRegIndex, Register Addr, LLT MemTy, - MachinePointerInfo &MPO, CCValAssign &VA) override { + const MachinePointerInfo &MPO, + const CCValAssign &VA) override { Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt ? extendRegister(Arg.Regs[ValRegIndex], VA) : Arg.Regs[ValRegIndex]; @@ -454,27 +459,28 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
- if (Info.hasPrivateSegmentBuffer()) { + const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); + if (UserSGPRInfo.hasPrivateSegmentBuffer()) { Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass); CCInfo.AllocateReg(PrivateSegmentBufferReg); } - if (Info.hasDispatchPtr()) { + if (UserSGPRInfo.hasDispatchPtr()) { Register DispatchPtrReg = Info.addDispatchPtr(TRI); MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchPtrReg); } const Module *M = MF.getFunction().getParent(); - if (Info.hasQueuePtr() && + if (UserSGPRInfo.hasQueuePtr() && AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); } - if (Info.hasKernargSegmentPtr()) { + if (UserSGPRInfo.hasKernargSegmentPtr()) { MachineRegisterInfo &MRI = MF.getRegInfo(); Register InputPtrReg = Info.addKernargSegmentPtr(TRI); const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); @@ -485,13 +491,13 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, CCInfo.AllocateReg(InputPtrReg); } - if (Info.hasDispatchID()) { + if (UserSGPRInfo.hasDispatchID()) { Register DispatchIDReg = Info.addDispatchID(TRI); MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchIDReg); } - if (Info.hasFlatScratchInit()) { + if (UserSGPRInfo.hasFlatScratchInit()) { Register FlatScratchInitReg = Info.addFlatScratchInit(TRI); MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); @@ -596,15 +602,16 @@ bool AMDGPUCallLowering::lowerFormalArguments( SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext()); + const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo(); - if (Info->hasImplicitBufferPtr()) { + if (UserSGPRInfo.hasImplicitBufferPtr()) { Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI); MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(ImplicitBufferPtrReg); } // FIXME: This probably isn't defined for mesa - if (Info->hasFlatScratchInit() && !Subtarget.isAmdPalOS()) { + if (UserSGPRInfo.hasFlatScratchInit() && !Subtarget.isAmdPalOS()) { Register FlatScratchInitReg = Info->addFlatScratchInit(*TRI); MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); @@ -954,12 +961,18 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) { } static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, - bool IsTailCall, CallingConv::ID CC) { - assert(!(IsIndirect && IsTailCall) && "Indirect calls can't be tail calls, " - "because the address can be divergent"); + bool IsTailCall, bool isWave32, + CallingConv::ID CC) { + // For calls to amdgpu_cs_chain functions, the address is known to be uniform. + assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) && + "Indirect calls can't be tail calls, " + "because the address can be divergent"); if (!IsTailCall) return AMDGPU::G_SI_CALL; + if (AMDGPU::isChainCC(CC)) + return isWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64; + return CC == CallingConv::AMDGPU_Gfx ? 
AMDGPU::SI_TCRETURN_GFX : AMDGPU::SI_TCRETURN; } @@ -1147,14 +1160,20 @@ bool AMDGPUCallLowering::isEligibleForTailCallOptimization( void AMDGPUCallLowering::handleImplicitCallArguments( MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst, const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo, + CallingConv::ID CalleeCC, ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const { if (!ST.enableFlatScratch()) { // Insert copies for the SRD. In the HSA case, this should be an identity // copy. auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32), FuncInfo.getScratchRSrcReg()); - MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); - CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit); + + auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC) + ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51 + : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; + + MIRBuilder.buildCopy(CalleeRSrcReg, ScratchRSrcReg); + CallInst.addReg(CalleeRSrcReg, RegState::Implicit); } for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) { @@ -1186,7 +1205,8 @@ bool AMDGPUCallLowering::lowerTailCall( if (!IsSibCall) CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP); - unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true, CalleeCC); + unsigned Opc = + getCallOpcode(MF, Info.Callee.isReg(), true, ST.isWave32(), CalleeCC); auto MIB = MIRBuilder.buildInstrNoInsert(Opc); if (!addCallTargetOperands(MIB, MIRBuilder, Info)) return false; @@ -1195,8 +1215,27 @@ bool AMDGPUCallLowering::lowerTailCall( // be 0. MIB.addImm(0); - // Tell the call which registers are clobbered. + // If this is a chain call, we need to pass in the EXEC mask. const SIRegisterInfo *TRI = ST.getRegisterInfo(); + if (AMDGPU::isChainCC(Info.CallConv)) { + ArgInfo ExecArg = Info.OrigArgs[1]; + assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC"); + + if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize())) + return false; + + if (auto CI = dyn_cast<ConstantInt>(ExecArg.OrigValue)) { + MIB.addImm(CI->getSExtValue()); + } else { + MIB.addReg(ExecArg.Regs[0]); + unsigned Idx = MIB->getNumOperands() - 1; + MIB->getOperand(Idx).setReg(constrainOperandRegClass( + MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB, + MIB->getDesc(), MIB->getOperand(Idx), Idx)); + } + } + + // Tell the call which registers are clobbered. const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC); MIB.addRegMask(Mask); @@ -1250,7 +1289,8 @@ bool AMDGPUCallLowering::lowerTailCall( // after the ordinary user argument registers. SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs; - if (Info.CallConv != CallingConv::AMDGPU_Gfx) { + if (Info.CallConv != CallingConv::AMDGPU_Gfx && + !AMDGPU::isChainCC(Info.CallConv)) { // With a fixed ABI, allocate fixed registers before user arguments. if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false; @@ -1266,7 +1306,8 @@ bool AMDGPUCallLowering::lowerTailCall( if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder)) return false; - handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs); + handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC, + ImplicitArgRegs); // If we have -tailcallopt, we need to adjust the stack. We'll do the call // sequence start and end here. @@ -1300,8 +1341,62 @@ bool AMDGPUCallLowering::lowerTailCall( return true; } +/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic. 
+bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const { + ArgInfo Callee = Info.OrigArgs[0]; + ArgInfo SGPRArgs = Info.OrigArgs[2]; + ArgInfo VGPRArgs = Info.OrigArgs[3]; + ArgInfo Flags = Info.OrigArgs[4]; + + assert(cast<ConstantInt>(Flags.OrigValue)->isZero() && + "Non-zero flags aren't supported yet."); + assert(Info.OrigArgs.size() == 5 && "Additional args aren't supported yet."); + + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = MF.getFunction(); + const DataLayout &DL = F.getParent()->getDataLayout(); + + // The function to jump to is actually the first argument, so we'll change the + // Callee and other info to match that before using our existing helper. + const Value *CalleeV = Callee.OrigValue->stripPointerCasts(); + if (const Function *F = dyn_cast<Function>(CalleeV)) { + Info.Callee = MachineOperand::CreateGA(F, 0); + Info.CallConv = F->getCallingConv(); + } else { + assert(Callee.Regs.size() == 1 && "Too many regs for the callee"); + Info.Callee = MachineOperand::CreateReg(Callee.Regs[0], false); + Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve + // behaves the same here. + } + + // The function that we're calling cannot be vararg (only the intrinsic is). + Info.IsVarArg = false; + + assert(std::all_of(SGPRArgs.Flags.begin(), SGPRArgs.Flags.end(), + [](ISD::ArgFlagsTy F) { return F.isInReg(); }) && + "SGPR arguments should be marked inreg"); + assert(std::none_of(VGPRArgs.Flags.begin(), VGPRArgs.Flags.end(), + [](ISD::ArgFlagsTy F) { return F.isInReg(); }) && + "VGPR arguments should not be marked inreg"); + + SmallVector<ArgInfo, 8> OutArgs; + splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv); + splitToValueTypes(VGPRArgs, OutArgs, DL, Info.CallConv); + + Info.IsMustTailCall = true; + return lowerTailCall(MIRBuilder, Info, OutArgs); +} + bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const { + if (Function *F = Info.CB->getCalledFunction()) + if (F->isIntrinsic()) { + assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain && + "Unexpected intrinsic"); + return lowerChainCall(MIRBuilder, Info); + } + if (Info.IsVarArg) { LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n"); return false; @@ -1350,11 +1445,15 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Create a temporarily-floating call instruction so we can add the implicit // uses of arg registers. - unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, Info.CallConv); + unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, ST.isWave32(), + Info.CallConv); auto MIB = MIRBuilder.buildInstrNoInsert(Opc); MIB.addDef(TRI->getReturnAddressReg(MF)); + if (!Info.IsConvergent) + MIB.setMIFlag(MachineInstr::NoConvergent); + if (!addCallTargetOperands(MIB, MIRBuilder, Info)) return false; @@ -1389,7 +1488,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs); + handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv, + ImplicitArgRegs); // Get a count of how many bytes are to be pushed on the stack. 
unsigned NumBytes = CCInfo.getStackSize(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h index 569c6d75204d..a6e801f2a547 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -75,10 +75,13 @@ public: void handleImplicitCallArguments( MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst, const GCNSubtarget &ST, const SIMachineFunctionInfo &MFI, + CallingConv::ID CalleeCC, ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const; bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, SmallVectorImpl<ArgInfo> &OutArgs) const; + bool lowerChainCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const; bool lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 2b70665ab95c..9036b26a6f6b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -176,6 +176,10 @@ def CSR_AMDGPU_SI_Gfx_GFX90AInsts : CalleeSavedRegs< (add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs) >; +def CSR_AMDGPU_CS_ChainPreserve : CalleeSavedRegs< + (sequence "VGPR%u", 8, 255) +>; + def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>; // Calling convention for leaf functions @@ -183,6 +187,11 @@ def CC_AMDGPU_Func : CallingConv<[ CCIfByVal<CCPassByVal<4, 4>>, CCIfType<[i1], CCPromoteToType<i32>>, CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>, + + CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg< + !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29 + >>>, + CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, @@ -213,6 +222,16 @@ def CC_AMDGPU : CallingConv<[ CCDelegateTo<CC_AMDGPU_Func>> ]>; +def CC_AMDGPU_CS_CHAIN : CallingConv<[ + CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg< + !foreach(i, !range(105), !cast<Register>("SGPR"#i)) + >>>, + + CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg< + !foreach(i, !range(8, 255), !cast<Register>("VGPR"#i)) + >>> +]>; + // Trivial class to denote when a def is used only to get a RegMask, i.e. // SaveList is ignored and the def is not used as part of any calling // convention. 
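A rough illustration of what the new CC_AMDGPU_CS_CHAIN convention above expresses, in plain C++ rather than TableGen: inreg arguments draw from an SGPR pool starting at SGPR0, while all other arguments draw from a VGPR pool starting at VGPR8 (the same VGPR8-and-up range named by CSR_AMDGPU_CS_ChainPreserve). This is a hedged sketch; the pool limits, overflow handling, and the assignChainArgs/ChainArg names are placeholders, not the real calling-convention machinery.

    #include <string>
    #include <vector>

    struct ChainArg { bool InReg; }; // stand-in for the inreg flag on an argument

    // Toy register assignment mirroring the SGPR/VGPR split in the .td above.
    std::vector<std::string> assignChainArgs(const std::vector<ChainArg> &Args) {
      std::vector<std::string> Locs;
      unsigned NextSGPR = 0; // inreg scalars: SGPR0, SGPR1, ...
      unsigned NextVGPR = 8; // everything else: VGPR8, VGPR9, ...
      for (const ChainArg &A : Args)
        Locs.push_back(A.InReg ? "SGPR" + std::to_string(NextSGPR++)
                               : "VGPR" + std::to_string(NextVGPR++));
      return Locs;
    }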
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 4ec85f3c5588..87b1957c799e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -52,17 +52,17 @@ static cl::opt<bool> Widen16BitOps( cl::init(true)); static cl::opt<bool> - ScalarizeLargePHIs("amdgpu-codegenprepare-break-large-phis", - cl::desc("Break large PHI nodes for DAGISel"), - cl::ReallyHidden, cl::init(true)); + BreakLargePHIs("amdgpu-codegenprepare-break-large-phis", + cl::desc("Break large PHI nodes for DAGISel"), + cl::ReallyHidden, cl::init(true)); static cl::opt<bool> - ForceScalarizeLargePHIs("amdgpu-codegenprepare-force-break-large-phis", - cl::desc("For testing purposes, always break large " - "PHIs even if it isn't profitable."), - cl::ReallyHidden, cl::init(false)); + ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis", + cl::desc("For testing purposes, always break large " + "PHIs even if it isn't profitable."), + cl::ReallyHidden, cl::init(false)); -static cl::opt<unsigned> ScalarizeLargePHIsThreshold( +static cl::opt<unsigned> BreakLargePHIsThreshold( "amdgpu-codegenprepare-break-large-phis-threshold", cl::desc("Minimum type size in bits for breaking large PHI nodes"), cl::ReallyHidden, cl::init(32)); @@ -108,9 +108,31 @@ public: bool HasUnsafeFPMath = false; bool HasFP32DenormalFlush = false; bool FlowChanged = false; + mutable Function *SqrtF32 = nullptr; + mutable Function *LdexpF32 = nullptr; DenseMap<const PHINode *, bool> BreakPhiNodesCache; + Function *getSqrtF32() const { + if (SqrtF32) + return SqrtF32; + + LLVMContext &Ctx = Mod->getContext(); + SqrtF32 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_sqrt, + {Type::getFloatTy(Ctx)}); + return SqrtF32; + } + + Function *getLdexpF32() const { + if (LdexpF32) + return LdexpF32; + + LLVMContext &Ctx = Mod->getContext(); + LdexpF32 = Intrinsic::getDeclaration( + Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)}); + return LdexpF32; + } + bool canBreakPHINode(const PHINode &I); /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to @@ -276,6 +298,8 @@ public: bool IsNegative) const; Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS, FastMathFlags FMF) const; + Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src, + FastMathFlags FMF) const; public: bool visitFDiv(BinaryOperator &I); @@ -290,6 +314,7 @@ public: bool visitIntrinsicInst(IntrinsicInst &I); bool visitBitreverseIntrinsicInst(IntrinsicInst &I); bool visitMinNum(IntrinsicInst &I); + bool visitSqrt(IntrinsicInst &I); bool run(Function &F); }; @@ -319,6 +344,7 @@ public: } // end anonymous namespace bool AMDGPUCodeGenPrepareImpl::run(Function &F) { + BreakPhiNodesCache.clear(); bool MadeChange = false; Function::iterator NextBB; @@ -598,34 +624,6 @@ static Value *insertValues(IRBuilder<> &Builder, return NewVal; } -// Returns 24-bit or 48-bit (as per `NumBits` and `Size`) mul of `LHS` and -// `RHS`. `NumBits` is the number of KnownBits of the result and `Size` is the -// width of the original destination. -static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS, - unsigned Size, unsigned NumBits, bool IsSigned) { - if (Size <= 32 || NumBits <= 32) { - Intrinsic::ID ID = - IsSigned ? 
Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24; - return Builder.CreateIntrinsic(ID, {}, {LHS, RHS}); - } - - assert(NumBits <= 48); - - Intrinsic::ID LoID = - IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24; - Intrinsic::ID HiID = - IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24; - - Value *Lo = Builder.CreateIntrinsic(LoID, {}, {LHS, RHS}); - Value *Hi = Builder.CreateIntrinsic(HiID, {}, {LHS, RHS}); - - IntegerType *I64Ty = Builder.getInt64Ty(); - Lo = Builder.CreateZExtOrTrunc(Lo, I64Ty); - Hi = Builder.CreateZExtOrTrunc(Hi, I64Ty); - - return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32)); -} - bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { if (I.getOpcode() != Instruction::Mul) return false; @@ -665,26 +663,20 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { extractValues(Builder, RHSVals, RHS); IntegerType *I32Ty = Builder.getInt32Ty(); - for (int I = 0, E = LHSVals.size(); I != E; ++I) { - Value *LHS, *RHS; - if (IsSigned) { - LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty); - RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty); - } else { - LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty); - RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty); - } - - Value *Result = - getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned); + IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty; + Type *DstTy = LHSVals[0]->getType(); - if (IsSigned) { - ResultVals.push_back( - Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType())); - } else { - ResultVals.push_back( - Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType())); - } + for (int I = 0, E = LHSVals.size(); I != E; ++I) { + Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty) + : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty); + Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty) + : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty); + Intrinsic::ID ID = + IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24; + Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS}); + Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy) + : Builder.CreateZExtOrTrunc(Result, DstTy); + ResultVals.push_back(Result); } Value *NewVal = insertValues(Builder, Ty, ResultVals); @@ -809,14 +801,10 @@ Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder, // range won't underflow to denormal. The hard part is knowing the // result. We need a range check, the result could be denormal for // 0x1p+126 < den <= 0x1p+127. - - Type *Ty = Src->getType(); - auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src); Value *ScaleFactor = Builder.CreateNeg(FrexpExp); Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant); - return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()}, - {Rcp, ScaleFactor}); + return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor}); } /// Emit a 2ulp expansion for fdiv by using frexp for input scaling. @@ -832,8 +820,6 @@ Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, // We're scaling the LHS to avoid a denormal input, and scale the denominator // to avoid large values underflowing the result. 
- Type *Ty = LHS->getType(); - auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS); Value *Rcp = @@ -845,8 +831,30 @@ Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the // result. Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS); - return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()}, - {Mul, ExpDiff}); + return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff}); +} + +/// Emit a sqrt that handles denormals and is accurate to 2ulp. +Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder, + Value *Src, + FastMathFlags FMF) const { + Type *Ty = Src->getType(); + APFloat SmallestNormal = + APFloat::getSmallestNormalized(Ty->getFltSemantics()); + Value *NeedScale = + Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal)); + + ConstantInt *Zero = Builder.getInt32(0); + Value *InputScaleFactor = + Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero); + + Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor}); + + Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled); + + Value *OutputScaleFactor = + Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero); + return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor}); } /// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals. @@ -890,8 +898,8 @@ bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp, } Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( - IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, - FastMathFlags SqrtFMF, const Instruction *CtxI) const { + IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF, + const FastMathFlags SqrtFMF, const Instruction *CtxI) const { // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp. assert(DivFMF.allowContract() && SqrtFMF.allowContract()); @@ -910,10 +918,9 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) { // Add in the sqrt flags. IRBuilder<>::FastMathFlagGuard Guard(Builder); - DivFMF |= SqrtFMF; - Builder.setFastMathFlags(DivFMF); + Builder.setFastMathFlags(DivFMF | SqrtFMF); - if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || + if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath || canIgnoreDenormalInput(Den, CtxI)) { Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den); // -1.0 / sqrt(x) -> fneg(rsq(x)) @@ -1077,6 +1084,21 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { const FastMathFlags DivFMF = FPOp->getFastMathFlags(); const float ReqdAccuracy = FPOp->getFPAccuracy(); + FastMathFlags SqrtFMF; + + Value *Num = FDiv.getOperand(0); + Value *Den = FDiv.getOperand(1); + + Value *RsqOp = nullptr; + auto *DenII = dyn_cast<IntrinsicInst>(Den); + if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt && + DenII->hasOneUse()) { + const auto *SqrtOp = cast<FPMathOperator>(DenII); + SqrtFMF = SqrtOp->getFastMathFlags(); + if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF)) + RsqOp = SqrtOp->getOperand(0); + } + // Inaccurate rcp is allowed with unsafe-fp-math or afn. // // Defer to codegen to handle this. @@ -1087,28 +1109,13 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { // don't need any pre-consideration here when we have better information. A // more conservative interpretation could use handling here. 
const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc(); - if (AllowInaccurateRcp) + if (!RsqOp && AllowInaccurateRcp) return false; // Defer the correct implementations to codegen. if (ReqdAccuracy < 1.0f) return false; - FastMathFlags SqrtFMF; - - Value *Num = FDiv.getOperand(0); - Value *Den = FDiv.getOperand(1); - - Value *RsqOp = nullptr; - auto *DenII = dyn_cast<IntrinsicInst>(Den); - if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt && - DenII->hasOneUse()) { - const auto *SqrtOp = cast<FPMathOperator>(DenII); - SqrtFMF = SqrtOp->getFastMathFlags(); - if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF)) - RsqOp = SqrtOp->getOperand(0); - } - IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); Builder.setFastMathFlags(DivFMF); Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); @@ -1777,47 +1784,79 @@ static bool isInterestingPHIIncomingValue(const Value *V) { return false; } +static void collectPHINodes(const PHINode &I, + SmallPtrSet<const PHINode *, 8> &SeenPHIs) { + const auto [It, Inserted] = SeenPHIs.insert(&I); + if (!Inserted) + return; + + for (const Value *Inc : I.incoming_values()) { + if (const auto *PhiInc = dyn_cast<PHINode>(Inc)) + collectPHINodes(*PhiInc, SeenPHIs); + } + + for (const User *U : I.users()) { + if (const auto *PhiU = dyn_cast<PHINode>(U)) + collectPHINodes(*PhiU, SeenPHIs); + } +} + bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) { - // Check in the cache, or add an entry for this node. - // - // We init with false because we consider all PHI nodes unbreakable until we - // reach a conclusion. Doing the opposite - assuming they're break-able until - // proven otherwise - can be harmful in some pathological cases so we're - // conservative for now. - const auto [It, DidInsert] = BreakPhiNodesCache.insert({&I, false}); - if (!DidInsert) + // Check in the cache first. + if (const auto It = BreakPhiNodesCache.find(&I); + It != BreakPhiNodesCache.end()) return It->second; - // This function may recurse, so to guard against infinite looping, this PHI - // is conservatively considered unbreakable until we reach a conclusion. + // We consider PHI nodes as part of "chains", so given a PHI node I, we + // recursively consider all its users and incoming values that are also PHI + // nodes. We then make a decision about all of those PHIs at once. Either they + // all get broken up, or none of them do. That way, we avoid cases where a + // single PHI is/is not broken and we end up reforming/exploding a vector + // multiple times, or even worse, doing it in a loop. + SmallPtrSet<const PHINode *, 8> WorkList; + collectPHINodes(I, WorkList); + +#ifndef NDEBUG + // Check that none of the PHI nodes in the worklist are in the map. If some of + // them are, it means we're not good enough at collecting related PHIs. + for (const PHINode *WLP : WorkList) { + assert(BreakPhiNodesCache.count(WLP) == 0); + } +#endif - // Don't break PHIs that have no interesting incoming values. That is, where - // there is no clear opportunity to fold the "extractelement" instructions we - // would add. + // To consider a PHI profitable to break, we need to see some interesting + // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist + // must have one to consider all PHIs breakable. // - // Note: IC does not run after this pass, so we're only interested in the - // foldings that the DAG combiner can do. 
- if (none_of(I.incoming_values(), - [&](Value *V) { return isInterestingPHIIncomingValue(V); })) - return false; - - // Now, check users for unbreakable PHI nodes. If we have an unbreakable PHI - // node as user, we don't want to break this PHI either because it's unlikely - // to be beneficial. We would just explode the vector and reassemble it - // directly, wasting instructions. + // This threshold has been determined through performance testing. + // + // Note that the computation below is equivalent to + // + // (unsigned)ceil((K / 3.0) * 2) // - // In the case where multiple users are PHI nodes, we want at least half of - // them to be breakable. - int Score = 0; - for (const Value *U : I.users()) { - if (const auto *PU = dyn_cast<PHINode>(U)) - Score += canBreakPHINode(*PU) ? 1 : -1; + // It's simply written this way to avoid mixing integral/FP arithmetic. + const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3); + unsigned NumBreakablePHIs = 0; + bool CanBreak = false; + for (const PHINode *Cur : WorkList) { + // Don't break PHIs that have no interesting incoming values. That is, where + // there is no clear opportunity to fold the "extractelement" instructions + // we would add. + // + // Note: IC does not run after this pass, so we're only interested in the + // foldings that the DAG combiner can do. + if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) { + if (++NumBreakablePHIs >= Threshold) { + CanBreak = true; + break; + } + } } - if (Score < 0) - return false; + for (const PHINode *Cur : WorkList) + BreakPhiNodesCache[Cur] = CanBreak; - return BreakPhiNodesCache[&I] = true; + return CanBreak; } /// Helper class for "break large PHIs" (visitPHINode). @@ -1898,14 +1937,15 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) { // operations with most elements being "undef". This inhibits a lot of // optimization opportunities and can result in unreasonably high register // pressure and the inevitable stack spilling. - if (!ScalarizeLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption) + if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption) return false; FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType()); - if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold) + if (!FVT || FVT->getNumElements() == 1 || + DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold) return false; - if (!ForceScalarizeLargePHIs && !canBreakPHINode(I)) + if (!ForceBreakLargePHIs && !canBreakPHINode(I)) return false; std::vector<VectorSlice> Slices; @@ -1930,8 +1970,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) { Slices.emplace_back(EltTy, Idx, 1); } - if (Slices.size() == 1) - return false; + assert(Slices.size() > 1); // Create one PHI per vector piece. The "VectorSlice" class takes care of // creating the necessary instruction to extract the relevant slices of each @@ -1977,6 +2016,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { return visitBitreverseIntrinsicInst(I); case Intrinsic::minnum: return visitMinNum(I); + case Intrinsic::sqrt: + return visitSqrt(I); default: return false; } @@ -2070,9 +2111,75 @@ bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) { return true; } +static bool isOneOrNegOne(const Value *Val) { + const APFloat *C; + return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0; +} + +// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way. 
+bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { + Type *Ty = Sqrt.getType()->getScalarType(); + if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts())) + return false; + + const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt); + FastMathFlags SqrtFMF = FPOp->getFastMathFlags(); + + // We're trying to handle the fast-but-not-that-fast case only. The lowering + // of fast llvm.sqrt will give the raw instruction anyway. + if (SqrtFMF.approxFunc() || HasUnsafeFPMath) + return false; + + const float ReqdAccuracy = FPOp->getFPAccuracy(); + + // Defer correctly rounded expansion to codegen. + if (ReqdAccuracy < 1.0f) + return false; + + // FIXME: This is an ugly hack for this pass using forward iteration instead + // of reverse. If it worked like a normal combiner, the rsq would form before + // we saw a sqrt call. + auto *FDiv = + dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser()); + if (FDiv && FDiv->getOpcode() == Instruction::FDiv && + FDiv->getFPAccuracy() >= 1.0f && + canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) && + // TODO: We should also handle the arcp case for the fdiv with non-1 value + isOneOrNegOne(FDiv->getOperand(0))) + return false; + + Value *SrcVal = Sqrt.getOperand(0); + bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt); + + // The raw instruction is 1 ulp, but the correction for denormal handling + // brings it to 2. + if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f) + return false; + + IRBuilder<> Builder(&Sqrt); + SmallVector<Value *, 4> SrcVals; + extractValues(Builder, SrcVals, SrcVal); + + SmallVector<Value *, 4> ResultVals(SrcVals.size()); + for (int I = 0, E = SrcVals.size(); I != E; ++I) { + if (CanTreatAsDAZ) + ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]); + else + ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF); + } + + Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals); + NewSqrt->takeName(&Sqrt); + Sqrt.replaceAllUsesWith(NewSqrt); + Sqrt.eraseFromParent(); + return true; +} + bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { Impl.Mod = &M; Impl.DL = &Impl.Mod->getDataLayout(); + Impl.SqrtF32 = nullptr; + Impl.LdexpF32 = nullptr; return false; } @@ -2092,7 +2199,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); Impl.DT = DTWP ? 
&DTWP->getDomTree() : nullptr; Impl.HasUnsafeFPMath = hasUnsafeFPMath(F); - SIModeRegisterDefaults Mode(F); + SIModeRegisterDefaults Mode(F, *Impl.ST); Impl.HasFP32DenormalFlush = Mode.FP32Denormals == DenormalMode::getPreserveSign(); return Impl.run(F); @@ -2109,7 +2216,7 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F, Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F); Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); Impl.HasUnsafeFPMath = hasUnsafeFPMath(F); - SIModeRegisterDefaults Mode(F); + SIModeRegisterDefaults Mode(F, *Impl.ST); Impl.HasFP32DenormalFlush = Mode.FP32Denormals == DenormalMode::getPreserveSign(); PreservedAnalyses PA = PreservedAnalyses::none(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 892e1eef27a8..8d4cad4c07bc 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -139,19 +139,21 @@ def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; // Combines which should only apply on VI def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>; -def AMDGPUPreLegalizerCombiner: GICombinerHelper< +def AMDGPUPreLegalizerCombiner: GICombiner< "AMDGPUPreLegalizerCombinerImpl", [all_combines, clamp_i64_to_i16, foldable_fneg]> { + let CombineAllMethodName = "tryCombineAllImpl"; } -def AMDGPUPostLegalizerCombiner: GICombinerHelper< +def AMDGPUPostLegalizerCombiner: GICombiner< "AMDGPUPostLegalizerCombinerImpl", [all_combines, gfx6gfx7_combines, gfx8_combines, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, rcp_sqrt_to_rsq, sign_extension_in_reg]> { + let CombineAllMethodName = "tryCombineAllImpl"; } -def AMDGPURegBankCombiner : GICombinerHelper< +def AMDGPURegBankCombiner : GICombiner< "AMDGPURegBankCombinerImpl", [unmerge_merge, unmerge_cst, unmerge_undef, zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp index 78fdedc0b511..69dc78d33c83 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -9,6 +9,7 @@ #include "AMDGPUCombinerHelper.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Target/TargetMachine.h" @@ -28,6 +29,8 @@ static bool fnegFoldsIntoMI(const MachineInstr &MI) { case AMDGPU::G_FMAXNUM: case AMDGPU::G_FMINNUM_IEEE: case AMDGPU::G_FMAXNUM_IEEE: + case AMDGPU::G_FMINIMUM: + case AMDGPU::G_FMAXIMUM: case AMDGPU::G_FSIN: case AMDGPU::G_FPEXT: case AMDGPU::G_INTRINSIC_TRUNC: @@ -42,7 +45,7 @@ static bool fnegFoldsIntoMI(const MachineInstr &MI) { case AMDGPU::G_AMDGPU_FMAX_LEGACY: return true; case AMDGPU::G_INTRINSIC: { - unsigned IntrinsicID = MI.getIntrinsicID(); + unsigned IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); switch (IntrinsicID) { case Intrinsic::amdgcn_rcp: case Intrinsic::amdgcn_rcp_legacy: @@ -66,8 +69,7 @@ static bool fnegFoldsIntoMI(const MachineInstr &MI) { LLVM_READONLY static bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI) { - return MI.getNumOperands() > - (MI.getOpcode() == AMDGPU::G_INTRINSIC ? 
4u : 3u) || + return MI.getNumOperands() > (isa<GIntrinsic>(MI) ? 4u : 3u) || MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64; } @@ -85,14 +87,16 @@ static bool hasSourceMods(const MachineInstr &MI) { case TargetOpcode::INLINEASM: case TargetOpcode::INLINEASM_BR: case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: + case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: case AMDGPU::G_BITCAST: case AMDGPU::G_ANYEXT: case AMDGPU::G_BUILD_VECTOR: case AMDGPU::G_BUILD_VECTOR_TRUNC: case AMDGPU::G_PHI: return false; - case AMDGPU::G_INTRINSIC: { - unsigned IntrinsicID = MI.getIntrinsicID(); + case AMDGPU::G_INTRINSIC: + case AMDGPU::G_INTRINSIC_CONVERGENT: { + unsigned IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); switch (IntrinsicID) { case Intrinsic::amdgcn_interp_p1: case Intrinsic::amdgcn_interp_p2: @@ -172,6 +176,10 @@ static unsigned inverseMinMax(unsigned Opc) { return AMDGPU::G_FMINNUM_IEEE; case AMDGPU::G_FMINNUM_IEEE: return AMDGPU::G_FMAXNUM_IEEE; + case AMDGPU::G_FMAXIMUM: + return AMDGPU::G_FMINIMUM; + case AMDGPU::G_FMINIMUM: + return AMDGPU::G_FMAXIMUM; case AMDGPU::G_AMDGPU_FMAX_LEGACY: return AMDGPU::G_AMDGPU_FMIN_LEGACY; case AMDGPU::G_AMDGPU_FMIN_LEGACY: @@ -205,6 +213,8 @@ bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI, case AMDGPU::G_FMAXNUM: case AMDGPU::G_FMINNUM_IEEE: case AMDGPU::G_FMAXNUM_IEEE: + case AMDGPU::G_FMINIMUM: + case AMDGPU::G_FMAXIMUM: case AMDGPU::G_AMDGPU_FMIN_LEGACY: case AMDGPU::G_AMDGPU_FMAX_LEGACY: // 0 doesn't have a negated inline immediate. @@ -227,8 +237,9 @@ bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI, case AMDGPU::G_FCANONICALIZE: case AMDGPU::G_AMDGPU_RCP_IFLAG: return true; - case AMDGPU::G_INTRINSIC: { - unsigned IntrinsicID = MatchInfo->getIntrinsicID(); + case AMDGPU::G_INTRINSIC: + case AMDGPU::G_INTRINSIC_CONVERGENT: { + unsigned IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID(); switch (IntrinsicID) { case Intrinsic::amdgcn_rcp: case Intrinsic::amdgcn_rcp_legacy: @@ -301,6 +312,8 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI, case AMDGPU::G_FMAXNUM: case AMDGPU::G_FMINNUM_IEEE: case AMDGPU::G_FMAXNUM_IEEE: + case AMDGPU::G_FMINIMUM: + case AMDGPU::G_FMAXIMUM: case AMDGPU::G_AMDGPU_FMIN_LEGACY: case AMDGPU::G_AMDGPU_FMAX_LEGACY: { NegateOperand(MatchInfo->getOperand(1)); @@ -326,8 +339,9 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI, case AMDGPU::G_FPTRUNC: NegateOperand(MatchInfo->getOperand(1)); break; - case AMDGPU::G_INTRINSIC: { - unsigned IntrinsicID = MatchInfo->getIntrinsicID(); + case AMDGPU::G_INTRINSIC: + case AMDGPU::G_INTRINSIC_CONVERGENT: { + unsigned IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID(); switch (IntrinsicID) { case Intrinsic::amdgcn_rcp: case Intrinsic::amdgcn_rcp_legacy: diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp index a13447586bd4..3afefcf55d49 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp @@ -53,13 +53,22 @@ static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) { // // extern "C" void * __init_array_start[]; // extern "C" void * __init_array_end[]; +// extern "C" void * __fini_array_start[]; +// extern "C" void * __fini_array_end[]; // // using InitCallback = void(); +// using FiniCallback = void(void); // // void call_init_array_callbacks() { // for (auto start = 
__init_array_start; start != __init_array_end; ++start) // reinterpret_cast<InitCallback *>(*start)(); // } +// +// void call_fini_array_callbacks() { +// size_t fini_array_size = __fini_array_end - __fini_array_start; +// for (size_t i = fini_array_size; i > 0; --i) +// reinterpret_cast<FiniCallback *>(__fini_array_start[i - 1])(); +// } static void createInitOrFiniCalls(Function &F, bool IsCtor) { Module &M = *F.getParent(); LLVMContext &C = M.getContext(); @@ -96,15 +105,37 @@ static void createInitOrFiniCalls(Function &F, bool IsCtor) { // for now we just call them with no arguments. auto *CallBackTy = FunctionType::get(IRB.getVoidTy(), {}); - IRB.CreateCondBr(IRB.CreateICmpNE(Begin, End), LoopBB, ExitBB); + Value *Start = Begin; + Value *Stop = End; + // The destructor array must be called in reverse order. Get a constant + // expression to the end of the array and iterate backwards instead. + if (!IsCtor) { + Type *Int64Ty = IntegerType::getInt64Ty(C); + auto *EndPtr = IRB.CreatePtrToInt(End, Int64Ty); + auto *BeginPtr = IRB.CreatePtrToInt(Begin, Int64Ty); + auto *ByteSize = IRB.CreateSub(EndPtr, BeginPtr); + auto *Size = IRB.CreateAShr(ByteSize, ConstantInt::get(Int64Ty, 3)); + auto *Offset = IRB.CreateSub(Size, ConstantInt::get(Int64Ty, 1)); + Start = IRB.CreateInBoundsGEP( + ArrayType::get(IRB.getPtrTy(), 0), Begin, + ArrayRef<Value *>({ConstantInt::get(Int64Ty, 0), Offset})); + Stop = Begin; + } + + IRB.CreateCondBr( + IRB.CreateCmp(IsCtor ? ICmpInst::ICMP_NE : ICmpInst::ICMP_UGE, Start, + Stop), + LoopBB, ExitBB); IRB.SetInsertPoint(LoopBB); auto *CallBackPHI = IRB.CreatePHI(PtrTy, 2, "ptr"); - auto *CallBack = IRB.CreateLoad(CallBackTy->getPointerTo(F.getAddressSpace()), + auto *CallBack = IRB.CreateLoad(IRB.getPtrTy(F.getAddressSpace()), CallBackPHI, "callback"); IRB.CreateCall(CallBackTy, CallBack); - auto *NewCallBack = IRB.CreateConstGEP1_64(PtrTy, CallBackPHI, 1, "next"); - auto *EndCmp = IRB.CreateICmpEQ(NewCallBack, End, "end"); - CallBackPHI->addIncoming(Begin, &F.getEntryBlock()); + auto *NewCallBack = + IRB.CreateConstGEP1_64(PtrTy, CallBackPHI, IsCtor ? 1 : -1, "next"); + auto *EndCmp = IRB.CreateCmp(IsCtor ? 
ICmpInst::ICMP_EQ : ICmpInst::ICMP_ULT, + NewCallBack, Stop, "end"); + CallBackPHI->addIncoming(Start, &F.getEntryBlock()); CallBackPHI->addIncoming(NewCallBack, LoopBB); IRB.CreateCondBr(EndCmp, ExitBB, LoopBB); IRB.SetInsertPoint(ExitBB); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 37df4f68c265..2b85024a9b40 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -105,6 +105,11 @@ def gi_global_saddr : def gi_mubuf_scratch_offset : GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">, GIComplexPatternEquiv<MUBUFScratchOffset>; + +def gi_buf_soffset : + GIComplexOperandMatcher<s32, "selectBUFSOffset">, + GIComplexPatternEquiv<BUFSOffset>; + def gi_mubuf_scratch_offen : GIComplexOperandMatcher<s32, "selectMUBUFScratchOffen">, GIComplexPatternEquiv<MUBUFScratchOffen>; @@ -379,3 +384,6 @@ def gi_set_glc : GICustomOperandRenderer<"renderSetGLC">, def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">, GISDNodeXFormEquiv<frameindex_to_targetframeindex>; + +def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">, + GISDNodeXFormEquiv<FPPow2ToExponentXForm>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp new file mode 100644 index 000000000000..4cd8b1ec1051 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp @@ -0,0 +1,68 @@ +//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// GlobalISel pass that selects divergent i1 phis as lane mask phis. +/// Lane mask merging uses same algorithm as SDAG in SILowerI1Copies. +/// Handles all cases of temporal divergence. +/// For divergent non-phi i1 and uniform i1 uses outside of the cycle this pass +/// currently depends on LCSSA to insert phis with one incoming. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering" + +using namespace llvm; + +namespace { + +class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass { +public: + static char ID; + +public: + AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) { + initializeAMDGPUGlobalISelDivergenceLoweringPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "AMDGPU GlobalISel divergence lowering"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. 
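Editorial aside on the AMDGPUCtorDtorLowering hunk above, not part of the commit: the destructor path derives the element count from the byte distance between __fini_array_end and __fini_array_start (the AShr by 3 assumes 8-byte pointers) and then walks the array from the last slot down, so destructors run in reverse registration order. A minimal standalone C++ sketch of the same loop, with hypothetical names:

#include <cstdint>

using FiniCallback = void();

// Mirrors the emitted IR: Start = &__fini_array_start[Size - 1], Stop =
// __fini_array_start, call and step one slot down while Start >= Stop.
static void callFiniArrayCallbacks(void **FiniArrayStart, void **FiniArrayEnd) {
  // Byte distance shifted right by 3 gives the number of 8-byte pointer slots.
  std::intptr_t Size = (reinterpret_cast<std::intptr_t>(FiniArrayEnd) -
                        reinterpret_cast<std::intptr_t>(FiniArrayStart)) >> 3;
  for (std::intptr_t I = Size; I > 0; --I)
    reinterpret_cast<FiniCallback *>(FiniArrayStart[I - 1])();
}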
+ +INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE, + "AMDGPU GlobalISel divergence lowering", false, false) +INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE, + "AMDGPU GlobalISel divergence lowering", false, false) + +char AMDGPUGlobalISelDivergenceLowering::ID = 0; + +char &llvm::AMDGPUGlobalISelDivergenceLoweringID = + AMDGPUGlobalISelDivergenceLowering::ID; + +FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() { + return new AMDGPUGlobalISelDivergenceLowering(); +} + +bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction( + MachineFunction &MF) { + return false; +} diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp index 09930dc9612c..5a756602eb1a 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -18,7 +18,7 @@ using namespace MIPatternMatch; std::pair<Register, unsigned> AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, - GISelKnownBits *KnownBits) { + GISelKnownBits *KnownBits, bool CheckNUW) { MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); if (Def->getOpcode() == TargetOpcode::G_CONSTANT) { unsigned Offset; @@ -33,6 +33,12 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, int64_t Offset; if (Def->getOpcode() == TargetOpcode::G_ADD) { + // A 32-bit (address + offset) should not cause unsigned 32-bit integer + // wraparound, because s_load instructions perform the addition in 64 bits. + if (CheckNUW && !Def->getFlag(MachineInstr::NoUWrap)) { + assert(MRI.getType(Reg).getScalarSizeInBits() == 32); + return std::pair(Reg, 0); + } // TODO: Handle G_OR used for add case if (mi_match(Def->getOperand(2).getReg(), MRI, m_ICst(Offset))) return std::pair(Def->getOperand(1).getReg(), Offset); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h index ff4edf02a84d..5ee888d9db00 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h @@ -9,7 +9,6 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H -#include "llvm/ADT/ArrayRef.h" #include "llvm/CodeGen/Register.h" #include <utility> @@ -25,7 +24,8 @@ namespace AMDGPU { /// Returns base register and constant offset. 
std::pair<Register, unsigned> getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, - GISelKnownBits *KnownBits = nullptr); + GISelKnownBits *KnownBits = nullptr, + bool CheckNUW = false); bool hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, const LLT &Ty); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index dadc0c92ef8b..b51a876750b5 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -49,443 +49,14 @@ namespace AMDGPU { namespace HSAMD { //===----------------------------------------------------------------------===// -// HSAMetadataStreamerV2 -//===----------------------------------------------------------------------===// -void MetadataStreamerYamlV2::dump(StringRef HSAMetadataString) const { - errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n'; -} - -void MetadataStreamerYamlV2::verify(StringRef HSAMetadataString) const { - errs() << "AMDGPU HSA Metadata Parser Test: "; - - HSAMD::Metadata FromHSAMetadataString; - if (fromString(HSAMetadataString, FromHSAMetadataString)) { - errs() << "FAIL\n"; - return; - } - - std::string ToHSAMetadataString; - if (toString(FromHSAMetadataString, ToHSAMetadataString)) { - errs() << "FAIL\n"; - return; - } - - errs() << (HSAMetadataString == ToHSAMetadataString ? "PASS" : "FAIL") - << '\n'; - if (HSAMetadataString != ToHSAMetadataString) { - errs() << "Original input: " << HSAMetadataString << '\n' - << "Produced output: " << ToHSAMetadataString << '\n'; - } -} - -AccessQualifier -MetadataStreamerYamlV2::getAccessQualifier(StringRef AccQual) const { - if (AccQual.empty()) - return AccessQualifier::Unknown; - - return StringSwitch<AccessQualifier>(AccQual) - .Case("read_only", AccessQualifier::ReadOnly) - .Case("write_only", AccessQualifier::WriteOnly) - .Case("read_write", AccessQualifier::ReadWrite) - .Default(AccessQualifier::Default); -} - -AddressSpaceQualifier -MetadataStreamerYamlV2::getAddressSpaceQualifier(unsigned AddressSpace) const { - switch (AddressSpace) { - case AMDGPUAS::PRIVATE_ADDRESS: - return AddressSpaceQualifier::Private; - case AMDGPUAS::GLOBAL_ADDRESS: - return AddressSpaceQualifier::Global; - case AMDGPUAS::CONSTANT_ADDRESS: - return AddressSpaceQualifier::Constant; - case AMDGPUAS::LOCAL_ADDRESS: - return AddressSpaceQualifier::Local; - case AMDGPUAS::FLAT_ADDRESS: - return AddressSpaceQualifier::Generic; - case AMDGPUAS::REGION_ADDRESS: - return AddressSpaceQualifier::Region; - default: - return AddressSpaceQualifier::Unknown; - } -} - -ValueKind MetadataStreamerYamlV2::getValueKind(Type *Ty, StringRef TypeQual, - StringRef BaseTypeName) const { - if (TypeQual.contains("pipe")) - return ValueKind::Pipe; - - return StringSwitch<ValueKind>(BaseTypeName) - .Case("image1d_t", ValueKind::Image) - .Case("image1d_array_t", ValueKind::Image) - .Case("image1d_buffer_t", ValueKind::Image) - .Case("image2d_t", ValueKind::Image) - .Case("image2d_array_t", ValueKind::Image) - .Case("image2d_array_depth_t", ValueKind::Image) - .Case("image2d_array_msaa_t", ValueKind::Image) - .Case("image2d_array_msaa_depth_t", ValueKind::Image) - .Case("image2d_depth_t", ValueKind::Image) - .Case("image2d_msaa_t", ValueKind::Image) - .Case("image2d_msaa_depth_t", ValueKind::Image) - .Case("image3d_t", ValueKind::Image) - .Case("sampler_t", ValueKind::Sampler) - .Case("queue_t", 
ValueKind::Queue) - .Default(isa<PointerType>(Ty) ? - (Ty->getPointerAddressSpace() == - AMDGPUAS::LOCAL_ADDRESS ? - ValueKind::DynamicSharedPointer : - ValueKind::GlobalBuffer) : - ValueKind::ByValue); -} - -std::string MetadataStreamerYamlV2::getTypeName(Type *Ty, bool Signed) const { - switch (Ty->getTypeID()) { - case Type::IntegerTyID: { - if (!Signed) - return (Twine('u') + getTypeName(Ty, true)).str(); - - auto BitWidth = Ty->getIntegerBitWidth(); - switch (BitWidth) { - case 8: - return "char"; - case 16: - return "short"; - case 32: - return "int"; - case 64: - return "long"; - default: - return (Twine('i') + Twine(BitWidth)).str(); - } - } - case Type::HalfTyID: - return "half"; - case Type::FloatTyID: - return "float"; - case Type::DoubleTyID: - return "double"; - case Type::FixedVectorTyID: { - auto VecTy = cast<FixedVectorType>(Ty); - auto ElTy = VecTy->getElementType(); - auto NumElements = VecTy->getNumElements(); - return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str(); - } - default: - return "unknown"; - } -} - -std::vector<uint32_t> -MetadataStreamerYamlV2::getWorkGroupDimensions(MDNode *Node) const { - std::vector<uint32_t> Dims; - if (Node->getNumOperands() != 3) - return Dims; - - for (auto &Op : Node->operands()) - Dims.push_back(mdconst::extract<ConstantInt>(Op)->getZExtValue()); - return Dims; -} - -Kernel::CodeProps::Metadata MetadataStreamerYamlV2::getHSACodeProps( - const MachineFunction &MF, const SIProgramInfo &ProgramInfo) const { - const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - HSAMD::Kernel::CodeProps::Metadata HSACodeProps; - const Function &F = MF.getFunction(); - - assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || - F.getCallingConv() == CallingConv::SPIR_KERNEL); - - Align MaxKernArgAlign; - HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F, - MaxKernArgAlign); - HSACodeProps.mKernargSegmentAlign = - std::max(MaxKernArgAlign, Align(4)).value(); - - HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize; - HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize; - HSACodeProps.mWavefrontSize = STM.getWavefrontSize(); - HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR; - HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR; - HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize(); - HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack; - HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled(); - HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs(); - HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs(); - - return HSACodeProps; -} - -Kernel::DebugProps::Metadata MetadataStreamerYamlV2::getHSADebugProps( - const MachineFunction &MF, const SIProgramInfo &ProgramInfo) const { - return HSAMD::Kernel::DebugProps::Metadata(); -} - -void MetadataStreamerYamlV2::emitVersion() { - auto &Version = HSAMetadata.mVersion; - - Version.push_back(VersionMajorV2); - Version.push_back(VersionMinorV2); -} - -void MetadataStreamerYamlV2::emitPrintf(const Module &Mod) { - auto &Printf = HSAMetadata.mPrintf; - - auto Node = Mod.getNamedMetadata("llvm.printf.fmts"); - if (!Node) - return; - - for (auto *Op : Node->operands()) - if (Op->getNumOperands()) - Printf.push_back( - std::string(cast<MDString>(Op->getOperand(0))->getString())); -} - -void MetadataStreamerYamlV2::emitKernelLanguage(const Function &Func) { - auto &Kernel = HSAMetadata.mKernels.back(); - - // TODO: What about other languages? 
- auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version"); - if (!Node || !Node->getNumOperands()) - return; - auto Op0 = Node->getOperand(0); - if (Op0->getNumOperands() <= 1) - return; - - Kernel.mLanguage = "OpenCL C"; - Kernel.mLanguageVersion.push_back( - mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue()); - Kernel.mLanguageVersion.push_back( - mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue()); -} - -void MetadataStreamerYamlV2::emitKernelAttrs(const Function &Func) { - auto &Attrs = HSAMetadata.mKernels.back().mAttrs; - - if (auto Node = Func.getMetadata("reqd_work_group_size")) - Attrs.mReqdWorkGroupSize = getWorkGroupDimensions(Node); - if (auto Node = Func.getMetadata("work_group_size_hint")) - Attrs.mWorkGroupSizeHint = getWorkGroupDimensions(Node); - if (auto Node = Func.getMetadata("vec_type_hint")) { - Attrs.mVecTypeHint = getTypeName( - cast<ValueAsMetadata>(Node->getOperand(0))->getType(), - mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()); - } - if (Func.hasFnAttribute("runtime-handle")) { - Attrs.mRuntimeHandle = - Func.getFnAttribute("runtime-handle").getValueAsString().str(); - } -} - -void MetadataStreamerYamlV2::emitKernelArgs(const Function &Func, - const GCNSubtarget &ST) { - for (auto &Arg : Func.args()) - emitKernelArg(Arg); - - emitHiddenKernelArgs(Func, ST); -} - -void MetadataStreamerYamlV2::emitKernelArg(const Argument &Arg) { - auto Func = Arg.getParent(); - auto ArgNo = Arg.getArgNo(); - const MDNode *Node; - - StringRef Name; - Node = Func->getMetadata("kernel_arg_name"); - if (Node && ArgNo < Node->getNumOperands()) - Name = cast<MDString>(Node->getOperand(ArgNo))->getString(); - else if (Arg.hasName()) - Name = Arg.getName(); - - StringRef TypeName; - Node = Func->getMetadata("kernel_arg_type"); - if (Node && ArgNo < Node->getNumOperands()) - TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString(); - - StringRef BaseTypeName; - Node = Func->getMetadata("kernel_arg_base_type"); - if (Node && ArgNo < Node->getNumOperands()) - BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString(); - - StringRef AccQual; - if (Arg.getType()->isPointerTy() && Arg.onlyReadsMemory() && - Arg.hasNoAliasAttr()) { - AccQual = "read_only"; - } else { - Node = Func->getMetadata("kernel_arg_access_qual"); - if (Node && ArgNo < Node->getNumOperands()) - AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); - } - - StringRef TypeQual; - Node = Func->getMetadata("kernel_arg_type_qual"); - if (Node && ArgNo < Node->getNumOperands()) - TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); - - const DataLayout &DL = Func->getParent()->getDataLayout(); - - MaybeAlign PointeeAlign; - if (auto PtrTy = dyn_cast<PointerType>(Arg.getType())) { - if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - // FIXME: Should report this for all address spaces - PointeeAlign = Arg.getParamAlign().valueOrOne(); - } - } - - Type *ArgTy; - Align ArgAlign; - std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL); - - emitKernelArg(DL, ArgTy, ArgAlign, - getValueKind(ArgTy, TypeQual, BaseTypeName), PointeeAlign, Name, - TypeName, BaseTypeName, AccQual, TypeQual); -} - -void MetadataStreamerYamlV2::emitKernelArg( - const DataLayout &DL, Type *Ty, Align Alignment, ValueKind ValueKind, - MaybeAlign PointeeAlign, StringRef Name, StringRef TypeName, - StringRef BaseTypeName, StringRef AccQual, StringRef TypeQual) { - HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata()); - auto &Arg = 
HSAMetadata.mKernels.back().mArgs.back(); - - Arg.mName = std::string(Name); - Arg.mTypeName = std::string(TypeName); - Arg.mSize = DL.getTypeAllocSize(Ty); - Arg.mAlign = Alignment.value(); - Arg.mValueKind = ValueKind; - Arg.mPointeeAlign = PointeeAlign ? PointeeAlign->value() : 0; - - if (auto PtrTy = dyn_cast<PointerType>(Ty)) - Arg.mAddrSpaceQual = getAddressSpaceQualifier(PtrTy->getAddressSpace()); - - Arg.mAccQual = getAccessQualifier(AccQual); - - // TODO: Emit Arg.mActualAccQual. - - SmallVector<StringRef, 1> SplitTypeQuals; - TypeQual.split(SplitTypeQuals, " ", -1, false); - for (StringRef Key : SplitTypeQuals) { - auto P = StringSwitch<bool*>(Key) - .Case("const", &Arg.mIsConst) - .Case("restrict", &Arg.mIsRestrict) - .Case("volatile", &Arg.mIsVolatile) - .Case("pipe", &Arg.mIsPipe) - .Default(nullptr); - if (P) - *P = true; - } -} - -void MetadataStreamerYamlV2::emitHiddenKernelArgs(const Function &Func, - const GCNSubtarget &ST) { - unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func); - if (!HiddenArgNumBytes) - return; - - auto &DL = Func.getParent()->getDataLayout(); - auto Int64Ty = Type::getInt64Ty(Func.getContext()); - - if (HiddenArgNumBytes >= 8) - emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetX); - if (HiddenArgNumBytes >= 16) - emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetY); - if (HiddenArgNumBytes >= 24) - emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetZ); - - auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), - AMDGPUAS::GLOBAL_ADDRESS); - - if (HiddenArgNumBytes >= 32) { - // We forbid the use of features requiring hostcall when compiling OpenCL - // before code object V5, which makes the mutual exclusion between the - // "printf buffer" and "hostcall buffer" here sound. - if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) - emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenPrintfBuffer); - else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) - emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenHostcallBuffer); - else - emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); - } - - // Emit "default queue" and "completion action" arguments if enqueue kernel is - // used, otherwise emit dummy "none" arguments. - if (HiddenArgNumBytes >= 40) { - if (!Func.hasFnAttribute("amdgpu-no-default-queue")) { - emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenDefaultQueue); - } else { - emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); - } - } - - if (HiddenArgNumBytes >= 48) { - if (!Func.hasFnAttribute("amdgpu-no-completion-action")) { - emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenCompletionAction); - } else { - emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); - } - } - - // Emit the pointer argument for multi-grid object. 
- if (HiddenArgNumBytes >= 56) { - if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) - emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenMultiGridSyncArg); - else - emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); - } -} - -bool MetadataStreamerYamlV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) { - return TargetStreamer.EmitHSAMetadata(getHSAMetadata()); -} - -void MetadataStreamerYamlV2::begin(const Module &Mod, - const IsaInfo::AMDGPUTargetID &TargetID) { - emitVersion(); - emitPrintf(Mod); -} - -void MetadataStreamerYamlV2::end() { - std::string HSAMetadataString; - if (toString(HSAMetadata, HSAMetadataString)) - return; - - if (DumpHSAMetadata) - dump(HSAMetadataString); - if (VerifyHSAMetadata) - verify(HSAMetadataString); -} - -void MetadataStreamerYamlV2::emitKernel(const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) { - auto &Func = MF.getFunction(); - if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL) - return; - - auto CodeProps = getHSACodeProps(MF, ProgramInfo); - auto DebugProps = getHSADebugProps(MF, ProgramInfo); - - HSAMetadata.mKernels.push_back(Kernel::Metadata()); - auto &Kernel = HSAMetadata.mKernels.back(); - - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - Kernel.mName = std::string(Func.getName()); - Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str(); - emitKernelLanguage(Func); - emitKernelAttrs(Func); - emitKernelArgs(Func, ST); - HSAMetadata.mKernels.back().mCodeProps = CodeProps; - HSAMetadata.mKernels.back().mDebugProps = DebugProps; -} - -//===----------------------------------------------------------------------===// -// HSAMetadataStreamerV3 +// HSAMetadataStreamerV4 //===----------------------------------------------------------------------===// -void MetadataStreamerMsgPackV3::dump(StringRef HSAMetadataString) const { +void MetadataStreamerMsgPackV4::dump(StringRef HSAMetadataString) const { errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n'; } -void MetadataStreamerMsgPackV3::verify(StringRef HSAMetadataString) const { +void MetadataStreamerMsgPackV4::verify(StringRef HSAMetadataString) const { errs() << "AMDGPU HSA Metadata Parser Test: "; msgpack::Document FromHSAMetadataString; @@ -507,7 +78,7 @@ void MetadataStreamerMsgPackV3::verify(StringRef HSAMetadataString) const { } std::optional<StringRef> -MetadataStreamerMsgPackV3::getAccessQualifier(StringRef AccQual) const { +MetadataStreamerMsgPackV4::getAccessQualifier(StringRef AccQual) const { return StringSwitch<std::optional<StringRef>>(AccQual) .Case("read_only", StringRef("read_only")) .Case("write_only", StringRef("write_only")) @@ -515,7 +86,7 @@ MetadataStreamerMsgPackV3::getAccessQualifier(StringRef AccQual) const { .Default(std::nullopt); } -std::optional<StringRef> MetadataStreamerMsgPackV3::getAddressSpaceQualifier( +std::optional<StringRef> MetadataStreamerMsgPackV4::getAddressSpaceQualifier( unsigned AddressSpace) const { switch (AddressSpace) { case AMDGPUAS::PRIVATE_ADDRESS: @@ -536,7 +107,7 @@ std::optional<StringRef> MetadataStreamerMsgPackV3::getAddressSpaceQualifier( } StringRef -MetadataStreamerMsgPackV3::getValueKind(Type *Ty, StringRef TypeQual, +MetadataStreamerMsgPackV4::getValueKind(Type *Ty, StringRef TypeQual, StringRef BaseTypeName) const { if (TypeQual.contains("pipe")) return "pipe"; @@ -563,7 +134,7 @@ MetadataStreamerMsgPackV3::getValueKind(Type *Ty, StringRef TypeQual, : "by_value"); } -std::string MetadataStreamerMsgPackV3::getTypeName(Type *Ty, +std::string 
MetadataStreamerMsgPackV4::getTypeName(Type *Ty, bool Signed) const { switch (Ty->getTypeID()) { case Type::IntegerTyID: { @@ -602,7 +173,7 @@ std::string MetadataStreamerMsgPackV3::getTypeName(Type *Ty, } msgpack::ArrayDocNode -MetadataStreamerMsgPackV3::getWorkGroupDimensions(MDNode *Node) const { +MetadataStreamerMsgPackV4::getWorkGroupDimensions(MDNode *Node) const { auto Dims = HSAMetadataDoc->getArrayNode(); if (Node->getNumOperands() != 3) return Dims; @@ -613,14 +184,20 @@ MetadataStreamerMsgPackV3::getWorkGroupDimensions(MDNode *Node) const { return Dims; } -void MetadataStreamerMsgPackV3::emitVersion() { +void MetadataStreamerMsgPackV4::emitVersion() { auto Version = HSAMetadataDoc->getArrayNode(); - Version.push_back(Version.getDocument()->getNode(VersionMajorV3)); - Version.push_back(Version.getDocument()->getNode(VersionMinorV3)); + Version.push_back(Version.getDocument()->getNode(VersionMajorV4)); + Version.push_back(Version.getDocument()->getNode(VersionMinorV4)); getRootMetadata("amdhsa.version") = Version; } -void MetadataStreamerMsgPackV3::emitPrintf(const Module &Mod) { +void MetadataStreamerMsgPackV4::emitTargetID( + const IsaInfo::AMDGPUTargetID &TargetID) { + getRootMetadata("amdhsa.target") = + HSAMetadataDoc->getNode(TargetID.toString(), /*Copy=*/true); +} + +void MetadataStreamerMsgPackV4::emitPrintf(const Module &Mod) { auto Node = Mod.getNamedMetadata("llvm.printf.fmts"); if (!Node) return; @@ -633,7 +210,7 @@ void MetadataStreamerMsgPackV3::emitPrintf(const Module &Mod) { getRootMetadata("amdhsa.printf") = Printf; } -void MetadataStreamerMsgPackV3::emitKernelLanguage(const Function &Func, +void MetadataStreamerMsgPackV4::emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern) { // TODO: What about other languages? auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version"); @@ -652,7 +229,7 @@ void MetadataStreamerMsgPackV3::emitKernelLanguage(const Function &Func, Kern[".language_version"] = LanguageVersion; } -void MetadataStreamerMsgPackV3::emitKernelAttrs(const Function &Func, +void MetadataStreamerMsgPackV4::emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern) { if (auto Node = Func.getMetadata("reqd_work_group_size")) @@ -677,7 +254,7 @@ void MetadataStreamerMsgPackV3::emitKernelAttrs(const Function &Func, Kern[".kind"] = Kern.getDocument()->getNode("fini"); } -void MetadataStreamerMsgPackV3::emitKernelArgs(const MachineFunction &MF, +void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF, msgpack::MapDocNode Kern) { auto &Func = MF.getFunction(); unsigned Offset = 0; @@ -690,7 +267,7 @@ void MetadataStreamerMsgPackV3::emitKernelArgs(const MachineFunction &MF, Kern[".args"] = Args; } -void MetadataStreamerMsgPackV3::emitKernelArg(const Argument &Arg, +void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg, unsigned &Offset, msgpack::ArrayDocNode Args) { auto Func = Arg.getParent(); @@ -714,16 +291,20 @@ void MetadataStreamerMsgPackV3::emitKernelArg(const Argument &Arg, if (Node && ArgNo < Node->getNumOperands()) BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString(); - StringRef AccQual; - if (Arg.getType()->isPointerTy() && Arg.onlyReadsMemory() && - Arg.hasNoAliasAttr()) { - AccQual = "read_only"; - } else { - Node = Func->getMetadata("kernel_arg_access_qual"); - if (Node && ArgNo < Node->getNumOperands()) - AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); + StringRef ActAccQual; + // Do we really need NoAlias check here? 
+ if (Arg.getType()->isPointerTy() && Arg.hasNoAliasAttr()) { + if (Arg.onlyReadsMemory()) + ActAccQual = "read_only"; + else if (Arg.hasAttribute(Attribute::WriteOnly)) + ActAccQual = "write_only"; } + StringRef AccQual; + Node = Func->getMetadata("kernel_arg_access_qual"); + if (Node && ArgNo < Node->getNumOperands()) + AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); + StringRef TypeQual; Node = Func->getMetadata("kernel_arg_type_qual"); if (Node && ArgNo < Node->getNumOperands()) @@ -747,14 +328,15 @@ void MetadataStreamerMsgPackV3::emitKernelArg(const Argument &Arg, emitKernelArg(DL, ArgTy, ArgAlign, getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args, - PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual); + PointeeAlign, Name, TypeName, BaseTypeName, ActAccQual, + AccQual, TypeQual); } -void MetadataStreamerMsgPackV3::emitKernelArg( +void MetadataStreamerMsgPackV4::emitKernelArg( const DataLayout &DL, Type *Ty, Align Alignment, StringRef ValueKind, unsigned &Offset, msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign, StringRef Name, StringRef TypeName, StringRef BaseTypeName, - StringRef AccQual, StringRef TypeQual) { + StringRef ActAccQual, StringRef AccQual, StringRef TypeQual) { auto Arg = Args.getDocument()->getMapNode(); if (!Name.empty()) @@ -780,7 +362,8 @@ void MetadataStreamerMsgPackV3::emitKernelArg( if (auto AQ = getAccessQualifier(AccQual)) Arg[".access"] = Arg.getDocument()->getNode(*AQ, /*Copy=*/true); - // TODO: Emit Arg[".actual_access"]. + if (auto AAQ = getAccessQualifier(ActAccQual)) + Arg[".actual_access"] = Arg.getDocument()->getNode(*AAQ, /*Copy=*/true); SmallVector<StringRef, 1> SplitTypeQuals; TypeQual.split(SplitTypeQuals, " ", -1, false); @@ -798,7 +381,7 @@ void MetadataStreamerMsgPackV3::emitKernelArg( Args.push_back(Arg); } -void MetadataStreamerMsgPackV3::emitHiddenKernelArgs( +void MetadataStreamerMsgPackV4::emitHiddenKernelArgs( const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) { auto &Func = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -824,7 +407,7 @@ void MetadataStreamerMsgPackV3::emitHiddenKernelArgs( Args); auto Int8PtrTy = - Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); + PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); if (HiddenArgNumBytes >= 32) { // We forbid the use of features requiring hostcall when compiling OpenCL @@ -871,9 +454,10 @@ void MetadataStreamerMsgPackV3::emitHiddenKernelArgs( } } -msgpack::MapDocNode MetadataStreamerMsgPackV3::getHSAKernelProps( - const MachineFunction &MF, const SIProgramInfo &ProgramInfo, - unsigned CodeObjectVersion) const { +msgpack::MapDocNode +MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo, + unsigned CodeObjectVersion) const { const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); const Function &F = MF.getFunction(); @@ -918,18 +502,19 @@ msgpack::MapDocNode MetadataStreamerMsgPackV3::getHSAKernelProps( return Kern; } -bool MetadataStreamerMsgPackV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) { +bool MetadataStreamerMsgPackV4::emitTo(AMDGPUTargetStreamer &TargetStreamer) { return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true); } -void MetadataStreamerMsgPackV3::begin(const Module &Mod, +void MetadataStreamerMsgPackV4::begin(const Module &Mod, const IsaInfo::AMDGPUTargetID &TargetID) { emitVersion(); + emitTargetID(TargetID); 
emitPrintf(Mod); getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode(); } -void MetadataStreamerMsgPackV3::end() { +void MetadataStreamerMsgPackV4::end() { std::string HSAMetadataString; raw_string_ostream StrOS(HSAMetadataString); HSAMetadataDoc->toYAML(StrOS); @@ -940,7 +525,7 @@ void MetadataStreamerMsgPackV3::end() { verify(StrOS.str()); } -void MetadataStreamerMsgPackV3::emitKernel(const MachineFunction &MF, +void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) { auto &Func = MF.getFunction(); if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL && @@ -966,31 +551,6 @@ void MetadataStreamerMsgPackV3::emitKernel(const MachineFunction &MF, } //===----------------------------------------------------------------------===// -// HSAMetadataStreamerV4 -//===----------------------------------------------------------------------===// - -void MetadataStreamerMsgPackV4::emitVersion() { - auto Version = HSAMetadataDoc->getArrayNode(); - Version.push_back(Version.getDocument()->getNode(VersionMajorV4)); - Version.push_back(Version.getDocument()->getNode(VersionMinorV4)); - getRootMetadata("amdhsa.version") = Version; -} - -void MetadataStreamerMsgPackV4::emitTargetID( - const IsaInfo::AMDGPUTargetID &TargetID) { - getRootMetadata("amdhsa.target") = - HSAMetadataDoc->getNode(TargetID.toString(), /*Copy=*/true); -} - -void MetadataStreamerMsgPackV4::begin(const Module &Mod, - const IsaInfo::AMDGPUTargetID &TargetID) { - emitVersion(); - emitTargetID(TargetID); - emitPrintf(Mod); - getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode(); -} - -//===----------------------------------------------------------------------===// // HSAMetadataStreamerV5 //===----------------------------------------------------------------------===// @@ -1044,7 +604,7 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs( Offset += 6; // Reserved. auto Int8PtrTy = - Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); + PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); if (M->getNamedMetadata("llvm.printf.fmts")) { emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset, @@ -1097,13 +657,13 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs( Offset += 8; // Skipped. 
} - if (MFI.hasQueuePtr()) + if (MFI.getUserSGPRInfo().hasQueuePtr()) emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args); } void MetadataStreamerMsgPackV5::emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern) { - MetadataStreamerMsgPackV3::emitKernelAttrs(Func, Kern); + MetadataStreamerMsgPackV4::emitKernelAttrs(Func, Kern); if (Func.getFnAttribute("uniform-work-group-size").getValueAsBool()) Kern[".uniform_work_group_size"] = Kern.getDocument()->getNode(1); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 7d7080e920f5..6d6bd86711b1 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -30,7 +30,6 @@ class MDNode; class Module; struct SIProgramInfo; class Type; -class GCNSubtarget; namespace AMDGPU { @@ -62,7 +61,7 @@ protected: msgpack::MapDocNode Kern) = 0; }; -class MetadataStreamerMsgPackV3 : public MetadataStreamer { +class MetadataStreamerMsgPackV4 : public MetadataStreamer { protected: std::unique_ptr<msgpack::Document> HSAMetadataDoc = std::make_unique<msgpack::Document>(); @@ -89,6 +88,8 @@ protected: void emitVersion() override; + void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID); + void emitPrintf(const Module &Mod); void emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern); @@ -105,8 +106,8 @@ protected: msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign = std::nullopt, StringRef Name = "", StringRef TypeName = "", - StringRef BaseTypeName = "", StringRef AccQual = "", - StringRef TypeQual = ""); + StringRef BaseTypeName = "", StringRef ActAccQual = "", + StringRef AccQual = "", StringRef TypeQual = ""); void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) override; @@ -120,8 +121,8 @@ protected: } public: - MetadataStreamerMsgPackV3() = default; - ~MetadataStreamerMsgPackV3() = default; + MetadataStreamerMsgPackV4() = default; + ~MetadataStreamerMsgPackV4() = default; bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override; @@ -134,19 +135,6 @@ public: const SIProgramInfo &ProgramInfo) override; }; -class MetadataStreamerMsgPackV4 : public MetadataStreamerMsgPackV3 { -protected: - void emitVersion() override; - void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID); - -public: - MetadataStreamerMsgPackV4() = default; - ~MetadataStreamerMsgPackV4() = default; - - void begin(const Module &Mod, - const IsaInfo::AMDGPUTargetID &TargetID) override; -}; - class MetadataStreamerMsgPackV5 final : public MetadataStreamerMsgPackV4 { protected: void emitVersion() override; @@ -159,82 +147,6 @@ public: ~MetadataStreamerMsgPackV5() = default; }; -// TODO: Rename MetadataStreamerV2 -> MetadataStreamerYamlV2. 
-class MetadataStreamerYamlV2 final : public MetadataStreamer { -private: - Metadata HSAMetadata; - - void dump(StringRef HSAMetadataString) const; - - void verify(StringRef HSAMetadataString) const; - - AccessQualifier getAccessQualifier(StringRef AccQual) const; - - AddressSpaceQualifier getAddressSpaceQualifier(unsigned AddressSpace) const; - - ValueKind getValueKind(Type *Ty, StringRef TypeQual, - StringRef BaseTypeName) const; - - std::string getTypeName(Type *Ty, bool Signed) const; - - std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const; - - Kernel::CodeProps::Metadata getHSACodeProps( - const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const; - Kernel::DebugProps::Metadata getHSADebugProps( - const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const; - - void emitPrintf(const Module &Mod); - - void emitKernelLanguage(const Function &Func); - - void emitKernelAttrs(const Function &Func); - - void emitKernelArgs(const Function &Func, const GCNSubtarget &ST); - - void emitKernelArg(const Argument &Arg); - - void emitKernelArg(const DataLayout &DL, Type *Ty, Align Alignment, - ValueKind ValueKind, - MaybeAlign PointeeAlign = std::nullopt, - StringRef Name = "", StringRef TypeName = "", - StringRef BaseTypeName = "", StringRef AccQual = "", - StringRef TypeQual = ""); - - void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST); - - const Metadata &getHSAMetadata() const { - return HSAMetadata; - } - -protected: - void emitVersion() override; - void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset, - msgpack::ArrayDocNode Args) override { - llvm_unreachable("Dummy override should not be invoked!"); - } - void emitKernelAttrs(const Function &Func, - msgpack::MapDocNode Kern) override { - llvm_unreachable("Dummy override should not be invoked!"); - } - -public: - MetadataStreamerYamlV2() = default; - ~MetadataStreamerYamlV2() = default; - - bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override; - - void begin(const Module &Mod, - const IsaInfo::AMDGPUTargetID &TargetID) override; - - void end() override; - - void emitKernel(const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) override; -}; - } // end namespace HSAMD } // end namespace AMDGPU } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index ffa6c88f9d41..0a17b1536040 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -345,13 +345,13 @@ class PipelineSolver { // return the number of edges missed. int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); - // Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It - // returns the cost (in terms of missed pipeline edges), and tracks the edges - // added in \p AddedEdges + /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. 
It + /// returns the cost (in terms of missed pipeline edges), and tracks the edges + /// added in \p AddedEdges template <typename T> int linkSUnit(SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E); - // Remove the edges passed via \p AddedEdges + /// Remove the edges passed via \p AddedEdges void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); // Convert the passed in maps to arrays for bidirectional iterators void convertSyncMapsToArrays(); @@ -593,11 +593,10 @@ void PipelineSolver::populateReadyList( for (; I != E; ++I) { std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; int CandSGID = *I; - SchedGroup *Match; - for (auto &SG : SyncPipeline) { - if (SG.getSGID() == CandSGID) - Match = &SG; - } + SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { + return SG.getSGID() == CandSGID; + }); + assert(Match); if (UseCostHeur) { if (Match->isFull()) { @@ -739,11 +738,10 @@ void PipelineSolver::greedyFind( for (; I != E; ++I) { std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; int CandSGID = *I; - SchedGroup *Match; - for (auto &SG : SyncPipeline) { - if (SG.getSGID() == CandSGID) - Match = &SG; - } + SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { + return SG.getSGID() == CandSGID; + }); + assert(Match); LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask " << (int)Match->getMask() << "\n"); @@ -849,10 +847,11 @@ protected: const SIInstrInfo *TII; public: - // Add SchedGroups to \p Pipeline to implement this Strategy. + /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy. virtual void applyIGLPStrategy( DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, - DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0; + DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, + bool IsReentry) = 0; // Returns true if this strategy should be applied to a ScheduleDAG. virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0; @@ -870,7 +869,8 @@ private: public: void applyIGLPStrategy( DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, - DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override; + DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, + bool IsReentry) override; bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } @@ -882,7 +882,8 @@ public: void MFMASmallGemmOpt::applyIGLPStrategy( DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, - DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) { + DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, + bool IsReentry) { // Count the number of MFMA instructions. unsigned MFMACount = 0; for (const MachineInstr &I : *DAG) @@ -964,11 +965,10 @@ private: // Does the VALU have a DS_WRITE successor that is the same as other // VALU already in the group. 
The V_PERMs will all share 1 DS_W succ - return std::any_of(Cache->begin(), Cache->end(), [&SU](SUnit *Elt) { - return std::any_of(SU->Succs.begin(), SU->Succs.end(), - [&Elt](const SDep &ThisSucc) { - return ThisSucc.getSUnit() == Elt; - }); + return llvm::any_of(*Cache, [&SU](SUnit *Elt) { + return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) { + return ThisSucc.getSUnit() == Elt; + }); }); } @@ -1045,8 +1045,8 @@ private: : InstructionRule(TII, SGID, NeedsCache) {} }; - // Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup - // that is /p Distance steps away + /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup + /// that is \p Distance steps away class SharesPredWithPrevNthGroup final : public InstructionRule { private: unsigned Distance = 1; @@ -1078,16 +1078,18 @@ private: Cache->push_back(Pred.getSUnit()); } } + + // If the other group has no PERM preds, then this group won't share any + if (!Cache->size()) + return false; } - assert(Cache->size()); auto DAG = SyncPipe[0].DAG; // Does the previous DS_WRITE share a V_PERM predecessor with this // VMEM_READ - return ( - std::any_of(Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *Elt) { - return DAG->IsReachable(const_cast<SUnit *>(SU), Elt); - })); + return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) { + return DAG->IsReachable(const_cast<SUnit *>(SU), Elt); + }); } SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) @@ -1097,7 +1099,8 @@ private: public: void applyIGLPStrategy( DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, - DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override; + DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, + bool IsReentry) override; bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } @@ -1107,14 +1110,20 @@ public: } }; +static unsigned DSWCount = 0; +static unsigned DSWWithPermCount = 0; +static unsigned DSWWithSharedVMEMCount = 0; + void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, - DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) { + DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, + bool IsReentry) { unsigned MFMACount = 0; - unsigned DSWCount = 0; - unsigned DSWWithPermCount = 0; - unsigned DSWWithSharedVMEMCount = 0; unsigned DSRCount = 0; + + assert((IsReentry || (DSWCount == 0 && DSWWithPermCount == 0 && + DSWWithSharedVMEMCount == 0)) && + "DSWCounters should be zero in pre-RA scheduling!"); SmallVector<SUnit *, 6> DSWithPerms; for (auto &SU : DAG->SUnits) { auto I = SU.getInstr(); @@ -1123,7 +1132,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( else if (TII->isDS(*I)) { if (I->mayLoad()) ++DSRCount; - else if (I->mayStore()) { + else if (I->mayStore() && !IsReentry) { ++DSWCount; for (auto Pred : SU.Preds) { if (Pred.getSUnit()->getInstr()->getOpcode() == @@ -1135,57 +1144,59 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( } } } - DSWWithPermCount = DSWithPerms.size(); - auto I = DSWithPerms.begin(); - auto E = DSWithPerms.end(); - - // Get the count of DS_WRITES with V_PERM predecessors which - // have loop carried dependencies (WAR) on the same VMEM_READs. - // We consider partial overlap as a miss -- in other words, - // for a given DS_W, we only consider another DS_W as matching - // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred - // for every V_PERM pred of this DS_W. 
- DenseMap<MachineInstr *, SUnit *> VMEMLookup; - SmallVector<SUnit *, 6> Counted; - for (; I != E; I++) { - SUnit *Cand = nullptr; - bool MissedAny = false; - for (auto &Pred : (*I)->Preds) { - if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) - continue; - if (Cand && - std::find(Counted.begin(), Counted.end(), Cand) != Counted.end()) - break; - - for (auto &Succ : Pred.getSUnit()->Succs) { - auto MI = Succ.getSUnit()->getInstr(); - if (!TII->isVMEM(*MI) || !MI->mayLoad()) + if (!IsReentry) { + DSWWithPermCount = DSWithPerms.size(); + auto I = DSWithPerms.begin(); + auto E = DSWithPerms.end(); + + // Get the count of DS_WRITES with V_PERM predecessors which + // have loop carried dependencies (WAR) on the same VMEM_READs. + // We consider partial overlap as a miss -- in other words, + // for a given DS_W, we only consider another DS_W as matching + // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred + // for every V_PERM pred of this DS_W. + DenseMap<MachineInstr *, SUnit *> VMEMLookup; + SmallVector<SUnit *, 6> Counted; + for (; I != E; I++) { + SUnit *Cand = nullptr; + bool MissedAny = false; + for (auto &Pred : (*I)->Preds) { + if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) continue; - if (MissedAny || !VMEMLookup.size()) { - MissedAny = true; - VMEMLookup[MI] = *I; - continue; - } + if (Cand && llvm::is_contained(Counted, Cand)) + break; - if (!VMEMLookup.contains(MI)) { - MissedAny = true; - VMEMLookup[MI] = *I; - continue; - } + for (auto &Succ : Pred.getSUnit()->Succs) { + auto MI = Succ.getSUnit()->getInstr(); + if (!TII->isVMEM(*MI) || !MI->mayLoad()) + continue; - Cand = VMEMLookup[MI]; - if (std::find(Counted.begin(), Counted.end(), Cand) != Counted.end()) { - MissedAny = true; - break; + if (MissedAny || !VMEMLookup.size()) { + MissedAny = true; + VMEMLookup[MI] = *I; + continue; + } + + if (!VMEMLookup.contains(MI)) { + MissedAny = true; + VMEMLookup[MI] = *I; + continue; + } + + Cand = VMEMLookup[MI]; + if (llvm::is_contained(Counted, Cand)) { + MissedAny = true; + break; + } } } - } - if (!MissedAny && Cand) { - DSWWithSharedVMEMCount += 2; - Counted.push_back(Cand); - Counted.push_back(*I); + if (!MissedAny && Cand) { + DSWWithSharedVMEMCount += 2; + Counted.push_back(Cand); + Counted.push_back(*I); + } } } @@ -1401,7 +1412,11 @@ public: // first created SchedGroup first. bool IsBottomUp = 1; + // Whether or not this is a reentry into the IGroupLPDAGMutation. + bool IsReentry = false; + IGroupLPDAGMutation() = default; + IGroupLPDAGMutation(bool IsReentry) : IsReentry(IsReentry) {} }; unsigned SchedGroup::NumSchedGroups = 0; @@ -1689,7 +1704,7 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { auto S = createIGLPStrategy(StrategyID, DAG, TII); if (S->shouldApplyStrategy(DAG)) { IsBottomUp = S->IsBottomUp; - S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups); + S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsReentry); } } @@ -1697,8 +1712,13 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { namespace llvm { -std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() { - return std::make_unique<IGroupLPDAGMutation>(); +/// \p IsReentry specifes whether or not this is a reentry into the +/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the +/// same scheduling region (e.g. pre and post-RA scheduling / multiple +/// scheduling "phases"), we can reenter this mutation framework more than once +/// for a given region. 
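A hedged usage note for the new IsReentry parameter documented just above (the call site below is a sketch of mine, not taken from this diff): the first scheduling phase that touches a region applies the mutation with IsReentry set to false, and any later phase over the same region passes true so the file-static DSW counters gathered earlier are reused instead of being recomputed.

#include "AMDGPUIGroupLP.h"
#include "llvm/CodeGen/MachineScheduler.h"

// Hypothetical helper; presumably the real call sites live in the AMDGPU
// machine-scheduler setup, which is not shown in this diff.
static void addIGLPMutation(llvm::ScheduleDAGMI &DAG, bool RegionAlreadySeen) {
  DAG.addMutation(
      llvm::createIGroupLPDAGMutation(/*IsReentry=*/RegionAlreadySeen));
}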
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry) { + return std::make_unique<IGroupLPDAGMutation>(IsReentry); } } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index ae0faba0780d..3ec8be4f8892 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -14,7 +14,7 @@ namespace llvm { -std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(); +std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry); } // namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 825c6f0acd0f..b0eac567ec9f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -19,6 +19,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/R600MCTargetDesc.h" #include "R600RegisterInfo.h" +#include "SIISelLowering.h" #include "SIMachineFunctionInfo.h" #include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/Analysis/ValueTracking.h" @@ -81,10 +82,9 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) { // same register. static SDValue stripExtractLoElt(SDValue In) { if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) { - if (Idx->isZero() && In.getValueSizeInBits() <= 32) - return In.getOperand(0); - } + SDValue Idx = In.getOperand(1); + if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32) + return In.getOperand(0); } if (In.getOpcode() == ISD::TRUNCATE) { @@ -113,12 +113,12 @@ INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel", /// This pass converts a legalized DAG into a AMDGPU-specific // DAG, ready for instruction scheduling. 
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM, - CodeGenOpt::Level OptLevel) { + CodeGenOptLevel OptLevel) { return new AMDGPUDAGToDAGISel(TM, OptLevel); } AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM, - CodeGenOpt::Level OptLevel) + CodeGenOptLevel OptLevel) : SelectionDAGISel(ID, TM, OptLevel) { EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG; } @@ -132,7 +132,7 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { } #endif Subtarget = &MF.getSubtarget<GCNSubtarget>(); - Mode = SIModeRegisterDefaults(MF.getFunction()); + Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -164,6 +164,7 @@ bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const { case ISD::FTRUNC: case ISD::FRINT: case ISD::FNEARBYINT: + case ISD::FROUNDEVEN: case ISD::FROUND: case ISD::FFLOOR: case ISD::FMINNUM: @@ -596,11 +597,15 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { break; uint64_t Imm; - if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) + if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) { Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue(); - else { + if (AMDGPU::isValid32BitLiteral(Imm, true)) + break; + } else { ConstantSDNode *C = cast<ConstantSDNode>(N); Imm = C->getZExtValue(); + if (AMDGPU::isValid32BitLiteral(Imm, false)) + break; } SDLoc DL(N); @@ -664,6 +669,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::BRCOND: SelectBRCOND(N); return; + case ISD::FP_EXTEND: + SelectFP_EXTEND(N); + return; case AMDGPUISD::CVT_PKRTZ_F16_F32: case AMDGPUISD::CVT_PKNORM_I16_F32: case AMDGPUISD::CVT_PKNORM_U16_F32: @@ -692,6 +700,14 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectINTRINSIC_VOID(N); return; } + case AMDGPUISD::WAVE_ADDRESS: { + SelectWAVE_ADDRESS(N); + return; + } + case ISD::STACKRESTORE: { + SelectSTACKRESTORE(N); + return; + } } SelectCode(N); @@ -1136,13 +1152,69 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0, return CurDAG->SignBitIsZero(Base); } -bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base, - uint64_t FlatVariant) const { - if (FlatVariant != SIInstrFlags::FlatScratch) +// Return whether the operation has NoUnsignedWrap property. +static bool isNoUnsignedWrap(SDValue Addr) { + return (Addr.getOpcode() == ISD::ADD && + Addr->getFlags().hasNoUnsignedWrap()) || + Addr->getOpcode() == ISD::OR; +} + +// Check that the base address of flat scratch load/store in the form of `base + +// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware +// requirement). We always treat the first operand as the base address here. +bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const { + if (isNoUnsignedWrap(Addr)) return true; - // When value in 32-bit Base can be negative calculate scratch offset using - // 32-bit add instruction, otherwise use Base(unsigned) + offset. - return CurDAG->SignBitIsZero(Base); + + // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative + // values. + if (AMDGPU::isGFX12Plus(*Subtarget)) + return true; + + auto LHS = Addr.getOperand(0); + auto RHS = Addr.getOperand(1); + + // If the immediate offset is negative and within certain range, the base + // address cannot also be negative. If the base is also negative, the sum + // would be either negative or much larger than the valid range of scratch + // memory a thread can access. 
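A small worked illustration of the comment paragraph above, with numbers of my own choosing and under the assumption that valid scratch offsets stay well below 0x40000000 bytes: given an immediate offset such as -16, a base whose sign bit is set would still produce an unsigned sum of at least 0x40000000, so a well-formed access implies the base is non-negative. The ImmOp check that follows implements exactly this reasoning.

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Base = 0x80000000u; // hypothetical base with the sign bit set
  int32_t Imm = -16;           // negative offset within (-0x40000000, 0)
  uint32_t Sum = Base + static_cast<uint32_t>(Imm);
  // Prints 0x7ffffff0, which is still >= 0x40000000 and therefore far outside
  // the scratch range a single thread can legally address.
  std::printf("0x%08x\n", Sum);
  return 0;
}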
+ ConstantSDNode *ImmOp = nullptr; + if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) { + if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000) + return true; + } + + return CurDAG->SignBitIsZero(LHS); +} + +// Check address value in SGPR/VGPR are legal for flat scratch in the form +// of: SGPR + VGPR. +bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const { + if (isNoUnsignedWrap(Addr)) + return true; + + auto LHS = Addr.getOperand(0); + auto RHS = Addr.getOperand(1); + return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS); +} + +// Check address value in SGPR/VGPR are legal for flat scratch in the form +// of: SGPR + VGPR + Imm. +bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const { + auto Base = Addr.getOperand(0); + auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1)); + // If the immediate offset is negative and within certain range, the base + // address cannot also be negative. If the base is also negative, the sum + // would be either negative or much larger than the valid range of scratch + // memory a thread can access. + if (isNoUnsignedWrap(Base) && + (isNoUnsignedWrap(Addr) || + (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000))) + return true; + + auto LHS = Base.getOperand(0); + auto RHS = Base.getOperand(1); + return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS); } // TODO: If offset is too big, put low 16-bit into offset. @@ -1252,7 +1324,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); - SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); + SOffset = Subtarget->hasRestrictedSOffset() + ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32) + : CurDAG->getTargetConstant(0, DL, MVT::i32); ConstantSDNode *C1 = nullptr; SDValue N0 = Addr; @@ -1307,7 +1381,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, return true; } - if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) { // Legal offset for instruction. Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32); return true; @@ -1381,7 +1456,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS); // Don't fold null pointer. if (Imm != NullPtr) { - const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(); + const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget); SDValue HighBits = CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32); MachineSDNode *MovHighBits = CurDAG->getMachineNode( @@ -1415,8 +1490,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, // Therefore it should be safe to fold any VGPR offset on gfx9 into the // MUBUF vaddr, but not on older subtargets which can only do this if the // sign bit is known 0. 
+ const SIInstrInfo *TII = Subtarget->getInstrInfo(); ConstantSDNode *C1 = cast<ConstantSDNode>(N1); - if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) && + if (TII->isLegalMUBUFImmOffset(C1->getZExtValue()) && (!Subtarget->privateMemoryResourceIsRangeChecked() || CurDAG->SignBitIsZero(N0))) { std::tie(VAddr, SOffset) = foldFrameIndex(N0); @@ -1448,6 +1524,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue &Offset) const { const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + const SIInstrInfo *TII = Subtarget->getInstrInfo(); MachineFunction &MF = CurDAG->getMachineFunction(); const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); SDLoc DL(Addr); @@ -1464,14 +1541,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, if (Addr.getOpcode() == ISD::ADD) { // Add (CopyFromReg <sgpr>) <constant> CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1)); - if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) + if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) return false; if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0))) return false; SOffset = Addr.getOperand(0); } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) && - SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) { + TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) { // <constant> SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); } else { @@ -1488,8 +1565,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset ) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = Subtarget->getInstrInfo(); if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64)) return false; @@ -1510,6 +1586,21 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, return false; } +bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode, + SDValue &SOffset) const { + if (Subtarget->hasRestrictedSOffset()) { + if (auto SOffsetConst = dyn_cast<ConstantSDNode>(ByteOffsetNode)) { + if (SOffsetConst->isZero()) { + SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32); + return true; + } + } + } + + SOffset = ByteOffsetNode; + return true; +} + // Find a load or store from corresponding pattern root. // Roots may be build_vector, bitconvert or their combinations. 
static MemSDNode* findMemSDNode(SDNode *N) { @@ -1539,7 +1630,8 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) { SDValue N0, N1; if (isBaseWithConstantOffset64(Addr, N0, N1) && - isFlatScratchBaseLegal(N0, FlatVariant)) { + (FlatVariant != SIInstrFlags::FlatScratch || + isFlatScratchBaseLegal(Addr))) { int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); @@ -1614,7 +1706,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, } VAddr = Addr; - Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16); + Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32); return true; } @@ -1682,7 +1774,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); VOffset = SDValue(VMov, 0); SAddr = LHS; - Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); + Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32); return true; } } @@ -1722,7 +1814,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, } if (SAddr) { - Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); return true; } } @@ -1738,7 +1830,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, CurDAG->getTargetConstant(0, SDLoc(), MVT::i32)); VOffset = SDValue(VMov, 0); - Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); return true; } @@ -1771,8 +1863,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, int64_t COffsetVal = 0; - if (CurDAG->isBaseWithConstantOffset(Addr) && - isFlatScratchBaseLegal(Addr.getOperand(0))) { + if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) { COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue(); SAddr = Addr.getOperand(0); } else { @@ -1829,6 +1920,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, int64_t ImmOffset = 0; SDValue LHS, RHS; + SDValue OrigAddr = Addr; if (isBaseWithConstantOffset64(Addr, LHS, RHS)) { int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); @@ -1850,7 +1942,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); VAddr = SDValue(VMov, 0); SAddr = LHS; - if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr)) + if (!isFlatScratchBaseLegal(Addr)) return false; if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) return false; @@ -1876,8 +1968,13 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return false; } - if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr)) - return false; + if (OrigAddr != Addr) { + if (!isFlatScratchBaseLegalSVImm(OrigAddr)) + return false; + } else { + if (!isFlatScratchBaseLegalSV(OrigAddr)) + return false; + } if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) return false; @@ -2249,6 +2346,33 @@ bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const { return false; } +static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) { + assert(VCMP->getOpcode() == AMDGPUISD::SETCC); + // Special case for amdgcn.ballot: + 
// %Cond = i1 (and/or combination of i1 ISD::SETCCs) + // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq + // => + // Use i1 %Cond value instead of i(WaveSize) %VCMP. + // This is possible because divergent ISD::SETCC is selected as V_CMP and + // Cond becomes a i(WaveSize) full mask value. + // Note that ballot doesn't use SETEQ condition but its easy to support it + // here for completeness, so in this case Negate is set true on return. + auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get(); + if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) && + isNullConstant(VCMP.getOperand(1))) { + + auto Cond = VCMP.getOperand(0); + if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension. + Cond = Cond.getOperand(0); + + if (isBoolSGPR(Cond)) { + Negate = VCMP_CC == ISD::SETEQ; + return Cond; + } + } + return SDValue(); +} + void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { SDValue Cond = N->getOperand(1); @@ -2262,11 +2386,50 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { const SIRegisterInfo *TRI = ST->getRegisterInfo(); bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N); - unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ; + bool AndExec = !UseSCCBr; + bool Negate = false; + + if (Cond.getOpcode() == ISD::SETCC && + Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) { + SDValue VCMP = Cond->getOperand(0); + auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get(); + if ((CC == ISD::SETEQ || CC == ISD::SETNE) && + isNullConstant(Cond->getOperand(1)) && + // TODO: make condition below an assert after fixing ballot bitwidth. + VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) { + // %VCMP = i(WaveSize) AMDGPUISD::SETCC ... + // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq + // BRCOND i1 %C, %BB + // => + // %VCMP = i(WaveSize) AMDGPUISD::SETCC ... + // VCC = COPY i(WaveSize) %VCMP + // S_CBRANCH_VCCNZ/VCCZ %BB + Negate = CC == ISD::SETEQ; + bool NegatedBallot = false; + if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) { + Cond = BallotCond; + UseSCCBr = !BallotCond->isDivergent(); + Negate = Negate ^ NegatedBallot; + } else { + // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always + // selected as V_CMP, but this may change for uniform condition. + Cond = VCMP; + UseSCCBr = false; + } + } + // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of + // V_CMPs resulted from ballot or ballot has uniform condition and SCC is + // used. + AndExec = false; + } + + unsigned BrOp = + UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1) + : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ); Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC(); SDLoc SL(N); - if (!UseSCCBr) { + if (AndExec) { // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not // analyzed what generates the vcc value, so we do not know whether vcc // bits for disabled lanes are 0. 
Thus we need to mask out bits for @@ -2296,6 +2459,22 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { VCC.getValue(0)); } +void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) { + if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 && + !N->isDivergent()) { + SDValue Src = N->getOperand(0); + if (Src.getValueType() == MVT::f16) { + if (isExtractHiElt(Src, Src)) { + CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(), + {Src}); + return; + } + } + } + + SelectCode(N); +} + void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) { // The address is assumed to be uniform, so if it ends up in a VGPR, it will // be copied to an SGPR with readfirstlane. @@ -2369,8 +2548,9 @@ static unsigned gwsIntrinToOpcode(unsigned IntrID) { } void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { - if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all && - !Subtarget->hasGWSSemaReleaseAll()) { + if (!Subtarget->hasGWS() || + (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all && + !Subtarget->hasGWSSemaReleaseAll())) { // Let this error. SelectCode(N); return; @@ -2568,6 +2748,45 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { SelectCode(N); } +void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) { + SDValue Log2WaveSize = + CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32); + CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(), + {N->getOperand(0), Log2WaveSize}); +} + +void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) { + SDValue SrcVal = N->getOperand(1); + if (SrcVal.getValueType() != MVT::i32) { + SelectCode(N); // Emit default error + return; + } + + SDValue CopyVal; + Register SP = TLI->getStackPointerRegisterToSaveRestore(); + SDLoc SL(N); + + if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) { + CopyVal = SrcVal.getOperand(0); + } else { + SDValue Log2WaveSize = CurDAG->getTargetConstant( + Subtarget->getWavefrontSizeLog2(), SL, MVT::i32); + + if (N->isDivergent()) { + SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, + MVT::i32, SrcVal), + 0); + } + + CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32, + {SrcVal, Log2WaveSize}), + 0); + } + + SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP); +} + bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &Mods, bool IsCanonicalizing, @@ -2948,7 +3167,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { if (!RC || SIRI->isSGPRClass(RC)) return false; - if (RC != &AMDGPU::VS_32RegClass) { + if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) { AllUsesAcceptSReg = false; SDNode * User = *U; if (User->isMachineOpcode()) { @@ -2960,7 +3179,8 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) { unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs(); const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo); - if (CommutedRC == &AMDGPU::VS_32RegClass) + if (CommutedRC == &AMDGPU::VS_32RegClass || + CommutedRC == &AMDGPU::VS_64RegClass) AllUsesAcceptSReg = true; } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 0605baf3a0cc..374108af08cd 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -92,7 +92,7 @@ public: AMDGPUDAGToDAGISel() = delete; - explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel); + explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOptLevel OptLevel); ~AMDGPUDAGToDAGISel() override = default; void getAnalysisUsage(AnalysisUsage &AU) const override; @@ -154,8 +154,10 @@ private: bool isDSOffsetLegal(SDValue Base, unsigned Offset) const; bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1, unsigned Size) const; - bool isFlatScratchBaseLegal( - SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const; + + bool isFlatScratchBaseLegal(SDValue Addr) const; + bool isFlatScratchBaseLegalSV(SDValue Addr) const; + bool isFlatScratchBaseLegalSVImm(SDValue Addr) const; bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, @@ -177,6 +179,7 @@ private: bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; + bool SelectBUFSOffset(SDValue Addr, SDValue &SOffset) const; bool SelectFlatOffsetImpl(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset, uint64_t FlatVariant) const; @@ -273,6 +276,7 @@ private: bool isCBranchSCC(const SDNode *N) const; void SelectBRCOND(SDNode *N); void SelectFMAD_FMA(SDNode *N); + void SelectFP_EXTEND(SDNode *N); void SelectDSAppendConsume(SDNode *N, unsigned IntrID); void SelectDSBvhStackIntrinsic(SDNode *N); void SelectDS_GWS(SDNode *N, unsigned IntrID); @@ -280,6 +284,8 @@ private: void SelectINTRINSIC_W_CHAIN(SDNode *N); void SelectINTRINSIC_WO_CHAIN(SDNode *N); void SelectINTRINSIC_VOID(SDNode *N); + void SelectWAVE_ADDRESS(SDNode *N); + void SelectSTACKRESTORE(SDNode *N); protected: // Include the pieces autogenerated from the target description. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 39e00a037bdd..9d7443012e3d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -323,24 +323,26 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand); - // This is totally unsupported, just custom lower to produce an error. + // For R600, this is totally unsupported, just custom lower to produce an + // error. setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); // Library functions. These default to Expand, but we have instructions // for them. 
- setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR, ISD::FRINT, - ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM}, + setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR, + ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal); setOperationAction(ISD::FLOG2, MVT::f32, Custom); setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom); - setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2}, MVT::f32, - Custom); + setOperationAction( + {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32, + Custom); setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom); - setOperationAction(ISD::FROUNDEVEN, {MVT::f16, MVT::f32, MVT::f64}, Custom); + setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom); setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom); @@ -351,7 +353,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom); } - setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP}, MVT::f16, Custom); + setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16, + Custom); // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches // scalarization code. Can be removed when IS_FPCLASS expand isn't called by @@ -383,7 +386,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, MVT::v4i64, MVT::v8f64, MVT::v8i64, - MVT::v16f64, MVT::v16i64}, + MVT::v16f64, MVT::v16i64, MVT::v32i16, MVT::v32f16}, Custom); setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); @@ -456,14 +459,17 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, for (MVT VT : FloatVectorTypes) { setOperationAction( - {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD, - ISD::FCEIL, ISD::FCOS, ISD::FDIV, ISD::FEXP2, - ISD::FEXP, ISD::FLOG2, ISD::FREM, ISD::FLOG, - ISD::FLOG10, ISD::FPOW, ISD::FFLOOR, ISD::FTRUNC, - ISD::FMUL, ISD::FMA, ISD::FRINT, ISD::FNEARBYINT, - ISD::FSQRT, ISD::FSIN, ISD::FSUB, ISD::FNEG, - ISD::VSELECT, ISD::SELECT_CC, ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, - ISD::SETCC, ISD::FCANONICALIZE}, + {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM, + ISD::FADD, ISD::FCEIL, ISD::FCOS, + ISD::FDIV, ISD::FEXP2, ISD::FEXP, + ISD::FEXP10, ISD::FLOG2, ISD::FREM, + ISD::FLOG, ISD::FLOG10, ISD::FPOW, + ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL, + ISD::FMA, ISD::FRINT, ISD::FNEARBYINT, + ISD::FSQRT, ISD::FSIN, ISD::FSUB, + ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC, + ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC, + ISD::FCANONICALIZE, ISD::FROUNDEVEN}, VT, Expand); } @@ -579,11 +585,14 @@ static bool fnegFoldsIntoOpcode(unsigned Opc) { case ISD::FMAXNUM: case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: + case ISD::FMINIMUM: + case ISD::FMAXIMUM: case ISD::SELECT: case ISD::FSIN: case ISD::FTRUNC: case ISD::FRINT: case ISD::FNEARBYINT: + case ISD::FROUNDEVEN: case ISD::FCANONICALIZE: case AMDGPUISD::RCP: case AMDGPUISD::RCP_LEGACY: @@ -1001,6 +1010,9 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::AMDGPU_ES: case CallingConv::AMDGPU_LS: return CC_AMDGPU; + case CallingConv::AMDGPU_CS_Chain: + case CallingConv::AMDGPU_CS_ChainPreserve: + return CC_AMDGPU_CS_CHAIN; case CallingConv::C: case CallingConv::Fast: case CallingConv::Cold: @@ -1024,6 +1036,8 @@ CCAssignFn 
*AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_CS_Chain: + case CallingConv::AMDGPU_CS_ChainPreserve: case CallingConv::AMDGPU_HS: case CallingConv::AMDGPU_ES: case CallingConv::AMDGPU_LS: @@ -1315,6 +1329,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FLOG10: return LowerFLOGCommon(Op, DAG); case ISD::FEXP: + case ISD::FEXP10: return lowerFEXP(Op, DAG); case ISD::FEXP2: return lowerFEXP2(Op, DAG); @@ -1360,6 +1375,7 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Lowered); return; case ISD::FEXP: + case ISD::FEXP10: if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG)) Results.push_back(Lowered); return; @@ -1714,7 +1730,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue, LoMemVT, BaseAlign, Load->getMemOperand()->getFlags()); - SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size)); + SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size)); SDValue HiLoad = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), @@ -2362,7 +2378,8 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); } -SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op, + SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); @@ -2389,18 +2406,19 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); } -SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, + SelectionDAG &DAG) const { // FNEARBYINT and FRINT are the same, except in their handling of FP // exceptions. Those aren't really meaningful for us, and OpenCL only has // rint, so just treat them as equivalent. - return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); + return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(), + Op.getOperand(0)); } -SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op, - SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { auto VT = Op.getValueType(); auto Arg = Op.getOperand(0u); - return DAG.getNode(ISD::FRINT, SDLoc(Op), VT, Arg); + return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg); } // XXX - May require not supporting f32 denormals? 
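The FRINT/FNEARBYINT hunks above rely on all three operations agreeing under the default round-to-nearest-even mode, differing only in FP-exception behaviour. A small host-side check of that claim in plain C++ with no AMDGPU dependency; roundeven is emulated through remainder() rather than assuming std::roundeven is available:

#include <cassert>
#include <cmath>
#include <cstdio>

// Round-half-to-even independent of the rounding mode:
// remainder() always rounds the quotient to nearest, ties to even.
static double roundEven(double X) { return X - std::remainder(X, 1.0); }

int main() {
  const double Tests[] = {2.5, 3.5, -2.5, -0.5, 1.25, 7.0, -8.75};
  for (double X : Tests) {
    // In the default mode, rint and nearbyint agree with round-to-even;
    // they differ only in whether FE_INEXACT may be raised.
    assert(std::rint(X) == roundEven(X));
    assert(std::nearbyint(X) == roundEven(X));
    std::printf("%6.2f -> %5.1f\n", X, roundEven(X));
  }
  return 0;
}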
@@ -2423,18 +2441,16 @@ SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { const SDValue Zero = DAG.getConstantFP(0.0, SL, VT); const SDValue One = DAG.getConstantFP(1.0, SL, VT); - const SDValue Half = DAG.getConstantFP(0.5, SL, VT); - - SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X); EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + const SDValue Half = DAG.getConstantFP(0.5, SL, VT); SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); + SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero); - SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero); - - return DAG.getNode(ISD::FADD, SL, VT, T, Sel); + SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X); + return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset); } SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { @@ -2468,7 +2484,18 @@ static bool valueIsKnownNeverF32Denorm(SDValue Src) { case ISD::FP_EXTEND: return Src.getOperand(0).getValueType() == MVT::f16; case ISD::FP16_TO_FP: + case ISD::FFREXP: return true; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntrinsicID = + cast<ConstantSDNode>(Src.getOperand(0))->getZExtValue(); + switch (IntrinsicID) { + case Intrinsic::amdgcn_frexp_mant: + return true; + default: + return false; + } + } default: return false; } @@ -2476,15 +2503,17 @@ static bool valueIsKnownNeverF32Denorm(SDValue Src) { llvm_unreachable("covered opcode switch"); } -static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags) { +bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG, + SDNodeFlags Flags) { if (Flags.hasApproximateFuncs()) return true; auto &Options = DAG.getTarget().Options; return Options.UnsafeFPMath || Options.ApproxFuncFPMath; } -static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, - SDNodeFlags Flags) { +bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG, + SDValue Src, + SDNodeFlags Flags) { return !valueIsKnownNeverF32Denorm(Src) && DAG.getMachineFunction() .getDenormalMode(APFloat::IEEEsingle()) @@ -2697,7 +2726,8 @@ SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const { EVT VT = Src.getValueType(); - unsigned LogOp = VT == MVT::f32 ? AMDGPUISD::LOG : ISD::FLOG2; + unsigned LogOp = + VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2; double Log2BaseInverted = IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2; @@ -2782,14 +2812,95 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags); } -SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, +SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const { - // exp2(M_LOG2E_F * f); - EVT VT = Op.getValueType(); - const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Op, K, Flags); - return DAG.getNode(VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2, SL, VT, Mul, + EVT VT = X.getValueType(); + const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT); + + if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) { + // exp2(M_LOG2E_F * f); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags); + return DAG.getNode(VT == MVT::f32 ? 
(unsigned)AMDGPUISD::EXP + : (unsigned)ISD::FEXP2, + SL, VT, Mul, Flags); + } + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + + SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT); + SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT); + + SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT); + + SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags); + + SDValue AdjustedX = + DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X); + + SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags); + + SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags); + + SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT); + SDValue AdjustedResult = + DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags); + + return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2, + Flags); +} + +/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be +/// handled correctly. +SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL, + SelectionDAG &DAG, + SDNodeFlags Flags) const { + const EVT VT = X.getValueType(); + const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2; + + if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) { + // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f); + SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT); + SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT); + + SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags); + SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags); + SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags); + SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags); + return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1); + } + + // bool s = x < -0x1.2f7030p+5f; + // x += s ? 0x1.0p+5f : 0.0f; + // exp10 = exp2(x * 0x1.a92000p+1f) * + // exp2(x * 0x1.4f0978p-11f) * + // (s ? 
0x1.9f623ep-107f : 1.0f); + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + + SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT); + SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT); + + SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT); + SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags); + SDValue AdjustedX = + DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X); + + SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT); + SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT); + + SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags); + SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags); + SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags); + SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags); + + SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags); + + SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT); + SDValue AdjustedResult = + DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags); + + return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps, Flags); } @@ -2798,7 +2909,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue X = Op.getOperand(0); SDNodeFlags Flags = Op->getFlags(); - const bool IsExp10 = false; // TODO: For some reason exp10 is missing + const bool IsExp10 = Op.getOpcode() == ISD::FEXP10; if (VT.getScalarType() == MVT::f16) { // v_exp_f16 (fmul x, log2e) @@ -2822,9 +2933,9 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying // library behavior. Also, is known-not-daz source sufficient? - if (allowApproxFunc(DAG, Flags) && !needsDenormHandlingF32(DAG, X, Flags)) { - assert(!IsExp10 && "todo exp10 support"); - return lowerFEXPUnsafe(X, SL, DAG, Flags); + if (allowApproxFunc(DAG, Flags)) { + return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags) + : lowerFEXPUnsafe(X, SL, DAG, Flags); } // Algorithm: @@ -2891,7 +3002,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags); } - SDValue E = DAG.getNode(ISD::FRINT, SL, VT, PH, Flags); + SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags); // It is unsafe to contract this fsub into the PH multiply. SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract); @@ -3698,8 +3809,7 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( case Intrinsic::amdgcn_rsq: case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_rsq_legacy: - case Intrinsic::amdgcn_rsq_clamp: - case Intrinsic::amdgcn_ldexp: { + case Intrinsic::amdgcn_rsq_clamp: { // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted SDValue Src = N->getOperand(1); return Src.isUndef() ? Src : SDValue(); @@ -4012,8 +4122,7 @@ static SDValue getAddOneOp(const SDNode *V) { if (V->getOpcode() != ISD::ADD) return SDValue(); - auto *C = dyn_cast<ConstantSDNode>(V->getOperand(1)); - return C && C->isOne() ? V->getOperand(0) : SDValue(); + return isOneConstant(V->getOperand(1)) ? 
V->getOperand(0) : SDValue(); } SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, @@ -4243,8 +4352,7 @@ SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG, SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const { - ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); - if (!CmpRhs || !CmpRhs->isZero()) + if (!isNullConstant(Cond.getOperand(1))) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -4466,6 +4574,10 @@ static unsigned inverseMinMax(unsigned Opc) { return ISD::FMINNUM_IEEE; case ISD::FMINNUM_IEEE: return ISD::FMAXNUM_IEEE; + case ISD::FMAXIMUM: + return ISD::FMINIMUM; + case ISD::FMINIMUM: + return ISD::FMAXIMUM; case AMDGPUISD::FMAX_LEGACY: return AMDGPUISD::FMIN_LEGACY; case AMDGPUISD::FMIN_LEGACY: @@ -4589,6 +4701,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, case ISD::FMINNUM: case ISD::FMAXNUM_IEEE: case ISD::FMINNUM_IEEE: + case ISD::FMINIMUM: + case ISD::FMAXIMUM: case AMDGPUISD::FMAX_LEGACY: case AMDGPUISD::FMIN_LEGACY: { // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y) @@ -4638,6 +4752,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, case ISD::FTRUNC: case ISD::FRINT: case ISD::FNEARBYINT: // XXX - Should fround be handled? + case ISD::FROUNDEVEN: case ISD::FSIN: case ISD::FCANONICALIZE: case AMDGPUISD::RCP: @@ -4999,6 +5114,36 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performAssertSZExtCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return performIntrinsicWOChainCombine(N, DCI); + case AMDGPUISD::FMAD_FTZ: { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + EVT VT = N->getValueType(0); + + // FMAD_FTZ is a FMAD + flush denormals to zero. + // We flush the inputs, the intermediate step, and the output. + ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); + ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); + ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2); + if (N0CFP && N1CFP && N2CFP) { + const auto FTZ = [](const APFloat &V) { + if (V.isDenormal()) { + APFloat Zero(V.getSemantics(), 0); + return V.isNegative() ? 
-Zero : Zero; + } + return V; + }; + + APFloat V0 = FTZ(N0CFP->getValueAPF()); + APFloat V1 = FTZ(N1CFP->getValueAPF()); + APFloat V2 = FTZ(N2CFP->getValueAPF()); + V0.multiply(V1, APFloat::rmNearestTiesToEven); + V0 = FTZ(V0); + V0.add(V2, APFloat::rmNearestTiesToEven); + return DAG.getConstantFP(FTZ(V0), DL, VT); + } + break; + } } return SDValue(); } @@ -5140,8 +5285,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CALL) NODE_NAME_CASE(TC_RETURN) NODE_NAME_CASE(TC_RETURN_GFX) + NODE_NAME_CASE(TC_RETURN_CHAIN) NODE_NAME_CASE(TRAP) NODE_NAME_CASE(RET_GLUE) + NODE_NAME_CASE(WAVE_ADDRESS) NODE_NAME_CASE(RETURN_TO_EPILOG) NODE_NAME_CASE(ENDPGM) NODE_NAME_CASE(ENDPGM_TRAP) @@ -5166,6 +5313,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMED3) NODE_NAME_CASE(SMED3) NODE_NAME_CASE(UMED3) + NODE_NAME_CASE(FMAXIMUM3) + NODE_NAME_CASE(FMINIMUM3) NODE_NAME_CASE(FDOT2) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DIV_SCALE) @@ -5620,6 +5769,8 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, case AMDGPUISD::FMED3: case AMDGPUISD::FMIN3: case AMDGPUISD::FMAX3: + case AMDGPUISD::FMINIMUM3: + case AMDGPUISD::FMAXIMUM3: case AMDGPUISD::FMAD_FTZ: { if (SNaN) return true; @@ -5734,12 +5885,6 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { } } -bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal( - unsigned Opc, LLT Ty1, LLT Ty2) const { - return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) && - Ty2 == LLT::scalar(32); -} - /// Whether it is profitable to sink the operands of an /// Instruction I to the basic block of I. /// This helps using several modifiers (like abs and neg) more often. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index c39093b9bb6b..827fb106b551 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -61,6 +61,9 @@ protected: SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags); + static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, + SDNodeFlags Flags); SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const; SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const; @@ -77,6 +80,8 @@ protected: SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const; + SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, + SDNodeFlags Flags) const; SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const; @@ -242,9 +247,7 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; - SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, - SelectionDAG &DAG) const; - + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; void ReplaceNodeResults(SDNode * N, @@ -371,9 +374,6 @@ public: AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; - bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1, - LLT Ty2) 
const override; - bool shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops) const override; }; @@ -391,6 +391,7 @@ enum NodeType : unsigned { CALL, TC_RETURN, TC_RETURN_GFX, + TC_RETURN_CHAIN, TRAP, // Masked control flow nodes. @@ -410,6 +411,10 @@ enum NodeType : unsigned { // Return with values from a non-entry function. RET_GLUE, + // Convert a unswizzled wave uniform stack address to an address compatible + // with a vector offset for use in stack access. + WAVE_ADDRESS, + DWORDADDR, FRACT, @@ -444,6 +449,8 @@ enum NodeType : unsigned { FMED3, SMED3, UMED3, + FMAXIMUM3, + FMINIMUM3, FDOT2, URECIP, DIV_SCALE, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp new file mode 100644 index 000000000000..e5fbcca1e7d1 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp @@ -0,0 +1,336 @@ +//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa +// or dim=2darraymsaa into a single image_msaa_load intrinsic if: +// +// - they refer to the same vaddr except for sample_id, +// - they use a constant sample_id and they fall into the same group, +// - they have the same dmask and the number of intrinsics and the number of +// vaddr/vdata dword transfers is reduced by the combine. +// +// Examples for the tradeoff (all are assuming 2DMsaa for vaddr): +// +// +----------+-----+-----+-------+---------+------------+---------+----------+ +// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? | +// | (dmask) | | | | vdata | | vdata | | +// +----------+-----+-----+-------+---------+------------+---------+----------+ +// | 1 | 0 | 0 | 4 | 12 / 4 | 1 | 3 / 4 | yes | +// +----------+-----+-----+-------+---------+------------+---------+----------+ +// | 1 | 0 | 0 | 2 | 6 / 2 | 1 | 3 / 4 | yes? | +// +----------+-----+-----+-------+---------+------------+---------+----------+ +// | 2 | 0 | 0 | 4 | 12 / 8 | 2 | 6 / 8 | yes | +// +----------+-----+-----+-------+---------+------------+---------+----------+ +// | 2 | 0 | 0 | 2 | 6 / 4 | 2 | 6 / 8 | no | +// +----------+-----+-----+-------+---------+------------+---------+----------+ +// | 1 | 0 | 1 | 2 | 6 / 2 | 1 | 3 / 2 | yes | +// +----------+-----+-----+-------+---------+------------+---------+----------+ +// +// Some cases are of questionable benefit, like the one marked with "yes?" +// above: fewer intrinsics and fewer vaddr and fewer total transfers between SP +// and TX, but higher vdata. We start by erring on the side of converting these +// to MSAA_LOAD. 
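The yes/no column of the table can be reproduced with the dword-transfer arithmetic that optimizeSection() applies further down; this standalone sketch uses hypothetical helper names and ignores a16, which is zero in every row above:

#include <cstdio>

// Dwords moved for N plain image_load calls vs. the combined
// image_msaa_load form, mirroring the arithmetic in optimizeSection().
static bool worthCombining(unsigned DMaskPopCnt, bool IsD16, unsigned NumLoads) {
  unsigned NumElts = DMaskPopCnt;        // one msaa_load per enabled channel
  unsigned NumMsaas = NumElts;
  unsigned PerDword = IsD16 ? 2 : 1;     // d16 packs two values per dword
  unsigned NumVAddrLoads = 3 * NumLoads;
  unsigned NumVDataLoads = (NumElts + PerDword - 1) / PerDword * NumLoads;
  unsigned NumVAddrMsaas = 3 * NumMsaas;
  unsigned NumVDataMsaas = (4 + PerDword - 1) / PerDword * NumMsaas;
  return NumLoads >= NumMsaas &&
         NumVAddrLoads + NumVDataLoads >= NumVAddrMsaas + NumVDataMsaas;
}

int main() {
  // Rows of the table above: popcount(dmask), d16, #load.
  struct { unsigned Pop; bool D16; unsigned Loads; } Rows[] = {
      {1, false, 4}, {1, false, 2}, {2, false, 4}, {2, false, 2}, {1, true, 2}};
  for (auto R : Rows)
    std::printf("popcount=%u d16=%d #load=%u -> %s\n", R.Pop, R.D16, R.Loads,
                worthCombining(R.Pop, R.D16, R.Loads) ? "yes" : "no");
  return 0;
}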
+// +// clang-format off +// +// This pass will combine intrinsics such as (not neccessarily consecutive): +// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0) +// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0) +// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0) +// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0) +// ==> +// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0) +// +// clang-format on +// +// Future improvements: +// +// - We may occasionally not want to do the combine if it increases the maximum +// register pressure. +// +// - Ensure clausing when multiple MSAA_LOAD are generated. +// +// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this +// combine only applies to gfx11, due to a limitation in gfx10: the gfx10 +// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and +// we don't know the format at compile time. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-image-intrinsic-opt" + +namespace { +class AMDGPUImageIntrinsicOptimizer : public FunctionPass { + const TargetMachine *TM; + +public: + static char ID; + + AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr) + : FunctionPass(ID), TM(TM) {} + + bool runOnFunction(Function &F) override; + +}; // End of class AMDGPUImageIntrinsicOptimizer +} // End anonymous namespace + +INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE, + "AMDGPU Image Intrinsic Optimizer", false, false) + +char AMDGPUImageIntrinsicOptimizer::ID = 0; + +void addInstToMergeableList( + IntrinsicInst *II, + SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts, + const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) { + for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) { + // Check Dim. + if (IIList.front()->getIntrinsicID() != II->getIntrinsicID()) + continue; + + // Check D16. + if (IIList.front()->getType() != II->getType()) + continue; + + // Check all arguments (DMask, VAddr, RSrc etc). + bool AllEqual = true; + assert(IIList.front()->arg_size() == II->arg_size()); + for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) { + Value *ArgList = IIList.front()->getArgOperand(I); + Value *Arg = II->getArgOperand(I); + if (I == ImageDimIntr->VAddrEnd - 1) { + // Check FragId group. + auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I)); + auto FragId = cast<ConstantInt>(II->getArgOperand(I)); + AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4); + } else { + // Check all arguments except FragId. + AllEqual = ArgList == Arg; + } + } + if (!AllEqual) + continue; + + // Add to the list. + IIList.emplace_back(II); + return; + } + + // Similar instruction not found, so add a new list. 
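Restated as a sketch, the grouping test above reduces to: identical dim, identical result type, identical operands apart from the constant sample index, and both indices in the same group of four fragments. The LoadKey record below is a hypothetical flattening for illustration, not a type used by the pass:

#include <cstdint>
#include <vector>

// Hypothetical flattened view of one image_load call.
struct LoadKey {
  unsigned IntrinsicID;            // dim (2dmsaa vs 2darraymsaa)
  unsigned ResultTypeID;           // proxy for the d16/f32 result type
  std::vector<uint64_t> OtherArgs; // dmask, coords, rsrc, ... minus FragId
  uint64_t FragId;                 // constant sample index
};

static bool mergeable(const LoadKey &A, const LoadKey &B) {
  return A.IntrinsicID == B.IntrinsicID && A.ResultTypeID == B.ResultTypeID &&
         A.OtherArgs == B.OtherArgs &&
         A.FragId / 4 == B.FragId / 4;  // same group of four fragments
}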
+ MergeableInsts.emplace_back(1, II); + LLVM_DEBUG(dbgs() << "New: " << *II << "\n"); +} + +// Collect list of all instructions we know how to merge in a subset of the +// block. It returns an iterator to the instruction after the last one analyzed. +BasicBlock::iterator collectMergeableInsts( + BasicBlock::iterator I, BasicBlock::iterator E, + SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) { + for (; I != E; ++I) { + // Don't combine if there is a store in the middle or if there is a memory + // barrier. + if (I->mayHaveSideEffects()) { + ++I; + break; + } + + // Ignore non-intrinsics. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + Intrinsic::ID IntrinID = II->getIntrinsicID(); + + // Ignore other intrinsics. + if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa && + IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa) + continue; + + // Check for constant FragId. + const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID); + const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1; + if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex))) + continue; + + LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n"); + addInstToMergeableList(II, MergeableInsts, ImageDimIntr); + } + } + + return I; +} + +bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) { + bool Modified = false; + + SmallVector<Instruction *, 4> InstrsToErase; + for (const auto &IIList : MergeableInsts) { + if (IIList.size() <= 1) + continue; + + // Assume the arguments are unchanged and later override them, if needed. + SmallVector<Value *, 16> Args(IIList.front()->args()); + + // Validate function argument and return types, extracting overloaded + // types along the way. + SmallVector<Type *, 6> OverloadTys; + Function *F = IIList.front()->getCalledFunction(); + if (!Intrinsic::getIntrinsicSignature(F, OverloadTys)) + continue; + + Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID(); + const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = + AMDGPU::getImageDimIntrinsicInfo(IntrinID); + + Type *EltTy = IIList.front()->getType()->getScalarType(); + Type *NewTy = FixedVectorType::get(EltTy, 4); + OverloadTys[0] = NewTy; + bool isD16 = EltTy->isHalfTy(); + + ConstantInt *DMask = cast<ConstantInt>( + IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex)); + unsigned DMaskVal = DMask->getZExtValue() & 0xf; + unsigned NumElts = popcount(DMaskVal); + + // Number of instructions and the number of vaddr/vdata dword transfers + // should be reduced. + unsigned NumLoads = IIList.size(); + unsigned NumMsaas = NumElts; + unsigned NumVAddrLoads = 3 * NumLoads; + unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads; + unsigned NumVAddrMsaas = 3 * NumMsaas; + unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas; + + if (NumLoads < NumMsaas || + (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas)) + continue; + + const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1; + auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex)); + const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4; + + // Create the new instructions. + IRBuilder<> B(IIList.front()); + + // Create the new image_msaa_load intrinsic. 
+ SmallVector<Instruction *, 4> NewCalls; + while (DMaskVal != 0) { + unsigned NewMaskVal = 1 << countr_zero(DMaskVal); + + Intrinsic::ID NewIntrinID; + if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa) + NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa; + else + NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa; + + Function *NewIntrin = Intrinsic::getDeclaration( + IIList.front()->getModule(), NewIntrinID, OverloadTys); + Args[ImageDimIntr->DMaskIndex] = + ConstantInt::get(DMask->getType(), NewMaskVal); + Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal); + CallInst *NewCall = B.CreateCall(NewIntrin, Args); + LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n"); + + NewCalls.push_back(NewCall); + DMaskVal -= NewMaskVal; + } + + // Create the new extractelement instructions. + for (auto &II : IIList) { + Value *VecOp = nullptr; + auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex)); + B.SetCurrentDebugLocation(II->getDebugLoc()); + if (NumElts == 1) { + VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4)); + LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n"); + } else { + VecOp = UndefValue::get(II->getType()); + for (unsigned I = 0; I < NumElts; ++I) { + VecOp = B.CreateInsertElement( + VecOp, + B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I); + LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n"); + } + } + + // Replace the old instruction. + II->replaceAllUsesWith(VecOp); + VecOp->takeName(II); + InstrsToErase.push_back(II); + } + + Modified = true; + } + + for (auto I : InstrsToErase) + I->eraseFromParent(); + + return Modified; +} + +static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) { + if (!TM) + return false; + + // This optimization only applies to GFX11 and beyond. + const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); + if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug()) + return false; + + Module *M = F.getParent(); + + // Early test to determine if the intrinsics are used. + if (std::none_of(M->begin(), M->end(), [](Function &F) { + return !F.users().empty() && + (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa || + F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa); + })) + return false; + + bool Modified = false; + for (auto &BB : F) { + BasicBlock::iterator SectionEnd; + for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; + I = SectionEnd) { + SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts; + + SectionEnd = collectMergeableInsts(I, E, MergeableInsts); + Modified |= optimizeSection(MergeableInsts); + } + } + + return Modified; +} + +bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + return imageIntrinsicOptimizerImpl(F, TM); +} + +FunctionPass * +llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) { + return new AMDGPUImageIntrinsicOptimizer(TM); +} + +PreservedAnalyses +AMDGPUImageIntrinsicOptimizerPass::run(Function &F, + FunctionAnalysisManager &AM) { + + bool Changed = imageIntrinsicOptimizerImpl(F, &TM); + return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp new file mode 100644 index 000000000000..93ed77bb6f7e --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp @@ -0,0 +1,122 @@ +//===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU +/// instructions that produce single-use VGPR values. If the value is forwarded +/// to the consumer instruction prior to VGPR writeback, the hardware can +/// then skip (kill) the VGPR write. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/Register.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCRegister.h" +#include "llvm/Pass.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-insert-single-use-vdst" + +namespace { +class AMDGPUInsertSingleUseVDST : public MachineFunctionPass { +private: + const SIInstrInfo *SII; + +public: + static char ID; + + AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {} + + void emitSingleUseVDST(MachineInstr &MI) const { + // Mark the following instruction as a single-use producer: + // s_singleuse_vdst { supr0: 1 } + BuildMI(*MI.getParent(), MI, DebugLoc(), SII->get(AMDGPU::S_SINGLEUSE_VDST)) + .addImm(0x1); + } + + bool runOnMachineFunction(MachineFunction &MF) override { + const auto &ST = MF.getSubtarget<GCNSubtarget>(); + if (!ST.hasVGPRSingleUseHintInsts()) + return false; + + SII = ST.getInstrInfo(); + const auto *TRI = &SII->getRegisterInfo(); + bool InstructionEmitted = false; + + for (MachineBasicBlock &MBB : MF) { + DenseMap<MCPhysReg, unsigned> RegisterUseCount; // TODO: MCRegUnits + + // Handle boundaries at the end of basic block separately to avoid + // false positives. If they are live at the end of a basic block then + // assume it has more uses later on. + for (const auto &Liveouts : MBB.liveouts()) + RegisterUseCount[Liveouts.PhysReg] = 2; + + for (MachineInstr &MI : reverse(MBB.instrs())) { + // All registers in all operands need to be single use for an + // instruction to be marked as a single use producer. + bool AllProducerOperandsAreSingleUse = true; + + for (const auto &Operand : MI.operands()) { + if (!Operand.isReg()) + continue; + const auto Reg = Operand.getReg(); + + // Count the number of times each register is read. + if (Operand.readsReg()) + RegisterUseCount[Reg]++; + + // Do not attempt to optimise across exec mask changes. 
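The reverse scan above can be illustrated with ordinary containers. This toy sketch counts reads while walking a block backwards and flags a def as single-use when at most one read was seen below it, seeding block live-outs with a count of two; the exec-mask reset and the VALU filter are omitted:

#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

struct ToyInst {
  std::vector<std::string> Defs;
  std::vector<std::string> Uses;
};

// Indices of instructions whose defs are each read at most once.
static std::set<size_t> findSingleUseProducers(
    const std::vector<ToyInst> &Block, const std::set<std::string> &LiveOut) {
  std::map<std::string, unsigned> UseCount;
  for (const auto &R : LiveOut)
    UseCount[R] = 2;                        // assume more uses in later blocks
  std::set<size_t> SingleUse;
  for (size_t I = Block.size(); I-- > 0;) { // walk backwards
    for (const auto &R : Block[I].Uses)
      ++UseCount[R];
    bool AllSingle = !Block[I].Defs.empty();
    for (const auto &R : Block[I].Defs) {
      if (UseCount[R] > 1)
        AllSingle = false;
      UseCount.erase(R);                    // value is dead above its def
    }
    if (AllSingle)
      SingleUse.insert(I);
  }
  return SingleUse;
}

int main() {
  // %a has one use, %b has two, %c is live out: only the def of %a qualifies.
  std::vector<ToyInst> Block = {
      {{"a"}, {}}, {{"b"}, {}}, {{"c"}, {"a", "b"}}, {{}, {"b", "c"}}};
  for (size_t I : findSingleUseProducers(Block, {"c"}))
    std::printf("instruction %zu is a single-use producer\n", I);
  return 0;
}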
+ if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { + for (auto &UsedReg : RegisterUseCount) + UsedReg.second = 2; + } + + // If we are at the point where the register first became live, + // check if the operands are single use. + if (!MI.modifiesRegister(Reg, TRI)) + continue; + if (RegisterUseCount[Reg] > 1) + AllProducerOperandsAreSingleUse = false; + // Reset uses count when a register is no longer live. + RegisterUseCount.erase(Reg); + } + if (AllProducerOperandsAreSingleUse && SIInstrInfo::isVALU(MI)) { + // TODO: Replace with candidate logging for instruction grouping + // later. + emitSingleUseVDST(MI); + InstructionEmitted = true; + } + } + } + return InstructionEmitted; + } +}; +} // namespace + +char AMDGPUInsertSingleUseVDST::ID = 0; + +char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID; + +INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE, + "AMDGPU Insert SingleUseVDST", false, false) diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 3c399e497227..ee93d9eb4c0a 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -408,6 +408,13 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, int DMaskIdx = -1, bool IsLoad = true); +/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt) +static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) { + return (SqrtOp->getType()->isFloatTy() && + (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) || + SqrtOp->getType()->isHalfTy(); +} + std::optional<Instruction *> GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Intrinsic::ID IID = II.getIntrinsicID(); @@ -437,6 +444,37 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val)); } + FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags(); + if (!FMF.allowContract()) + break; + auto *SrcCI = dyn_cast<IntrinsicInst>(Src); + if (!SrcCI) + break; + + auto IID = SrcCI->getIntrinsicID(); + // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable + // + // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and + // relaxed. + if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) { + const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI); + FastMathFlags InnerFMF = SqrtOp->getFastMathFlags(); + if (!InnerFMF.allowContract() || !SrcCI->hasOneUse()) + break; + + if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp)) + break; + + Function *NewDecl = Intrinsic::getDeclaration( + SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()}); + + InnerFMF |= FMF; + II.setFastMathFlags(InnerFMF); + + II.setCalledFunction(NewDecl); + return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0)); + } + break; } case Intrinsic::amdgcn_sqrt: @@ -450,6 +488,14 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceInstUsesWith(II, QNaN); } + // f16 amdgcn.sqrt is identical to regular sqrt. 
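Collected in one place, the conditions the rcp(sqrt) hunk checks before forming rsq amount to roughly the predicate below. CallFlags is a hypothetical flattening of the fast-math flags and !fpmath metadata, and the rationale comments are an interpretation rather than documented hardware behaviour:

// Hypothetical flattened view of the two calls in rcp(sqrt(x)) -> rsq(x).
struct CallFlags {
  bool AllowContract; // 'contract' fast-math flag
  bool ApproxFunc;    // 'afn' fast-math flag
  float FpAccuracy;   // !fpmath ulp bound, 0.0f if absent
  bool IsF32, IsF16;
  bool HasOneUse;
  bool IsGenericSqrt; // llvm.sqrt rather than llvm.amdgcn.sqrt
};

static bool canFormRsq(const CallFlags &Rcp, const CallFlags &Sqrt) {
  if (!Rcp.AllowContract || !Sqrt.AllowContract || !Sqrt.HasOneUse)
    return false;
  if (!Sqrt.IsGenericSqrt)
    return true; // amdgcn.sqrt carries no extra accuracy promise to preserve
  // llvm.sqrt needs a relaxed-precision hint (afn or >= 1 ulp), or f16.
  return (Sqrt.IsF32 && (Sqrt.ApproxFunc || Sqrt.FpAccuracy >= 1.0f)) ||
         Sqrt.IsF16;
}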
+ if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) { + Function *NewDecl = Intrinsic::getDeclaration( + II.getModule(), Intrinsic::sqrt, {II.getType()}); + II.setCalledFunction(NewDecl); + return &II; + } + break; } case Intrinsic::amdgcn_log: @@ -784,7 +830,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); if (CCmp->isNullValue()) { return IC.replaceInstUsesWith( - II, ConstantExpr::getSExt(CCmp, II.getType())); + II, IC.Builder.CreateSExt(CCmp, II.getType())); } // The result of V_ICMP/V_FCMP assembly instructions (which this @@ -946,14 +992,27 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceOperand(II, 0, UndefValue::get(Old->getType())); } case Intrinsic::amdgcn_permlane16: - case Intrinsic::amdgcn_permlanex16: { + case Intrinsic::amdgcn_permlane16_var: + case Intrinsic::amdgcn_permlanex16: + case Intrinsic::amdgcn_permlanex16_var: { // Discard vdst_in if it's not going to be read. Value *VDstIn = II.getArgOperand(0); if (isa<UndefValue>(VDstIn)) break; - ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4)); - ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5)); + // FetchInvalid operand idx. + unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 || + IID == Intrinsic::amdgcn_permlanex16) + ? 4 /* for permlane16 and permlanex16 */ + : 3; /* for permlane16_var and permlanex16_var */ + + // BoundCtrl operand idx. + // For permlane16 and permlanex16 it should be 5 + // For Permlane16_var and permlanex16_var it should be 4 + unsigned int BcIdx = FiIdx + 1; + + ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx)); + ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx)); if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue()) break; @@ -1002,50 +1061,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } - case Intrinsic::amdgcn_ldexp: { - // FIXME: This doesn't introduce new instructions and belongs in - // InstructionSimplify. - Type *Ty = II.getType(); - Value *Op0 = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - - // Folding undef to qnan is safe regardless of the FP mode. - if (isa<UndefValue>(Op0)) { - auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics())); - return IC.replaceInstUsesWith(II, QNaN); - } - - const APFloat *C = nullptr; - match(Op0, PatternMatch::m_APFloat(C)); - - // FIXME: Should flush denorms depending on FP mode, but that's ignored - // everywhere else. - // - // These cases should be safe, even with strictfp. - // ldexp(0.0, x) -> 0.0 - // ldexp(-0.0, x) -> -0.0 - // ldexp(inf, x) -> inf - // ldexp(-inf, x) -> -inf - if (C && (C->isZero() || C->isInfinity())) { - return IC.replaceInstUsesWith(II, Op0); - } - - // With strictfp, be more careful about possibly needing to flush denormals - // or not, and snan behavior depends on ieee_mode. 
- if (II.isStrictFP()) - break; - - if (C && C->isNaN()) - return IC.replaceInstUsesWith(II, ConstantFP::get(Ty, C->makeQuiet())); - - // ldexp(x, 0) -> x - // ldexp(x, undef) -> x - if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) { - return IC.replaceInstUsesWith(II, Op0); - } - - break; - } case Intrinsic::amdgcn_fmul_legacy: { Value *Op0 = II.getArgOperand(0); Value *Op1 = II.getArgOperand(1); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index f2d62956e25b..d41e704a4a11 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -14,6 +14,7 @@ #include "AMDGPUInstrInfo.h" #include "AMDGPU.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instruction.h" @@ -26,6 +27,9 @@ using namespace llvm; AMDGPUInstrInfo::AMDGPUInstrInfo(const GCNSubtarget &ST) { } +Intrinsic::ID AMDGPU::getIntrinsicID(const MachineInstr &I) { + return I.getOperand(I.getNumExplicitDefs()).getIntrinsicID(); +} // TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence. bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index e7ee36447682..515decea3921 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -21,6 +21,7 @@ namespace llvm { class GCNSubtarget; class MachineMemOperand; +class MachineInstr; class AMDGPUInstrInfo { public: @@ -31,6 +32,13 @@ public: namespace AMDGPU { +/// Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix. +/// +/// These opcodes have an Intrinsic::ID operand similar to a GIntrinsic. But +/// they are not actual instances of GIntrinsics, so we cannot use +/// GIntrinsic::getIntrinsicID() on them. +unsigned getIntrinsicID(const MachineInstr &I); + struct RsrcIntrinsic { unsigned Intr; uint8_t RsrcArg; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index b69cae0c73b3..82f58ea38fd0 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -94,6 +94,11 @@ def AMDGPUtc_return_gfx: SDNode<"AMDGPUISD::TC_RETURN_GFX", AMDGPUTCReturnTP, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; +def AMDGPUtc_return_chain: SDNode<"AMDGPUISD::TC_RETURN_CHAIN", + SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] +>; + def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP", SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>, [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue] @@ -165,6 +170,11 @@ def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp, [/*SDNPCommutative, SDNPAssociative*/] >; +// out = max(a, b, c) a, b and c are floats. Operation is IEEE2019 compliant. 
+def AMDGPUfmaximum3 : SDNode<"AMDGPUISD::FMAXIMUM3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + // out = max(a, b, c) a, b, and c are signed ints def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp, [/*SDNPCommutative, SDNPAssociative*/] @@ -180,6 +190,11 @@ def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp, [/*SDNPCommutative, SDNPAssociative*/] >; +// out = min(a, b, c) a, b and c are floats. Operation is IEEE2019 compliant. +def AMDGPUfminimum3 : SDNode<"AMDGPUISD::FMINIMUM3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + // out = min(a, b, c) a, b and c are signed ints def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp, [/*SDNPCommutative, SDNPAssociative*/] @@ -265,9 +280,6 @@ def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP", [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; -def AMDGPUround : SDNode<"ISD::FROUND", - SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>; - def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; @@ -279,11 +291,15 @@ def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>; def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>; // Signed and unsigned 24-bit multiply. The highest 8-bits are ignore -// when performing the multiply. The result is a 32-bit value. -def AMDGPUmul_u24_impl : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, +// when performing the multiply. The result is a 32 or 64 bit value. +def AMDGPUMul24Op : SDTypeProfile<1, 2, [ + SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2> +]>; + +def AMDGPUmul_u24_impl : SDNode<"AMDGPUISD::MUL_U24", AMDGPUMul24Op, [SDNPCommutative, SDNPAssociative] >; -def AMDGPUmul_i24_impl : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp, +def AMDGPUmul_i24_impl : SDNode<"AMDGPUISD::MUL_I24", AMDGPUMul24Op, [SDNPCommutative, SDNPAssociative] >; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 747f9fe2f8ae..88ef4b577424 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -21,6 +21,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -71,6 +72,13 @@ void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB, InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI); } +// Return the wave level SGPR base address if this is a wave address. +static Register getWaveAddress(const MachineInstr *Def) { + return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS + ? Def->getOperand(1).getReg() + : Register(); +} + bool AMDGPUInstructionSelector::isVCC(Register Reg, const MachineRegisterInfo &MRI) const { // The verifier is oblivious to s1 being a valid value for wavesize registers. @@ -158,11 +166,15 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { // TODO: Skip masking high bits if def is known boolean. 
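// Editorial note (illustrative, not part of the diff): a recurring change in
// the selector code below is marking the implicit SCC def of SALU
// instructions as dead whenever the flag result is unused, so later passes
// need not preserve it. The pattern is roughly:
//
//     auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B32), Dst)
//                    .addReg(Src0)
//                    .addReg(Src1);
//     MIB.setOperandDead(3); // operand 3 is the implicit-def $scc
//
// (Operand index 3 assumes the usual dst, src0, src1, implicit-def $scc
// layout of the 32-bit SALU ALU instructions, as the diff's own
// "// Dead scc" comments indicate.)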
+ bool IsSGPR = TRI.isSGPRClass(SrcRC); unsigned AndOpc = - TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32; - BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg) + IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32; + auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg) .addImm(1) .addReg(SrcReg); + if (IsSGPR) + And.setOperandDead(3); // Dead scc + BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg) .addImm(0) .addReg(MaskedReg); @@ -322,7 +334,8 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { MachineInstr *Add = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) .add(I.getOperand(1)) - .add(I.getOperand(2)); + .add(I.getOperand(2)) + .setOperandDead(3); // Dead scc I.eraseFromParent(); return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); } @@ -369,7 +382,8 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { .add(Lo2); BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) .add(Hi1) - .add(Hi2); + .add(Hi2) + .setOperandDead(3); // Dead scc } else { const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); Register CarryReg = MRI->createVirtualRegister(CarryRC); @@ -436,14 +450,18 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg) + auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg) .add(I.getOperand(2)) .add(I.getOperand(3)); - BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) - .addReg(AMDGPU::SCC); - if (!MRI->getRegClassOrNull(Dst1Reg)) - MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); + if (MRI->use_nodbg_empty(Dst1Reg)) { + CarryInst.setOperandDead(3); // Dead scc + } else { + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) + .addReg(AMDGPU::SCC); + if (!MRI->getRegClassOrNull(Dst1Reg)) + MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); + } if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) || !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) || @@ -740,7 +758,8 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const { // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) .addReg(ShiftSrc0) - .addImm(16); + .addImm(16) + .setOperandDead(3); // Dead scc MI.eraseFromParent(); return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); @@ -1001,7 +1020,7 @@ bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const { } bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { - unsigned IntrinsicID = I.getIntrinsicID(); + unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); switch (IntrinsicID) { case Intrinsic::amdgcn_if_break: { MachineBasicBlock *BB = I.getParent(); @@ -1192,36 +1211,104 @@ int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, } } - if (Size != 32) - return -1; + if (Size == 32) { + switch (P) { + case CmpInst::ICMP_NE: + return AMDGPU::S_CMP_LG_U32; + case CmpInst::ICMP_EQ: + return AMDGPU::S_CMP_EQ_U32; + case CmpInst::ICMP_SGT: + return AMDGPU::S_CMP_GT_I32; + case CmpInst::ICMP_SGE: + return AMDGPU::S_CMP_GE_I32; + case CmpInst::ICMP_SLT: + return AMDGPU::S_CMP_LT_I32; + case CmpInst::ICMP_SLE: + return AMDGPU::S_CMP_LE_I32; + case CmpInst::ICMP_UGT: + return AMDGPU::S_CMP_GT_U32; + case 
CmpInst::ICMP_UGE: + return AMDGPU::S_CMP_GE_U32; + case CmpInst::ICMP_ULT: + return AMDGPU::S_CMP_LT_U32; + case CmpInst::ICMP_ULE: + return AMDGPU::S_CMP_LE_U32; + case CmpInst::FCMP_OEQ: + return AMDGPU::S_CMP_EQ_F32; + case CmpInst::FCMP_OGT: + return AMDGPU::S_CMP_GT_F32; + case CmpInst::FCMP_OGE: + return AMDGPU::S_CMP_GE_F32; + case CmpInst::FCMP_OLT: + return AMDGPU::S_CMP_LT_F32; + case CmpInst::FCMP_OLE: + return AMDGPU::S_CMP_LE_F32; + case CmpInst::FCMP_ONE: + return AMDGPU::S_CMP_LG_F32; + case CmpInst::FCMP_ORD: + return AMDGPU::S_CMP_O_F32; + case CmpInst::FCMP_UNO: + return AMDGPU::S_CMP_U_F32; + case CmpInst::FCMP_UEQ: + return AMDGPU::S_CMP_NLG_F32; + case CmpInst::FCMP_UGT: + return AMDGPU::S_CMP_NLE_F32; + case CmpInst::FCMP_UGE: + return AMDGPU::S_CMP_NLT_F32; + case CmpInst::FCMP_ULT: + return AMDGPU::S_CMP_NGE_F32; + case CmpInst::FCMP_ULE: + return AMDGPU::S_CMP_NGT_F32; + case CmpInst::FCMP_UNE: + return AMDGPU::S_CMP_NEQ_F32; + default: + llvm_unreachable("Unknown condition code!"); + } + } - switch (P) { - case CmpInst::ICMP_NE: - return AMDGPU::S_CMP_LG_U32; - case CmpInst::ICMP_EQ: - return AMDGPU::S_CMP_EQ_U32; - case CmpInst::ICMP_SGT: - return AMDGPU::S_CMP_GT_I32; - case CmpInst::ICMP_SGE: - return AMDGPU::S_CMP_GE_I32; - case CmpInst::ICMP_SLT: - return AMDGPU::S_CMP_LT_I32; - case CmpInst::ICMP_SLE: - return AMDGPU::S_CMP_LE_I32; - case CmpInst::ICMP_UGT: - return AMDGPU::S_CMP_GT_U32; - case CmpInst::ICMP_UGE: - return AMDGPU::S_CMP_GE_U32; - case CmpInst::ICMP_ULT: - return AMDGPU::S_CMP_LT_U32; - case CmpInst::ICMP_ULE: - return AMDGPU::S_CMP_LE_U32; - default: - llvm_unreachable("Unknown condition code!"); + if (Size == 16) { + if (!STI.hasSALUFloatInsts()) + return -1; + + switch (P) { + case CmpInst::FCMP_OEQ: + return AMDGPU::S_CMP_EQ_F16; + case CmpInst::FCMP_OGT: + return AMDGPU::S_CMP_GT_F16; + case CmpInst::FCMP_OGE: + return AMDGPU::S_CMP_GE_F16; + case CmpInst::FCMP_OLT: + return AMDGPU::S_CMP_LT_F16; + case CmpInst::FCMP_OLE: + return AMDGPU::S_CMP_LE_F16; + case CmpInst::FCMP_ONE: + return AMDGPU::S_CMP_LG_F16; + case CmpInst::FCMP_ORD: + return AMDGPU::S_CMP_O_F16; + case CmpInst::FCMP_UNO: + return AMDGPU::S_CMP_U_F16; + case CmpInst::FCMP_UEQ: + return AMDGPU::S_CMP_NLG_F16; + case CmpInst::FCMP_UGT: + return AMDGPU::S_CMP_NLE_F16; + case CmpInst::FCMP_UGE: + return AMDGPU::S_CMP_NLT_F16; + case CmpInst::FCMP_ULT: + return AMDGPU::S_CMP_NGE_F16; + case CmpInst::FCMP_ULE: + return AMDGPU::S_CMP_NGT_F16; + case CmpInst::FCMP_UNE: + return AMDGPU::S_CMP_NEQ_F16; + default: + llvm_unreachable("Unknown condition code!"); + } } + + return -1; } -bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { +bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); @@ -1247,6 +1334,9 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { return Ret; } + if (I.getOpcode() == AMDGPU::G_FCMP) + return false; + int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget); if (Opcode == -1) return false; @@ -1569,8 +1659,8 @@ static unsigned gwsIntrinToOpcode(unsigned IntrID) { bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const { - if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && - !STI.hasGWSSemaReleaseAll()) + if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && + !STI.hasGWSSemaReleaseAll())) return false; // intrinsic ID, vsrc, offset @@ -1629,7 +1719,8 @@ 
bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) .addReg(BaseOffset) - .addImm(16); + .addImm(16) + .setOperandDead(3); // Dead scc BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) .addReg(M0Base); @@ -1690,7 +1781,7 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, } bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const { - if (TM.getOptLevel() > CodeGenOpt::None) { + if (TM.getOptLevel() > CodeGenOptLevel::None) { unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second; if (WGSize <= STI.getWavefrontSize()) { MachineBasicBlock *MBB = MI.getParent(); @@ -1700,6 +1791,19 @@ bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const { return true; } } + + // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait + if (STI.hasSplitBarriers()) { + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM)) + .addImm(AMDGPU::Barrier::WORKGROUP); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT)) + .addImm(AMDGPU::Barrier::WORKGROUP); + MI.eraseFromParent(); + return true; + } + return selectImpl(MI, *CoverageInfo); } @@ -1728,6 +1832,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); + const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; @@ -1812,7 +1917,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); if (BaseOpcode->Atomic) CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization - if (CPol & ~AMDGPU::CPol::ALL) + if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12)) return false; int NumVAddrRegs = 0; @@ -1847,7 +1952,10 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( ++NumVDataDwords; int Opcode = -1; - if (IsGFX11Plus) { + if (IsGFX12Plus) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12, + NumVDataDwords, NumVAddrDwords); + } else if (IsGFX11Plus) { Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, UseNSA ? AMDGPU::MIMGEncGfx11NSA : AMDGPU::MIMGEncGfx11Default, @@ -1920,7 +2028,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( if (IsGFX10Plus) MIB.addImm(DimInfo->Encoding); - MIB.addImm(Unorm); + if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm)) + MIB.addImm(Unorm); MIB.addImm(CPol); MIB.addImm(IsA16 && // a16 or r128 @@ -1935,7 +2044,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( return false; } - MIB.addImm(LWE); // lwe + if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe)) + MIB.addImm(LWE); // lwe if (!IsGFX10Plus) MIB.addImm(DimInfo->DA ? 
-1 : 0); if (BaseOpcode->HasD16) @@ -2008,7 +2118,7 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic( bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( MachineInstr &I) const { - unsigned IntrinsicID = I.getIntrinsicID(); + unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); switch (IntrinsicID) { case Intrinsic::amdgcn_end_cf: return selectEndCfIntrinsic(I); @@ -2046,6 +2156,16 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( break; case Intrinsic::amdgcn_ds_bvh_stack_rtn: return selectDSBvhStackIntrinsic(I); + case Intrinsic::amdgcn_s_barrier_init: + case Intrinsic::amdgcn_s_barrier_join: + case Intrinsic::amdgcn_s_wakeup_barrier: + case Intrinsic::amdgcn_s_get_barrier_state: + return selectNamedBarrierInst(I, IntrinsicID); + case Intrinsic::amdgcn_s_barrier_signal_isfirst: + case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: + return selectSBarrierSignalIsfirst(I, IntrinsicID); + case Intrinsic::amdgcn_s_barrier_leave: + return selectSBarrierLeave(I); } return selectImpl(I, *CoverageInfo); } @@ -2194,7 +2314,8 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { } else { BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) .addReg(HiReg) - .addImm(16); + .addImm(16) + .setOperandDead(3); // Dead scc } unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; @@ -2203,12 +2324,17 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) .addImm(0xffff); - BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) + auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) .addReg(LoReg) .addReg(ImmReg); - BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) + auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) .addReg(TmpReg0) .addReg(TmpReg1); + + if (!IsVALU) { + And.setOperandDead(3); // Dead scc + Or.setOperandDead(3); // Dead scc + } } I.eraseFromParent(); @@ -2353,7 +2479,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { if (Signed) { BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg) .addReg(SrcReg, 0, SubReg) - .addImm(31); + .addImm(31) + .setOperandDead(3); // Dead scc } else { BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) .addImm(0); @@ -2397,7 +2524,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { if (!Signed && shouldUseAndMask(SrcSize, Mask)) { BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) .addReg(SrcReg) - .addImm(Mask); + .addImm(Mask) + .setOperandDead(3); // Dead scc } else { BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) .addReg(SrcReg) @@ -2411,16 +2539,54 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { return false; } +static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, + Register &Out) { + Register LShlSrc; + if (mi_match(In, MRI, + m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) { + Out = LShlSrc; + return true; + } + return false; +} + +bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const { + if (!Subtarget->hasSALUFloatInsts()) + return false; + + Register Dst = I.getOperand(0).getReg(); + const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); + if (DstRB->getID() != AMDGPU::SGPRRegBankID) + return false; + + Register Src = I.getOperand(1).getReg(); + + if (MRI->getType(Dst) == LLT::scalar(32) && + MRI->getType(Src) == LLT::scalar(16)) { + if (isExtractHiElt(*MRI, Src, Src)) { + MachineBasicBlock *BB = I.getParent(); + BuildMI(*BB, &I, I.getDebugLoc(), 
TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst) + .addUse(Src); + I.eraseFromParent(); + return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI); + } + } + + return false; +} + bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineOperand &ImmOp = I.getOperand(1); Register DstReg = I.getOperand(0).getReg(); unsigned Size = MRI->getType(DstReg).getSizeInBits(); + bool IsFP = false; // The AMDGPU backend only supports Imm operands and not CImm or FPImm. if (ImmOp.isFPImm()) { const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); ImmOp.ChangeToImmediate(Imm.getZExtValue()); + IsFP = true; } else if (ImmOp.isCImm()) { ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); } else { @@ -2433,6 +2599,12 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { unsigned Opcode; if (DstRB->getID() == AMDGPU::VCCRegBankID) { Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + } else if (Size == 64 && + AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) { + Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO; + I.setDesc(TII.get(Opcode)); + I.addImplicitDefUseOperands(*MF); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } else { Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; @@ -2531,7 +2703,8 @@ bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32; BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg) .addReg(HiReg) - .addReg(ConstReg); + .addReg(ConstReg) + .setOperandDead(3); // Dead scc BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) .addReg(LoReg) .addImm(AMDGPU::sub0) @@ -2572,7 +2745,8 @@ bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const { // TODO: Should this used S_BITSET0_*? 
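// Editorial sketch (not part of the diff): the new selectG_FPEXT above only
// fires for SGPR destinations on subtargets with SALU float instructions and
// matches the "extract high half, then extend" idiom, roughly
//
//     %hi:sgpr(s16) = G_TRUNC (G_LSHR %x:sgpr(s32), 16)
//     %f:sgpr(s32)  = G_FPEXT %hi
//
// which it selects to a single
//
//     %f:sreg_32 = S_CVT_HI_F32_F16 %x
//
// (The register-bank and class annotations here are illustrative only.)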
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg) .addReg(HiReg) - .addReg(ConstReg); + .addReg(ConstReg) + .setOperandDead(3); // Dead scc BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) .addReg(LoReg) .addImm(AMDGPU::sub0) @@ -2689,8 +2863,8 @@ static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) { return isVCmpResult(MI.getOperand(1).getReg(), MRI) && isVCmpResult(MI.getOperand(2).getReg(), MRI); - if (Opcode == TargetOpcode::G_INTRINSIC) - return MI.getIntrinsicID() == Intrinsic::amdgcn_class; + if (auto *GI = dyn_cast<GIntrinsic>(&MI)) + return GI->is(Intrinsic::amdgcn_class); return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP; } @@ -2730,7 +2904,8 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC()); BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg) .addReg(CondReg) - .addReg(Exec); + .addReg(Exec) + .setOperandDead(3); // Dead scc CondReg = TmpReg; } @@ -2793,7 +2968,8 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { !CanCopyLow32 && !CanCopyHi32) { auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg) .addReg(SrcReg) - .addReg(MaskReg); + .addReg(MaskReg) + .setOperandDead(3); // Dead scc I.eraseFromParent(); return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } @@ -2816,9 +2992,12 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { assert(MaskTy.getSizeInBits() == 32 && "ptrmask should have been narrowed during legalize"); - BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) + auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) .addReg(SrcReg) .addReg(MaskReg); + + if (!IsVGPR) + NewOp.setOperandDead(3); // Dead scc I.eraseFromParent(); return true; } @@ -3050,6 +3229,7 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( } bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { + assert(!AMDGPU::isGFX12Plus(STI)); unsigned Opc; unsigned Size = MI.getOperand(3).getImm(); @@ -3116,8 +3296,8 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { MIB.add(MI.getOperand(5 + OpOffset)); // soffset MIB.add(MI.getOperand(6 + OpOffset)); // imm offset unsigned Aux = MI.getOperand(7 + OpOffset).getImm(); - MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol - MIB.addImm((Aux >> 3) & 1); // swz + MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol + MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 
1 : 0); // swz MachineMemOperand *LoadMMO = *MI.memoperands_begin(); MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); @@ -3252,7 +3432,7 @@ bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { unsigned Opc; - switch (MI.getIntrinsicID()) { + switch (cast<GIntrinsic>(MI).getIntrinsicID()) { case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64; break; @@ -3324,7 +3504,8 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { } else { BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) .addReg(SrcReg) - .addImm(Subtarget->getWavefrontSizeLog2()); + .addImm(Subtarget->getWavefrontSizeLog2()) + .setOperandDead(3); // Dead scc } const TargetRegisterClass &RC = @@ -3336,6 +3517,33 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { return true; } +bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const { + Register SrcReg = MI.getOperand(0).getReg(); + if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) + return false; + + MachineInstr *DefMI = MRI->getVRegDef(SrcReg); + Register SP = + Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore(); + Register WaveAddr = getWaveAddress(DefMI); + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + if (!WaveAddr) { + WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr) + .addReg(SrcReg) + .addImm(Subtarget->getWavefrontSizeLog2()) + .setOperandDead(3); // Dead scc + } + + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP) + .addReg(WaveAddr); + + MI.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::select(MachineInstr &I) { if (I.isPHI()) return selectPHI(I); @@ -3402,11 +3610,14 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_INSERT: return selectG_INSERT(I); case TargetOpcode::G_INTRINSIC: + case TargetOpcode::G_INTRINSIC_CONVERGENT: return selectG_INTRINSIC(I); case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: return selectG_INTRINSIC_W_SIDE_EFFECTS(I); case TargetOpcode::G_ICMP: - if (selectG_ICMP(I)) + case TargetOpcode::G_FCMP: + if (selectG_ICMP_or_FCMP(I)) return true; return selectImpl(I, *CoverageInfo); case TargetOpcode::G_LOAD: @@ -3443,6 +3654,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { selectImpl(I, *CoverageInfo)) return true; return selectG_SZA_EXT(I); + case TargetOpcode::G_FPEXT: + if (selectG_FPEXT(I)) + return true; + return selectImpl(I, *CoverageInfo); case TargetOpcode::G_BRCOND: return selectG_BRCOND(I); case TargetOpcode::G_GLOBAL_VALUE: @@ -3457,8 +3672,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { - const AMDGPU::ImageDimIntrinsicInfo *Intr - = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); + const AMDGPU::ImageDimIntrinsicInfo *Intr = + AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I)); assert(Intr && "not an image intrinsic with image pseudo"); return selectImageIntrinsic(I, Intr); } @@ -3472,6 +3687,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { return true; case AMDGPU::G_AMDGPU_WAVE_ADDRESS: return selectWaveAddress(I); + case AMDGPU::G_STACKRESTORE: + 
return selectStackRestore(I); default: return selectImpl(I, *CoverageInfo); } @@ -3916,7 +4133,9 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root, int64_t ConstOffset; std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Root.getReg(), *MRI); - if (ConstOffset == 0 || !isFlatScratchBaseLegal(PtrBase, FlatVariant)) + + if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch && + !isFlatScratchBaseLegal(Root.getReg()))) return Default; unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); @@ -4079,7 +4298,7 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { // possible. std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); - if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) && + if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) && TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch)) { Addr = PtrBase; @@ -4113,7 +4332,8 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr) .addFrameIndex(FI) - .addReg(RHSDef->Reg); + .addReg(RHSDef->Reg) + .setOperandDead(3); // Dead scc } } @@ -4155,6 +4375,7 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { // possible. std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + Register OrigAddr = Addr; if (ConstOffset != 0 && TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) { Addr = PtrBase; @@ -4172,8 +4393,13 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { Register LHS = AddrDef->MI->getOperand(1).getReg(); auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); - if (!isFlatScratchBaseLegal(LHS) || !isFlatScratchBaseLegal(RHS)) - return std::nullopt; + if (OrigAddr != Addr) { + if (!isFlatScratchBaseLegalSVImm(OrigAddr)) + return std::nullopt; + } else { + if (!isFlatScratchBaseLegalSV(OrigAddr)) + return std::nullopt; + } if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) return std::nullopt; @@ -4211,7 +4437,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { // TODO: Should this be inside the render function? The iterator seems to // move. - const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(); + const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget); BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), HighBits) .addImm(Offset & ~MaxOffset); @@ -4243,7 +4469,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { int64_t ConstOffset; std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI); if (ConstOffset != 0) { - if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) && + if (TII.isLegalMUBUFImmOffset(ConstOffset) && (!STI.privateMemoryResourceIsRangeChecked() || KB->signBitIsZero(PtrBase))) { const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase); @@ -4306,14 +4532,83 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0, return KB->signBitIsZero(Base); } -bool AMDGPUInstructionSelector::isFlatScratchBaseLegal( - Register Base, uint64_t FlatVariant) const { - if (FlatVariant != SIInstrFlags::FlatScratch) +// Return whether the operation has NoUnsignedWrap property. 
+static bool isNoUnsignedWrap(MachineInstr *Addr) { + return Addr->getOpcode() == TargetOpcode::G_OR || + (Addr->getOpcode() == TargetOpcode::G_PTR_ADD && + Addr->getFlag(MachineInstr::NoUWrap)); +} + +// Check that the base address of flat scratch load/store in the form of `base + +// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware +// requirement). We always treat the first operand as the base address here. +bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const { + MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); + + if (isNoUnsignedWrap(AddrMI)) return true; - // When value in 32-bit Base can be negative calculate scratch offset using - // 32-bit add instruction, otherwise use Base(unsigned) + offset. - return KB->signBitIsZero(Base); + // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative + // values. + if (AMDGPU::isGFX12Plus(STI)) + return true; + + Register LHS = AddrMI->getOperand(1).getReg(); + Register RHS = AddrMI->getOperand(2).getReg(); + + if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) { + std::optional<ValueAndVReg> RhsValReg = + getIConstantVRegValWithLookThrough(RHS, *MRI); + // If the immediate offset is negative and within certain range, the base + // address cannot also be negative. If the base is also negative, the sum + // would be either negative or much larger than the valid range of scratch + // memory a thread can access. + if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 && + RhsValReg->Value.getSExtValue() > -0x40000000) + return true; + } + + return KB->signBitIsZero(LHS); +} + +// Check address value in SGPR/VGPR are legal for flat scratch in the form +// of: SGPR + VGPR. +bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const { + MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); + + if (isNoUnsignedWrap(AddrMI)) + return true; + + Register LHS = AddrMI->getOperand(1).getReg(); + Register RHS = AddrMI->getOperand(2).getReg(); + return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS); +} + +// Check address value in SGPR/VGPR are legal for flat scratch in the form +// of: SGPR + VGPR + Imm. +bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm( + Register Addr) const { + MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); + Register Base = AddrMI->getOperand(1).getReg(); + std::optional<DefinitionAndSourceRegister> BaseDef = + getDefSrcRegIgnoringCopies(Base, *MRI); + std::optional<ValueAndVReg> RHSOffset = + getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI); + assert(RHSOffset); + + // If the immediate offset is negative and within certain range, the base + // address cannot also be negative. If the base is also negative, the sum + // would be either negative or much larger than the valid range of scratch + // memory a thread can access. + if (isNoUnsignedWrap(BaseDef->MI) && + (isNoUnsignedWrap(AddrMI) || + (RHSOffset->Value.getSExtValue() < 0 && + RHSOffset->Value.getSExtValue() > -0x40000000))) + return true; + + Register LHS = BaseDef->MI->getOperand(1).getReg(); + Register RHS = BaseDef->MI->getOperand(2).getReg(); + return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS); } bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, @@ -4332,21 +4627,18 @@ bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits; } -// Return the wave level SGPR base address if this is a wave address. 
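// Editorial note (illustrative, not part of the diff): the negative-offset
// reasoning in the isFlatScratchBaseLegal* helpers above can be seen with a
// concrete value. Consider base + (-16) with a 32-bit base: if the base were
// negative as a signed value (say 0x80000010), the sum 0x80000000 lies far
// outside any scratch range a thread may legally access, so such a program is
// already broken; the selector may therefore assume the base is non-negative
// and still fold offsets in the range (-0x40000000, 0) into the flat-scratch
// addressing mode.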
-static Register getWaveAddress(const MachineInstr *Def) { - return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS - ? Def->getOperand(1).getReg() - : Register(); -} - InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectMUBUFScratchOffset( MachineOperand &Root) const { Register Reg = Root.getReg(); const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); - const MachineInstr *Def = MRI->getVRegDef(Reg); - if (Register WaveBase = getWaveAddress(Def)) { + std::optional<DefinitionAndSourceRegister> Def = + getDefSrcRegIgnoringCopies(Reg, *MRI); + assert(Def && "this shouldn't be an optional result"); + Reg = Def->Reg; + + if (Register WaveBase = getWaveAddress(Def->MI)) { return {{ [=](MachineInstrBuilder &MIB) { // rsrc MIB.addReg(Info->getScratchRSrcReg()); @@ -4362,10 +4654,12 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset( // FIXME: Copy check is a hack Register BasePtr; - if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) { - if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset)) + if (mi_match(Reg, *MRI, + m_GPtrAdd(m_Reg(BasePtr), + m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) { + if (!TII.isLegalMUBUFImmOffset(Offset)) return {}; - const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr); + MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI); Register WaveBase = getWaveAddress(BasePtrDef); if (!WaveBase) return {}; @@ -4382,7 +4676,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset( } if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || - !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) + !TII.isLegalMUBUFImmOffset(Offset)) return {}; return {{ @@ -4625,7 +4919,7 @@ bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const { /// component. void AMDGPUInstructionSelector::splitIllegalMUBUFOffset( MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const { - if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset)) + if (TII.isLegalMUBUFImmOffset(ImmOffset)) return; // Illegal offset, store it in soffset. @@ -4734,6 +5028,8 @@ AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { [=](MachineInstrBuilder &MIB) { // soffset if (SOffset) MIB.addReg(SOffset); + else if (STI.hasRestrictedSOffset()) + MIB.addReg(AMDGPU::SGPR_NULL); else MIB.addImm(0); }, @@ -4762,6 +5058,8 @@ AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { [=](MachineInstrBuilder &MIB) { // soffset if (SOffset) MIB.addReg(SOffset); + else if (STI.hasRestrictedSOffset()) + MIB.addReg(AMDGPU::SGPR_NULL); else MIB.addImm(0); }, @@ -4772,6 +5070,17 @@ AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const { + + Register SOffset = Root.getReg(); + + if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt())) + SOffset = AMDGPU::SGPR_NULL; + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}}; +} + /// Get an immediate that must be 32-bits, and treated as zero extended. static std::optional<uint64_t> getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) { @@ -4818,8 +5127,8 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const { // an immediate offset. 
Register SOffset; unsigned Offset; - std::tie(SOffset, Offset) = - AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg(), KB); + std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset( + *MRI, Root.getReg(), KB, /*CheckNUW*/ true); if (!SOffset) return std::nullopt; @@ -4980,6 +5289,135 @@ AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const { }}; } +bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst( + MachineInstr &I, Intrinsic::ID IntrID) const { + MachineBasicBlock *MBB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + Register CCReg = I.getOperand(0).getReg(); + + bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var; + + if (HasM0) { + auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) + .addReg(I.getOperand(2).getReg()); + BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0)); + if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI)) + return false; + } else { + BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM)) + .addImm(I.getOperand(2).getImm()); + } + + BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC); + + I.eraseFromParent(); + return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass, + *MRI); +} + +unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { + if (HasInlineConst) { + switch (IntrID) { + default: + llvm_unreachable("not a named barrier op"); + case Intrinsic::amdgcn_s_barrier_init: + return AMDGPU::S_BARRIER_INIT_IMM; + case Intrinsic::amdgcn_s_barrier_join: + return AMDGPU::S_BARRIER_JOIN_IMM; + case Intrinsic::amdgcn_s_wakeup_barrier: + return AMDGPU::S_WAKEUP_BARRIER_IMM; + case Intrinsic::amdgcn_s_get_barrier_state: + return AMDGPU::S_GET_BARRIER_STATE_IMM; + }; + } else { + switch (IntrID) { + default: + llvm_unreachable("not a named barrier op"); + case Intrinsic::amdgcn_s_barrier_init: + return AMDGPU::S_BARRIER_INIT_M0; + case Intrinsic::amdgcn_s_barrier_join: + return AMDGPU::S_BARRIER_JOIN_M0; + case Intrinsic::amdgcn_s_wakeup_barrier: + return AMDGPU::S_WAKEUP_BARRIER_M0; + case Intrinsic::amdgcn_s_get_barrier_state: + return AMDGPU::S_GET_BARRIER_STATE_M0; + }; + } +} + +bool AMDGPUInstructionSelector::selectNamedBarrierInst( + MachineInstr &I, Intrinsic::ID IntrID) const { + MachineBasicBlock *MBB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state + ? I.getOperand(2) + : I.getOperand(1); + std::optional<int64_t> BarValImm = + getIConstantVRegSExtVal(BarOp.getReg(), *MRI); + Register M0Val; + Register TmpReg0; + + // For S_BARRIER_INIT, member count will always be read from M0[16:22] + if (IntrID == Intrinsic::amdgcn_s_barrier_init) { + Register MemberCount = I.getOperand(2).getReg(); + TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + // TODO: This should be expanded during legalization so that the the S_LSHL + // and S_OR can be constant-folded + BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) + .addImm(16) + .addReg(MemberCount); + M0Val = TmpReg0; + } + + // If not inlinable, get reference to barrier depending on the instruction + if (!BarValImm) { + if (IntrID == Intrinsic::amdgcn_s_barrier_init) { + // If reference to barrier id is not an inlinable constant then it must be + // referenced with M0[4:0]. Perform an OR with the member count to include + // it in M0 for S_BARRIER_INIT. 
+ Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1) + .addReg(BarOp.getReg()) + .addReg(TmpReg0); + M0Val = TmpReg1; + } else { + M0Val = BarOp.getReg(); + } + } + + // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required. + if (M0Val) { + auto CopyMIB = + BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val); + constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI); + } + + MachineInstrBuilder MIB; + unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID); + MIB = BuildMI(*MBB, &I, DL, TII.get(Opc)); + + if (IntrID == Intrinsic::amdgcn_s_get_barrier_state) + MIB.addDef(I.getOperand(0).getReg()); + + if (BarValImm) + MIB.addImm(*BarValImm); + + I.eraseFromParent(); + return true; +} +bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + Register CCReg = I.getOperand(0).getReg(); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE)); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC); + + I.eraseFromParent(); + return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass, + *MRI); +} + void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { @@ -5037,14 +5475,19 @@ void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL); + MIB.addImm(MI.getOperand(OpIdx).getImm() & + (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL + : AMDGPU::CPol::ALL_pregfx12)); } void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(OpIdx >= 0 && "expected to match an immediate operand"); - MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); + const bool Swizzle = MI.getOperand(OpIdx).getImm() & + (AMDGPU::isGFX12Plus(STI) ? 
AMDGPU::CPol::SWZ + : AMDGPU::CPol::SWZ_pregfx12); + MIB.addImm(Swizzle); } void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB, @@ -5057,7 +5500,16 @@ void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB, void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { - MIB.addFrameIndex((MI.getOperand(1).getIndex())); + MIB.addFrameIndex(MI.getOperand(1).getIndex()); +} + +void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF(); + int ExpVal = APF.getExactLog2Abs(); + assert(ExpVal != INT_MIN); + MIB.addImm(ExpVal); } bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 243ff72e2979..ab7cc0a6beb8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -90,6 +90,7 @@ private: bool selectPHI(MachineInstr &I) const; bool selectG_TRUNC(MachineInstr &I) const; bool selectG_SZA_EXT(MachineInstr &I) const; + bool selectG_FPEXT(MachineInstr &I) const; bool selectG_CONSTANT(MachineInstr &I) const; bool selectG_FNEG(MachineInstr &I) const; bool selectG_FABS(MachineInstr &I) const; @@ -129,7 +130,7 @@ private: const AMDGPU::ImageDimIntrinsicInfo *Intr) const; bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I) const; int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const; - bool selectG_ICMP(MachineInstr &I) const; + bool selectG_ICMP_or_FCMP(MachineInstr &I) const; bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const; void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const; @@ -147,6 +148,10 @@ private: bool selectBVHIntrinsic(MachineInstr &I) const; bool selectSMFMACIntrin(MachineInstr &I) const; bool selectWaveAddress(MachineInstr &I) const; + bool selectStackRestore(MachineInstr &MI) const; + bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const; + bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const; + bool selectSBarrierLeave(MachineInstr &I) const; std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root, bool IsCanonicalizing = true, @@ -241,8 +246,9 @@ private: bool isDSOffsetLegal(Register Base, int64_t Offset) const; bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1, unsigned Size) const; - bool isFlatScratchBaseLegal( - Register Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const; + bool isFlatScratchBaseLegal(Register Addr) const; + bool isFlatScratchBaseLegalSV(Register Addr) const; + bool isFlatScratchBaseLegalSVImm(Register Addr) const; std::pair<Register, unsigned> selectDS1Addr1OffsetImpl(MachineOperand &Root) const; @@ -287,6 +293,9 @@ private: Register &SOffset, int64_t &Offset) const; InstructionSelector::ComplexRendererFns + selectBUFSOffset(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns selectMUBUFAddr64(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns @@ -328,6 +337,9 @@ private: void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + void renderFPPow2ToExponent(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; 
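// Editorial sketch (not part of the diff): renderFPPow2ToExponent above folds
// a power-of-two FP immediate into its integer exponent via
// APFloat::getExactLog2Abs(), which (as the assert in the .cpp implies)
// returns INT_MIN when the value is not an exact power of two. Roughly:
//
//     APFloat(2.0f).getExactLog2Abs();   // 1
//     APFloat(0.5f).getExactLog2Abs();   // -1
//     APFloat(-4.0f).getExactLog2Abs();  // 2  (sign ignored, hence "Abs")
//     APFloat(3.0f).getExactLog2Abs();   // INT_MIN: no exact exponent
//
// The selected machine instruction then receives the exponent as a plain
// immediate operand.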
+ bool isInlineImmediate16(int64_t Imm) const; bool isInlineImmediate32(int64_t Imm) const; bool isInlineImmediate64(int64_t Imm) const; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 2305097e3f94..eaf72d7157ee 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -125,11 +125,11 @@ def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; def i1imm_0 : OperandWithDefaultOps<i1, (ops (i1 0))>; -class CustomOperandClass<string name, bit optional, string parserMethod, - string defaultMethod> +class CustomOperandClass<string name, bit optional, string predicateMethod, + string parserMethod, string defaultMethod> : AsmOperandClass { let Name = name; - let PredicateMethod = "is"#name; + let PredicateMethod = predicateMethod; let ParserMethod = parserMethod; let RenderMethod = "addImmOperands"; let IsOptional = optional; @@ -138,6 +138,7 @@ class CustomOperandClass<string name, bit optional, string parserMethod, class CustomOperandProps<bit optional = 0, string name = NAME> { string ImmTy = "ImmTy"#name; + string PredicateMethod = "is"#name; string ParserMethod = "parse"#name; string DefaultValue = "0"; string DefaultMethod = "[this]() { return "# @@ -145,7 +146,8 @@ class CustomOperandProps<bit optional = 0, string name = NAME> { "AMDGPUOperand::"#ImmTy#"); }"; string PrintMethod = "print"#name; AsmOperandClass ParserMatchClass = - CustomOperandClass<name, optional, ParserMethod, DefaultMethod>; + CustomOperandClass<name, optional, PredicateMethod, ParserMethod, + DefaultMethod>; string OperandType = "OPERAND_IMMEDIATE"; } @@ -163,6 +165,20 @@ class ImmOperand<ValueType type, string name = NAME, bit optional = 0, def s16imm : ImmOperand<i16, "S16Imm", 0, "printU16ImmOperand">; def u16imm : ImmOperand<i16, "U16Imm", 0, "printU16ImmOperand">; +class ValuePredicatedOperand<CustomOperand op, string valuePredicate, + bit optional = 0> + : CustomOperand<op.Type, optional> { + let ImmTy = op.ImmTy; + defvar OpPredicate = op.ParserMatchClass.PredicateMethod; + let PredicateMethod = + "getPredicate([](const AMDGPUOperand &Op) -> bool { "# + "return Op."#OpPredicate#"() && "#valuePredicate#"; })"; + let ParserMethod = op.ParserMatchClass.ParserMethod; + let DefaultValue = op.DefaultValue; + let DefaultMethod = op.DefaultMethod; + let PrintMethod = op.PrintMethod; +} + //===--------------------------------------------------------------------===// // Custom Operands //===--------------------------------------------------------------------===// @@ -236,6 +252,8 @@ def umin_oneuse : HasOneUseBinOp<umin>; def fminnum_oneuse : HasOneUseBinOp<fminnum>; def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>; +def fminimum_oneuse : HasOneUseBinOp<fminimum>; +def fmaximum_oneuse : HasOneUseBinOp<fmaximum>; def fminnum_ieee_oneuse : HasOneUseBinOp<fminnum_ieee>; def fmaxnum_ieee_oneuse : HasOneUseBinOp<fmaxnum_ieee>; @@ -544,19 +562,18 @@ def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr), def store_hi16_#as : StoreHi16 <truncstorei16, i16>; def truncstorei8_hi16_#as : StoreHi16<truncstorei8, i8>; def truncstorei16_hi16_#as : StoreHi16<truncstorei16, i16>; - } // End let IsStore = 1, AddressSpaces = ... 
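// Editorial note (not part of the diff): in the atomic-store PatFrags that
// follow, the operand order flips from (node:$ptr, node:$val) to
// (node:$val, node:$ptr). This presumably tracks the target-independent
// atomic_store node, whose operands were reordered upstream to match a
// regular store (value first, then pointer); the fragments are otherwise
// unchanged.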
let IsAtomic = 1, AddressSpaces = !cast<AddressSpaceList>("StoreAddress_"#as).AddrSpaces in { -def atomic_store_8_#as : PatFrag<(ops node:$ptr, node:$val), - (atomic_store_8 node:$ptr, node:$val)>; -def atomic_store_16_#as : PatFrag<(ops node:$ptr, node:$val), - (atomic_store_16 node:$ptr, node:$val)>; -def atomic_store_32_#as : PatFrag<(ops node:$ptr, node:$val), - (atomic_store_32 node:$ptr, node:$val)>; -def atomic_store_64_#as : PatFrag<(ops node:$ptr, node:$val), - (atomic_store_64 node:$ptr, node:$val)>; -} +def atomic_store_8_#as : PatFrag<(ops node:$val, node:$ptr), + (atomic_store_8 node:$val, node:$ptr)>; +def atomic_store_16_#as : PatFrag<(ops node:$val, node:$ptr), + (atomic_store_16 node:$val, node:$ptr)>; +def atomic_store_32_#as : PatFrag<(ops node:$val, node:$ptr), + (atomic_store_32 node:$val, node:$ptr)>; +def atomic_store_64_#as : PatFrag<(ops node:$val, node:$ptr), + (atomic_store_64 node:$val, node:$ptr)>; +} // End let IsAtomic = 1, AddressSpaces = ... } // End foreach as multiclass noret_op { @@ -622,8 +639,13 @@ defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op; defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op; defm int_amdgcn_global_atomic_fmin : noret_op; defm int_amdgcn_global_atomic_fmax : noret_op; +defm int_amdgcn_global_atomic_csub : noret_op; defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op; defm int_amdgcn_ds_fadd_v2bf16 : noret_op; +defm int_amdgcn_flat_atomic_fmin_num : noret_op; +defm int_amdgcn_flat_atomic_fmax_num : noret_op; +defm int_amdgcn_global_atomic_fmin_num : noret_op; +defm int_amdgcn_global_atomic_fmax_num : noret_op; multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { let HasNoUse = true in diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index bbf4db12f5ab..fbee28889451 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -17,14 +17,19 @@ #include "AMDGPUGlobalISelUtils.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" @@ -455,8 +460,8 @@ static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, return false; // If we have 96-bit memory operations, we shouldn't touch them. Note we may - // end up widening these for a scalar load during RegBankSelect, since there - // aren't 96-bit scalar loads. + // end up widening these for a scalar load during RegBankSelect, if we don't + // have 96-bit scalar loads. 
if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) return false; @@ -628,6 +633,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER); const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE); + const LLT BufferStridedPtr = + GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER); const LLT CodePtr = FlatPtr; @@ -676,13 +683,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { // Full set of gfx9 features. - getActionDefinitionsBuilder({G_ADD, G_SUB}) - .legalFor({S32, S16, V2S16}) - .clampMaxNumElementsStrict(0, S16, 2) - .scalarize(0) - .minScalar(0, S16) - .widenScalarToNextMultipleOf(0, 32) - .maxScalar(0, S32); + if (ST.hasScalarAddSub64()) { + getActionDefinitionsBuilder({G_ADD, G_SUB}) + .legalFor({S64, S32, S16, V2S16}) + .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) + .minScalar(0, S16) + .widenScalarToNextMultipleOf(0, 32) + .maxScalar(0, S32); + } else { + getActionDefinitionsBuilder({G_ADD, G_SUB}) + .legalFor({S32, S16, V2S16}) + .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) + .minScalar(0, S16) + .widenScalarToNextMultipleOf(0, 32) + .maxScalar(0, S32); + } getActionDefinitionsBuilder(G_MUL) .legalFor({S32, S16, V2S16}) @@ -842,6 +859,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_DYN_STACKALLOC) .legalFor({{PrivatePtr, S32}}); + getActionDefinitionsBuilder(G_STACKSAVE) + .customFor({PrivatePtr}); + getActionDefinitionsBuilder(G_STACKRESTORE) + .legalFor({PrivatePtr}); + getActionDefinitionsBuilder(G_GLOBAL_VALUE) .customIf(typeIsNot(0, PrivatePtr)); @@ -866,6 +888,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, FDIVActions.customFor({S16}); } + if (ST.hasPackedFP32Ops()) { + FPOpActions.legalFor({V2S32}); + FPOpActions.clampMaxNumElementsStrict(0, S32, 2); + } + auto &MinNumMaxNum = getActionDefinitionsBuilder({ G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); @@ -908,10 +935,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.has16BitInsts()) { getActionDefinitionsBuilder(G_FSQRT) - .legalFor({S32, S16}) - .customFor({S64}) + .legalFor({S16}) + .customFor({S32, S64}) .scalarize(0) - .clampScalar(0, S16, S64); + .unsupported(); getActionDefinitionsBuilder(G_FFLOOR) .legalFor({S32, S64, S16}) .scalarize(0) @@ -930,10 +957,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .lower(); } else { getActionDefinitionsBuilder(G_FSQRT) - .legalFor({S32}) - .customFor({S64}) + .customFor({S32, S64, S16}) .scalarize(0) - .clampScalar(0, S32, S64); + .unsupported(); + if (ST.hasFractBug()) { getActionDefinitionsBuilder(G_FFLOOR) @@ -1061,31 +1088,34 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .lower(); - // Lower roundeven into G_FRINT - getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) - .scalarize(0) - .lower(); + // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN + getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT}) + .scalarize(0) + .lower(); if (ST.has16BitInsts()) { - getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) - .legalFor({S16, S32, S64}) - .clampScalar(0, S16, S64) - .scalarize(0); + getActionDefinitionsBuilder( + {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) + 
.legalFor({S16, S32, S64}) + .clampScalar(0, S16, S64) + .scalarize(0); } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { - getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) - .legalFor({S32, S64}) - .clampScalar(0, S32, S64) - .scalarize(0); + getActionDefinitionsBuilder( + {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) + .legalFor({S32, S64}) + .clampScalar(0, S32, S64) + .scalarize(0); } else { - getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) - .legalFor({S32}) - .customFor({S64}) - .clampScalar(0, S32, S64) - .scalarize(0); + getActionDefinitionsBuilder( + {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) + .legalFor({S32}) + .customFor({S64}) + .clampScalar(0, S32, S64) + .scalarize(0); } getActionDefinitionsBuilder(G_PTR_ADD) - .unsupportedFor({BufferFatPtr, RsrcPtr}) + .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr}) .legalIf(all(isPointer(0), sameSize(0, 1))) .scalarize(0) .scalarSameSizeAs(1, 0); @@ -1121,8 +1151,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); - getActionDefinitionsBuilder(G_FCMP) - .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) + auto &FCmpBuilder = + getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct( + {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase); + + if (ST.hasSALUFloatInsts()) + FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32}); + + FCmpBuilder .widenScalarToNextPow2(1) .clampScalar(1, S32, S64) .scalarize(0); @@ -1149,7 +1185,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Log2Ops.scalarize(0) .lower(); - auto &LogOps = getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP}); + auto &LogOps = + getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10}); LogOps.customFor({S32, S16}); LogOps.clampScalar(0, MinScalarFPTy, S32) .scalarize(0); @@ -1219,7 +1256,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasVOP3PInsts()) { getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) .legalFor({S32, S16, V2S16}) - .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) .clampMaxNumElements(0, S16, 2) .minScalar(0, S16) .widenScalarToNextPow2(0) @@ -1369,7 +1405,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // The custom pointers (fat pointers, buffer resources) don't work with load // and store at this level. Fat pointers should have been lowered to // intrinsics before the translation to MIR. - Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr})); + Actions.unsupportedIf( + typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr})); // Address space 8 pointers are handled by a 4xs32 load, bitcast, and // ptrtoint. 
This is needed to account for the fact that we can't have i128 @@ -1925,20 +1962,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0) .scalarize(0); - getActionDefinitionsBuilder({ - // TODO: Verify V_BFI_B32 is generated from expanded bit ops - G_FCOPYSIGN, + getActionDefinitionsBuilder( + {// TODO: Verify V_BFI_B32 is generated from expanded bit ops + G_FCOPYSIGN, - G_ATOMIC_CMPXCHG_WITH_SUCCESS, - G_ATOMICRMW_NAND, - G_ATOMICRMW_FSUB, - G_READ_REGISTER, - G_WRITE_REGISTER, + G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB, + G_READ_REGISTER, G_WRITE_REGISTER, - G_SADDO, G_SSUBO, + G_SADDO, G_SSUBO}) + .lower(); - // TODO: Implement - G_FMINIMUM, G_FMAXIMUM}).lower(); + if (ST.hasIEEEMinMax()) { + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) + .legalFor(FPTypesPK16) + .clampMaxNumElements(0, S16, 2) + .scalarize(0); + } else { + // TODO: Implement + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); + } getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) .lower(); @@ -1948,6 +1990,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) .unsupported(); + getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal(); + getLegacyLegalizerInfo().computeTables(); verify(*ST.getInstrInfo()); } @@ -1960,8 +2004,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, switch (MI.getOpcode()) { case TargetOpcode::G_ADDRSPACE_CAST: return legalizeAddrSpaceCast(MI, MRI, B); - case TargetOpcode::G_FRINT: - return legalizeFrint(MI, MRI, B); + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: + return legalizeFroundeven(MI, MRI, B); case TargetOpcode::G_FCEIL: return legalizeFceil(MI, MRI, B); case TargetOpcode::G_FREM: @@ -2022,6 +2066,7 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, case TargetOpcode::G_FEXP2: return legalizeFExp2(MI, B); case TargetOpcode::G_FEXP: + case TargetOpcode::G_FEXP10: return legalizeFExp(MI, B); case TargetOpcode::G_FPOW: return legalizeFPow(MI, B); @@ -2037,6 +2082,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, return legalizeCTLZ_CTTZ(MI, MRI, B); case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: return legalizeFPTruncRound(MI, B); + case TargetOpcode::G_STACKSAVE: + return legalizeStackSave(MI, B); default: return false; } @@ -2264,9 +2311,9 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( return true; } -bool AMDGPULegalizerInfo::legalizeFrint( - MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B) const { +bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { Register Src = MI.getOperand(1).getReg(); LLT Ty = MRI.getType(Src); assert(Ty.isScalar() && Ty.getSizeInBits() == 64); @@ -2345,10 +2392,10 @@ static MachineInstrBuilder extractF64Exponent(Register Hi, auto Const0 = B.buildConstant(S32, FractBits - 32); auto Const1 = B.buildConstant(S32, ExpBits); - auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) - .addUse(Hi) - .addUse(Const0.getReg(0)) - .addUse(Const1.getReg(0)); + auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}) + .addUse(Hi) + .addUse(Const0.getReg(0)) + .addUse(Const1.getReg(0)); return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); } @@ -2436,8 +2483,7 @@ bool AMDGPULegalizerInfo::legalizeITOFP( auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); auto MaxShAmt = 
B.buildAdd(S32, ThirtyTwo, OppositeSign); - auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, - /*HasSideEffects=*/false) + auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}) .addUse(Unmerge.getReg(1)); auto LS2 = B.buildSub(S32, LS, One); ShAmt = B.buildUMin(S32, LS2, MaxShAmt); @@ -2670,15 +2716,16 @@ bool AMDGPULegalizerInfo::legalizeSinCos( auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); if (ST.hasTrigReducedRange()) { auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); - TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) - .addUse(MulVal.getReg(0)) - .setMIFlags(Flags).getReg(0); + TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}) + .addUse(MulVal.getReg(0)) + .setMIFlags(Flags) + .getReg(0); } else TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; - B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false) + B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg)) .addUse(TrigVal) .setMIFlags(Flags); MI.eraseFromParent(); @@ -2714,15 +2761,6 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, // which is a 64-bit pc-relative offset from the encoding of the $symbol // operand to the global variable. - // - // What we want here is an offset from the value returned by s_getpc - // (which is the address of the s_add_u32 instruction) to the global - // variable, but since the encoding of $symbol starts 4 bytes after the start - // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too - // small. This requires us to add 4 to the global variable offset in order to - // compute the correct address. Similarly for the s_addc_u32 instruction, the - // encoding of $symbol starts 12 bytes after the start of the s_add_u32 - // instruction. LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); @@ -2732,11 +2770,11 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) .addDef(PCReg); - MIB.addGlobalAddress(GV, Offset + 4, GAFlags); + MIB.addGlobalAddress(GV, Offset, GAFlags); if (GAFlags == SIInstrInfo::MO_NONE) MIB.addImm(0); else - MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); + MIB.addGlobalAddress(GV, Offset, GAFlags + 1); if (!B.getMRI()->getRegClassOrNull(PCReg)) B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); @@ -2744,7 +2782,63 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, if (PtrTy.getSizeInBits() == 32) B.buildExtract(DstReg, PCReg, 0); return true; - } +} + +// Emit a ABS32_LO / ABS32_HI relocation stub. +void AMDGPULegalizerInfo::buildAbsGlobalAddress( + Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, + MachineRegisterInfo &MRI) const { + bool RequiresHighHalf = PtrTy.getSizeInBits() != 32; + + LLT S32 = LLT::scalar(32); + + // Use the destination directly, if and only if we store the lower address + // part only and we don't have a register class being set. + Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg) + ? DstReg + : MRI.createGenericVirtualRegister(S32); + + if (!MRI.getRegClassOrNull(AddrLo)) + MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass); + + // Write the lower half. 
+ B.buildInstr(AMDGPU::S_MOV_B32) + .addDef(AddrLo) + .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO); + + // If required, write the upper half as well. + if (RequiresHighHalf) { + assert(PtrTy.getSizeInBits() == 64 && + "Must provide a 64-bit pointer type!"); + + Register AddrHi = MRI.createGenericVirtualRegister(S32); + MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass); + + B.buildInstr(AMDGPU::S_MOV_B32) + .addDef(AddrHi) + .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI); + + // Use the destination directly, if and only if we don't have a register + // class being set. + Register AddrDst = !MRI.getRegClassOrNull(DstReg) + ? DstReg + : MRI.createGenericVirtualRegister(LLT::scalar(64)); + + if (!MRI.getRegClassOrNull(AddrDst)) + MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass); + + B.buildMergeValues(AddrDst, {AddrLo, AddrHi}); + + // If we created a new register for the destination, cast the result into + // the final output. + if (AddrDst != DstReg) + B.buildCast(DstReg, AddrDst); + } else if (AddrLo != DstReg) { + // If we created a new register for the destination, cast the result into + // the final output. + B.buildCast(DstReg, AddrLo); + } +} bool AMDGPULegalizerInfo::legalizeGlobalValue( MachineInstr &MI, MachineRegisterInfo &MRI, @@ -2771,7 +2865,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( // functions that use local objects. However, if these dead functions are // not eliminated, we don't want a compile time error. Just emit a warning // and a trap, since there should be no callable path here. - B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); + B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>()); B.buildUndef(DstReg); MI.eraseFromParent(); return true; @@ -2797,8 +2891,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( // Adjust alignment for that dynamic shared memory array. MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); LLT S32 = LLT::scalar(32); - auto Sz = - B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); + auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}); B.buildIntToPtr(DstReg, Sz); MI.eraseFromParent(); return true; @@ -2811,6 +2904,12 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( return true; } + if (ST.isAmdPalOS() || ST.isMesa3DOS()) { + buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI); + MI.eraseFromParent(); + return true; + } + const SITargetLowering *TLI = ST.getTargetLowering(); if (TLI->shouldEmitFixup(GV)) { @@ -2973,10 +3072,10 @@ bool AMDGPULegalizerInfo::legalizeFMad( // TODO: Always legal with future ftz flag. // FIXME: Do we need just output? - if (Ty == LLT::scalar(32) && + if (Ty == LLT::float32() && MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()) return true; - if (Ty == LLT::scalar(16) && + if (Ty == LLT::float16() && MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()) return true; @@ -3014,9 +3113,30 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( /// Return true if it's known that \p Src can never be an f32 denormal value. 
static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI, Register Src) { - Register ExtSrc; - if (mi_match(Src, MRI, m_GFPExt(m_Reg(ExtSrc)))) - return MRI.getType(ExtSrc) == LLT::scalar(16); + const MachineInstr *DefMI = MRI.getVRegDef(Src); + switch (DefMI->getOpcode()) { + case TargetOpcode::G_INTRINSIC: { + switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) { + case Intrinsic::amdgcn_frexp_mant: + return true; + default: + break; + } + + break; + } + case TargetOpcode::G_FFREXP: { + if (DefMI->getOperand(0).getReg() == Src) + return true; + break; + } + case TargetOpcode::G_FPEXT: { + return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16); + } + default: + return false; + } + return false; } @@ -3072,9 +3192,9 @@ bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI, const LLT F32 = LLT::scalar(32); // Nothing in half is a denormal when promoted to f32. auto Ext = B.buildFPExt(F32, Src, Flags); - auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}, false) - .addUse(Ext.getReg(0)) - .setMIFlags(Flags); + auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}) + .addUse(Ext.getReg(0)) + .setMIFlags(Flags); B.buildFPTrunc(Dst, Log2, Flags); MI.eraseFromParent(); return true; @@ -3084,14 +3204,14 @@ bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI, auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags); if (!ScaledInput) { - B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}, false) + B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}) .addUse(Src) .setMIFlags(Flags); MI.eraseFromParent(); return true; } - auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) + auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) .addUse(ScaledInput) .setMIFlags(Flags); @@ -3148,9 +3268,8 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, if (ScaledInput) X = ScaledInput; - auto Y = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) - .addUse(X) - .setMIFlags(Flags); + auto Y = + B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags); Register R; if (ST.hasFastFMAF32()) { @@ -3231,7 +3350,7 @@ bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, if (Ty == LLT::scalar(32)) { auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags); if (ScaledInput) { - auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) + auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) .addUse(Src) .setMIFlags(Flags); auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted); @@ -3253,7 +3372,7 @@ bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, auto Log2Operand = Ty == LLT::scalar(16) ? B.buildFLog2(Ty, Src, Flags) - : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) + : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) .addUse(Src) .setMIFlags(Flags); auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); @@ -3276,9 +3395,9 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, if (Ty == F16) { // Nothing in half is a denormal when promoted to f32. 
auto Ext = B.buildFPExt(F32, Src, Flags); - auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}, false) - .addUse(Ext.getReg(0)) - .setMIFlags(Flags); + auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}) + .addUse(Ext.getReg(0)) + .setMIFlags(Flags); B.buildFPTrunc(Dst, Log2, Flags); MI.eraseFromParent(); return true; @@ -3287,7 +3406,7 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, assert(Ty == F32); if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) { - B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false) + B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) .addUse(Src) .setMIFlags(Flags); MI.eraseFromParent(); @@ -3307,7 +3426,7 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags); auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags); - auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false) + auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) .addUse(AddInput.getReg(0)) .setMIFlags(Flags); @@ -3320,20 +3439,42 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, } bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, - Register Src, - unsigned Flags) const { + Register X, unsigned Flags) const { LLT Ty = B.getMRI()->getType(Dst); - auto K = B.buildFConstant(Ty, numbers::log2e); - auto Mul = B.buildFMul(Ty, Src, K, Flags); + LLT F32 = LLT::scalar(32); - if (Ty == LLT::scalar(32)) { - B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false) - .addUse(Mul.getReg(0)) - .setMIFlags(Flags); - } else { - B.buildFExp2(Dst, Mul.getReg(0), Flags); + if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) { + auto Log2E = B.buildFConstant(Ty, numbers::log2e); + auto Mul = B.buildFMul(Ty, X, Log2E, Flags); + + if (Ty == F32) { + B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) + .addUse(Mul.getReg(0)) + .setMIFlags(Flags); + } else { + B.buildFExp2(Dst, Mul.getReg(0), Flags); + } + + return true; } + auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f); + auto NeedsScaling = + B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags); + auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f); + auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags); + auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags); + + auto Log2E = B.buildFConstant(Ty, numbers::log2e); + auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags); + + auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) + .addUse(ExpInput.getReg(0)) + .setMIFlags(Flags); + + auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f); + auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags); + B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags); return true; } @@ -3347,7 +3488,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, LLT Ty = MRI.getType(Dst); const LLT F16 = LLT::scalar(16); const LLT F32 = LLT::scalar(32); - const bool IsExp10 = false; // TODO: For some reason exp10 is missing + const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10; if (Ty == F16) { // v_exp_f16 (fmul x, log2e) @@ -3374,7 +3515,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying // library behavior. Also, is known-not-daz source sufficient? 
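The scaled f32 path added above leans on exp(x) = 2^(x * log2 e): when x is below roughly -0x1.5d58a0p+6 (about -87.3), 2^(x * log2 e) would be a denormal, so the input is shifted up by 64 before the exp2 and the result is multiplied by e^-64 (the 0x1.969d48p-93 constant) to fold the shift back out. A minimal scalar model of that scheme, with std::exp2 standing in for v_exp_f32 and the constants copied from the lowering above (an illustration, not the committed code):

#include <cmath>

float expf_model(float x) {
  const float Log2E = 0x1.715476p+0f;        // log2(e)
  const float Threshold = -0x1.5d58a0p+6f;   // ~ -87.3; below this 2^(x*log2e) is denormal
  if (x >= Threshold)
    return std::exp2(x * Log2E);             // plain exp(x) = exp2(x * log2e)
  float Scaled = std::exp2((x + 64.0f) * Log2E);
  return Scaled * 0x1.969d48p-93f;           // e^-64 undoes the +64 shift
}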
- if (allowApproxFunc(MF, Flags) && !needsDenormHandlingF32(MF, X, Flags)) { + if (allowApproxFunc(MF, Flags)) { legalizeFExpUnsafe(B, Dst, X, Flags); MI.eraseFromParent(); return true; @@ -3442,14 +3583,14 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags); } - auto E = B.buildFRint(Ty, PH, Flags); + auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags); // It is unsafe to contract this fsub into the PH multiply. auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract); auto A = B.buildFAdd(Ty, PHSubE, PL, Flags); auto IntE = B.buildFPTOSI(LLT::scalar(32), E); - auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false) + auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) .addUse(A.getReg(0)) .setMIFlags(Flags); auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags); @@ -3486,27 +3627,26 @@ bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, Register Src1 = MI.getOperand(2).getReg(); unsigned Flags = MI.getFlags(); LLT Ty = B.getMRI()->getType(Dst); - const LLT S16 = LLT::scalar(16); - const LLT S32 = LLT::scalar(32); - - if (Ty == S32) { - auto Log = B.buildFLog2(S32, Src0, Flags); - auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) - .addUse(Log.getReg(0)) - .addUse(Src1) - .setMIFlags(Flags); + const LLT F16 = LLT::float16(); + const LLT F32 = LLT::float32(); + + if (Ty == F32) { + auto Log = B.buildFLog2(F32, Src0, Flags); + auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32}) + .addUse(Log.getReg(0)) + .addUse(Src1) + .setMIFlags(Flags); B.buildFExp2(Dst, Mul, Flags); - } else if (Ty == S16) { + } else if (Ty == F16) { // There's no f16 fmul_legacy, so we need to convert for it. - auto Log = B.buildFLog2(S16, Src0, Flags); - auto Ext0 = B.buildFPExt(S32, Log, Flags); - auto Ext1 = B.buildFPExt(S32, Src1, Flags); - auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) - .addUse(Ext0.getReg(0)) - .addUse(Ext1.getReg(0)) - .setMIFlags(Flags); - - B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); + auto Log = B.buildFLog2(F16, Src0, Flags); + auto Ext0 = B.buildFPExt(F32, Log, Flags); + auto Ext1 = B.buildFPExt(F32, Src1, Flags); + auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32}) + .addUse(Ext0.getReg(0)) + .addUse(Ext1.getReg(0)) + .setMIFlags(Flags); + B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags); } else return false; @@ -3531,11 +3671,11 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, MachineIRBuilder &B) const { const LLT S1 = LLT::scalar(1); - const LLT S64 = LLT::scalar(64); + const LLT F64 = LLT::float64(); Register Dst = MI.getOperand(0).getReg(); Register OrigSrc = MI.getOperand(1).getReg(); unsigned Flags = MI.getFlags(); - assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && + assert(ST.hasFractBug() && MRI.getType(Dst) == F64 && "this should not have been custom lowered"); // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) @@ -3546,9 +3686,9 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, // // Convert floor(x) to (x - fract(x)) - auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) - .addUse(OrigSrc) - .setMIFlags(Flags); + auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64}) + .addUse(OrigSrc) + .setMIFlags(Flags); // Give source modifier matching some assistance before obscuring a foldable // pattern. 
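For reference, the f32 pow lowering above is the identity pow(x, y) = 2^(y * log2 x), with amdgcn_fmul_legacy used for the multiply so that a zero log2 result times an infinite exponent still yields zero rather than NaN; the f16 path only differs in promoting to f32 because there is no f16 fmul_legacy. A minimal scalar sketch of just the identity (assuming x > 0 so log2 is defined, and ignoring the legacy-multiply corner cases):

#include <cmath>

float pow_via_exp2(float x, float y) {
  return std::exp2(std::log2(x) * y);   // pow(x, y) == 2^(y * log2(x))
}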
@@ -3558,9 +3698,9 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, Register ModSrc = stripAnySourceMods(OrigSrc, MRI); auto Const = - B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff)); + B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff)); - Register Min = MRI.createGenericVirtualRegister(S64); + Register Min = MRI.createGenericVirtualRegister(F64); // We don't need to concern ourselves with the snan handling difference, so // use the one which will directly select. @@ -3573,10 +3713,10 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, Register CorrectedFract = Min; if (!MI.getFlag(MachineInstr::FmNoNans)) { auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); - CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); + CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0); } - auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); + auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags); B.buildFAdd(Dst, OrigSrc, NegFract, Flags); MI.eraseFromParent(); @@ -4497,38 +4637,36 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, // 1 / x -> RCP(x) if (CLHS->isExactlyValue(1.0)) { - B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) - .addUse(RHS) - .setMIFlags(Flags); + B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res) + .addUse(RHS) + .setMIFlags(Flags); MI.eraseFromParent(); return true; } - // TODO: Match rsq - // -1 / x -> RCP( FNEG(x) ) if (CLHS->isExactlyValue(-1.0)) { auto FNeg = B.buildFNeg(ResTy, RHS, Flags); - B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) - .addUse(FNeg.getReg(0)) - .setMIFlags(Flags); + B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res) + .addUse(FNeg.getReg(0)) + .setMIFlags(Flags); MI.eraseFromParent(); return true; } } - // For f16 require arcp only. - // For f32 require afn+arcp. + // For f16 require afn or arcp. + // For f32 require afn. 
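Taken together, the constant-numerator rewrites in this hunk reduce to the scalar picture below. rcp() is a stand-in for the hardware reciprocal approximation, which is why the whole transform stays behind the afn/arcp gating noted above; the == checks model what are compile-time tests on a constant LHS in the real code:

static float rcp(float y) { return 1.0f / y; }   // stand-in for the hardware reciprocal

float fast_fdiv(float x, float y) {
  if (x == 1.0f)
    return rcp(y);       // 1 / y  -> RCP(y)
  if (x == -1.0f)
    return rcp(-y);      // -1 / y -> RCP(-y)
  return x * rcp(y);     // x / y  -> x * (1 / y)
}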
if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) || !MI.getFlag(MachineInstr::FmArcp))) return false; // x / y -> x * (1.0 / y) - auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) - .addUse(RHS) - .setMIFlags(Flags); + auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}) + .addUse(RHS) + .setMIFlags(Flags); B.buildFMul(Res, LHS, RCP, Flags); MI.eraseFromParent(); @@ -4554,9 +4692,9 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, auto NegY = B.buildFNeg(ResTy, Y); auto One = B.buildFConstant(ResTy, 1.0); - auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) - .addUse(Y) - .setMIFlags(Flags); + auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}) + .addUse(Y) + .setMIFlags(Flags); auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); R = B.buildFMA(ResTy, Tmp0, R, R); @@ -4590,23 +4728,27 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, auto LHSExt = B.buildFPExt(S32, LHS, Flags); auto RHSExt = B.buildFPExt(S32, RHS, Flags); - auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) - .addUse(RHSExt.getReg(0)) - .setMIFlags(Flags); + auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) + .addUse(RHSExt.getReg(0)) + .setMIFlags(Flags); auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); auto RDst = B.buildFPTrunc(S16, QUOT, Flags); - B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) - .addUse(RDst.getReg(0)) - .addUse(RHS) - .addUse(LHS) - .setMIFlags(Flags); + B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) + .addUse(RDst.getReg(0)) + .addUse(RHS) + .addUse(LHS) + .setMIFlags(Flags); MI.eraseFromParent(); return true; } +static const unsigned SPDenormModeBitField = + AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | + (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); + // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions // to enable denorm mode. When 'Enable' is false, disable denorm mode. static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, @@ -4625,11 +4767,6 @@ static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, .addImm(NewDenormModeValue); } else { - // Select FP32 bit field in mode register. - unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | - (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | - (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); - B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) .addImm(SPDenormMode) .addImm(SPDenormModeBitField); @@ -4656,27 +4793,38 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, auto One = B.buildFConstant(S32, 1.0f); auto DenominatorScaled = - B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) - .addUse(LHS) - .addUse(RHS) - .addImm(0) - .setMIFlags(Flags); + B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}) + .addUse(LHS) + .addUse(RHS) + .addImm(0) + .setMIFlags(Flags); auto NumeratorScaled = - B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) - .addUse(LHS) - .addUse(RHS) - .addImm(1) - .setMIFlags(Flags); - - auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) - .addUse(DenominatorScaled.getReg(0)) - .setMIFlags(Flags); + B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}) + .addUse(LHS) + .addUse(RHS) + .addImm(1) + .setMIFlags(Flags); + + auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) + .addUse(DenominatorScaled.getReg(0)) + .setMIFlags(Flags); auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); - // FIXME: Doesn't correctly model the FP mode switch, and the FP operations - // aren't modeled as reading it. 
- if (Mode.FP32Denormals != DenormalMode::getIEEE()) + const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE(); + const bool HasDynamicDenormals = + (Mode.FP32Denormals.Input == DenormalMode::Dynamic) || + (Mode.FP32Denormals.Output == DenormalMode::Dynamic); + + Register SavedSPDenormMode; + if (!PreservesDenormals) { + if (HasDynamicDenormals) { + SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + B.buildInstr(AMDGPU::S_GETREG_B32) + .addDef(SavedSPDenormMode) + .addImm(SPDenormModeBitField); + } toggleSPDenormMode(true, B, ST, Mode); + } auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); @@ -4685,23 +4833,28 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); - // FIXME: This mishandles dynamic denormal mode. We need to query the - // current mode and restore the original. - if (Mode.FP32Denormals != DenormalMode::getIEEE()) - toggleSPDenormMode(false, B, ST, Mode); + if (!PreservesDenormals) { + if (HasDynamicDenormals) { + assert(SavedSPDenormMode); + B.buildInstr(AMDGPU::S_SETREG_B32) + .addReg(SavedSPDenormMode) + .addImm(SPDenormModeBitField); + } else + toggleSPDenormMode(false, B, ST, Mode); + } - auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) - .addUse(Fma4.getReg(0)) - .addUse(Fma1.getReg(0)) - .addUse(Fma3.getReg(0)) - .addUse(NumeratorScaled.getReg(1)) - .setMIFlags(Flags); + auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}) + .addUse(Fma4.getReg(0)) + .addUse(Fma1.getReg(0)) + .addUse(Fma3.getReg(0)) + .addUse(NumeratorScaled.getReg(1)) + .setMIFlags(Flags); - B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) - .addUse(Fmas.getReg(0)) - .addUse(RHS) - .addUse(LHS) - .setMIFlags(Flags); + B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) + .addUse(Fmas.getReg(0)) + .addUse(RHS) + .addUse(LHS) + .setMIFlags(Flags); MI.eraseFromParent(); return true; @@ -4724,27 +4877,27 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, auto One = B.buildFConstant(S64, 1.0); - auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) - .addUse(LHS) - .addUse(RHS) - .addImm(0) - .setMIFlags(Flags); + auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}) + .addUse(LHS) + .addUse(RHS) + .addImm(0) + .setMIFlags(Flags); auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); - auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) - .addUse(DivScale0.getReg(0)) - .setMIFlags(Flags); + auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}) + .addUse(DivScale0.getReg(0)) + .setMIFlags(Flags); auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); - auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) - .addUse(LHS) - .addUse(RHS) - .addImm(1) - .setMIFlags(Flags); + auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}) + .addUse(LHS) + .addUse(RHS) + .addImm(1) + .setMIFlags(Flags); auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); @@ -4771,14 +4924,14 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, Scale = DivScale1.getReg(1); } - auto Fmas = 
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) - .addUse(Fma4.getReg(0)) - .addUse(Fma3.getReg(0)) - .addUse(Mul.getReg(0)) - .addUse(Scale) - .setMIFlags(Flags); + auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}) + .addUse(Fma4.getReg(0)) + .addUse(Fma3.getReg(0)) + .addUse(Mul.getReg(0)) + .addUse(Scale) + .setMIFlags(Flags); - B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false) + B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res)) .addUse(Fmas.getReg(0)) .addUse(RHS) .addUse(LHS) @@ -4799,10 +4952,10 @@ bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI, LLT Ty = MRI.getType(Res0); LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32); - auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty}, false) + auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty}) .addUse(Val) .setMIFlags(Flags); - auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy}, false) + auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy}) .addUse(Val) .setMIFlags(Flags); @@ -4846,9 +4999,9 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); - auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) - .addUse(Mul0.getReg(0)) - .setMIFlags(Flags); + auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) + .addUse(Mul0.getReg(0)) + .setMIFlags(Flags); auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); @@ -4858,9 +5011,107 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, return true; } -bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B) const { +bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + // Bypass the correct expansion a standard promotion through G_FSQRT would + // get. The f32 op is accurate enough for the f16 cas. 
+ unsigned Flags = MI.getFlags(); + assert(!ST.has16BitInsts()); + const LLT F32 = LLT::scalar(32); + auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags); + auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32}) + .addUse(Ext.getReg(0)) + .setMIFlags(Flags); + B.buildFPTrunc(MI.getOperand(0), Log2, Flags); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + MachineFunction &MF = B.getMF(); + Register Dst = MI.getOperand(0).getReg(); + Register X = MI.getOperand(1).getReg(); + const unsigned Flags = MI.getFlags(); + const LLT S1 = LLT::scalar(1); + const LLT F32 = LLT::scalar(32); + const LLT I32 = LLT::scalar(32); + + if (allowApproxFunc(MF, Flags)) { + B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst})) + .addUse(X) + .setMIFlags(Flags); + MI.eraseFromParent(); + return true; + } + + auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f); + auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags); + auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f); + auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags); + auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags); + + Register SqrtS = MRI.createGenericVirtualRegister(F32); + if (needsDenormHandlingF32(MF, X, Flags)) { + B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS})) + .addUse(SqrtX.getReg(0)) + .setMIFlags(Flags); + + auto NegOne = B.buildConstant(I32, -1); + auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne); + + auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags); + auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags); + + auto PosOne = B.buildConstant(I32, 1); + auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne); + + auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags); + auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags); + + auto Zero = B.buildFConstant(F32, 0.0f); + auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags); + + SqrtS = + B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0); + + auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags); + SqrtS = + B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0); + } else { + auto SqrtR = + B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0)); + B.buildFMul(SqrtS, SqrtX, SqrtR, Flags); + + auto Half = B.buildFConstant(F32, 0.5f); + auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags); + auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags); + auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags); + SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags); + SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0); + auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags); + auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags); + SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0); + } + + auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f); + + auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags); + + SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0); + + auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); + B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags); + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { // For double type, the SQRT and RSQ instructions don't have 
required // precision, we apply Goldschmidt's algorithm to improve the result: // @@ -4901,8 +5152,8 @@ bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt); auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags); - auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false) - .addReg(SqrtX.getReg(0)); + auto SqrtY = + B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0)); auto Half = B.buildFConstant(F64, 0.5); auto SqrtH0 = B.buildFMul(F64, SqrtY, Half); @@ -4942,6 +5193,19 @@ bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, return true; } +bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + if (Ty == LLT::scalar(32)) + return legalizeFSQRTF32(MI, MRI, B); + if (Ty == LLT::scalar(64)) + return legalizeFSQRTF64(MI, MRI, B); + if (Ty == LLT::scalar(16)) + return legalizeFSQRTF16(MI, MRI, B); + return false; +} + // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. // FIXME: Why do we handle this one but not other removed instructions? // @@ -4968,9 +5232,9 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, else return false; - auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) - .addUse(Src) - .setMIFlags(Flags); + auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}) + .addUse(Src) + .setMIFlags(Flags); // We don't need to concern ourselves with the snan handling difference, since // the rsq quieted (or not) so use the one which will directly select. @@ -5153,7 +5417,7 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, std::pair<Register, unsigned> AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const { - const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); + const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST); Register BaseReg; unsigned ImmOffset; const LLT S32 = LLT::scalar(32); @@ -5631,31 +5895,23 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap || IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap || IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap; - const bool HasReturn = MI.getNumExplicitDefs() != 0; - - Register Dst; - - int OpOffset = 0; - if (HasReturn) { - // A few FP atomics do not support return values. - Dst = MI.getOperand(0).getReg(); - } else { - OpOffset = -1; - } + Register Dst = MI.getOperand(0).getReg(); // Since we don't have 128-bit atomics, we don't need to handle the case of // p8 argmunents to the atomic itself - Register VData = MI.getOperand(2 + OpOffset).getReg(); + Register VData = MI.getOperand(2).getReg(); + Register CmpVal; + int OpOffset = 0; if (IsCmpSwap) { - CmpVal = MI.getOperand(3 + OpOffset).getReg(); + CmpVal = MI.getOperand(3).getReg(); ++OpOffset; } castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset); Register RSrc = MI.getOperand(3 + OpOffset).getReg(); - const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; + const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; // The struct intrinsic variants add one additional operand over raw. 
const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; @@ -5676,12 +5932,9 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, unsigned ImmOffset; std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); - auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); - - if (HasReturn) - MIB.addDef(Dst); - - MIB.addUse(VData); // vdata + auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) + .addDef(Dst) + .addUse(VData); // vdata if (IsCmpSwap) MIB.addReg(CmpVal); @@ -5903,53 +6156,52 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( return false; } - const unsigned NSAMaxSize = ST.getNSAMaxSize(); + const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler); const unsigned HasPartialNSA = ST.hasPartialNSAEncoding(); if (IsA16 || IsG16) { - if (Intr->NumVAddrs > 1) { - SmallVector<Register, 4> PackedRegs; + // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the + // instructions expect VGPR_32 + SmallVector<Register, 4> PackedRegs; - packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, - IsG16); - - // See also below in the non-a16 branch - const bool UseNSA = ST.hasNSAEncoding() && - PackedRegs.size() >= ST.getNSAThreshold(MF) && - (PackedRegs.size() <= NSAMaxSize || HasPartialNSA); - const bool UsePartialNSA = - UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize; - - if (UsePartialNSA) { - // Pack registers that would go over NSAMaxSize into last VAddr register - LLT PackedAddrTy = - LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16); - auto Concat = B.buildConcatVectors( - PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1)); - PackedRegs[NSAMaxSize - 1] = Concat.getReg(0); - PackedRegs.resize(NSAMaxSize); - } else if (!UseNSA && PackedRegs.size() > 1) { - LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); - auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); - PackedRegs[0] = Concat.getReg(0); - PackedRegs.resize(1); - } + packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16); - const unsigned NumPacked = PackedRegs.size(); - for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { - MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); - if (!SrcOp.isReg()) { - assert(SrcOp.isImm() && SrcOp.getImm() == 0); - continue; - } + // See also below in the non-a16 branch + const bool UseNSA = ST.hasNSAEncoding() && + PackedRegs.size() >= ST.getNSAThreshold(MF) && + (PackedRegs.size() <= NSAMaxSize || HasPartialNSA); + const bool UsePartialNSA = + UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize; - assert(SrcOp.getReg() != AMDGPU::NoRegister); + if (UsePartialNSA) { + // Pack registers that would go over NSAMaxSize into last VAddr register + LLT PackedAddrTy = + LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16); + auto Concat = B.buildConcatVectors( + PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1)); + PackedRegs[NSAMaxSize - 1] = Concat.getReg(0); + PackedRegs.resize(NSAMaxSize); + } else if (!UseNSA && PackedRegs.size() > 1) { + LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); + auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); + PackedRegs[0] = Concat.getReg(0); + PackedRegs.resize(1); + } - if (I - Intr->VAddrStart < NumPacked) - SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); - else - SrcOp.setReg(AMDGPU::NoRegister); + const unsigned NumPacked = PackedRegs.size(); + for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { + MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); + if 
(!SrcOp.isReg()) { + assert(SrcOp.isImm() && SrcOp.getImm() == 0); + continue; } + + assert(SrcOp.getReg() != AMDGPU::NoRegister); + + if (I - Intr->VAddrStart < NumPacked) + SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); + else + SrcOp.setReg(AMDGPU::NoRegister); } } else { // If the register allocator cannot place the address registers contiguously @@ -5964,7 +6216,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. // - // Partial NSA is allowed on GFX11 where the final register is a contiguous + // Partial NSA is allowed on GFX11+ where the final register is a contiguous // set of the remaining addresses. const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= ST.getNSAThreshold(MF) && @@ -6195,13 +6447,11 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( // Handle needing to s.buffer.load() a p8 value. if (hasBufferRsrcWorkaround(Ty)) { Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0); - Dst = MI.getOperand(0).getReg(); B.setInsertPt(B.getMBB(), MI); } if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { Ty = getBitcastRegisterType(Ty); Helper.bitcastDst(MI, Ty, 0); - Dst = MI.getOperand(0).getReg(); B.setInsertPt(B.getMBB(), MI); } @@ -6222,10 +6472,10 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( MemSize, MemAlign); MI.addMemOperand(MF, MMO); - // There are no 96-bit result scalar loads, but widening to 128-bit should + // If we don't have 96-bit result scalar loads, widening to 128-bit should // always be legal. We may need to restore this to a 96-bit result if it turns // out this needs to be converted to a vector load during RegBankSelect. - if (!isPowerOf2_32(Size)) { + if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) { if (Ty.isVector()) Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); else @@ -6244,11 +6494,6 @@ bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) return legalizeTrapEndpgm(MI, MRI, B); - const Module *M = B.getMF().getFunction().getParent(); - unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M); - if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3) - return legalizeTrapHsaQueuePtr(MI, MRI, B); - return ST.supportsGetDoorbellID() ? legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B); } @@ -6395,13 +6640,17 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, return false; } + const bool IsGFX11 = AMDGPU::isGFX11(ST); const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); + const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST); const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; const unsigned NumVDataDwords = 4; const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; - const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); + const bool UseNSA = + IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize()); + const unsigned BaseOpcodes[2][2] = { {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, @@ -6409,14 +6658,16 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, int Opcode; if (UseNSA) { Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], - IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA + IsGFX12Plus ? 
AMDGPU::MIMGEncGfx12 + : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA : AMDGPU::MIMGEncGfx10NSA, NumVDataDwords, NumVAddrDwords); } else { - Opcode = AMDGPU::getMIMGOpcode( - BaseOpcodes[Is64][IsA16], - IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default, - NumVDataDwords, NumVAddrDwords); + assert(!IsGFX12Plus); + Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], + IsGFX11 ? AMDGPU::MIMGEncGfx11Default + : AMDGPU::MIMGEncGfx10Default, + NumVDataDwords, NumVAddrDwords); } assert(Opcode != -1); @@ -6539,13 +6790,23 @@ bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, return true; } +bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI, + MachineIRBuilder &B) const { + const SITargetLowering *TLI = ST.getTargetLowering(); + Register StackPtr = TLI->getStackPointerRegisterToSaveRestore(); + Register DstReg = MI.getOperand(0).getReg(); + B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr}); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { MachineIRBuilder &B = Helper.MIRBuilder; MachineRegisterInfo &MRI = *B.getMRI(); // Replace the use G_BRCOND with the exec manipulate and branch pseudos. - auto IntrID = MI.getIntrinsicID(); + auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); switch (IntrID) { case Intrinsic::amdgcn_if: case Intrinsic::amdgcn_else: { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 534bb2c87ea3..855fa0ddc214 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -43,8 +43,8 @@ public: bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; - bool legalizeFrint(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B) const; + bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, @@ -68,6 +68,10 @@ public: const GlobalValue *GV, int64_t Offset, unsigned GAFlags = SIInstrInfo::MO_NONE) const; + void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, + const GlobalValue *GV, + MachineRegisterInfo &MRI) const; + bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const; @@ -157,6 +161,12 @@ public: bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; @@ -201,6 +211,7 @@ public: bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeImageIntrinsic( MachineInstr &MI, MachineIRBuilder &B, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 44ce1e15f0ef..0c21382e5c22 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -14,18 +14,22 @@ #include "AMDGPU.h" #include "AMDGPULibFunc.h" #include "GCNSubtarget.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/AttributeMask.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" -#include "llvm/Target/TargetMachine.h" #include <cmath> #define DEBUG_TYPE "amdgpu-simplifylib" using namespace llvm; +using namespace llvm::PatternMatch; static cl::opt<bool> EnablePreLink("amdgpu-prelink", cl::desc("Enable pre-link mode optimizations"), @@ -46,10 +50,13 @@ namespace llvm { class AMDGPULibCalls { private: + const TargetLibraryInfo *TLInfo = nullptr; + AssumptionCache *AC = nullptr; + DominatorTree *DT = nullptr; typedef llvm::AMDGPULibFunc FuncInfo; - const TargetMachine *TM; + bool UnsafeFPMath = false; // -fuse-native. bool AllNative = false; @@ -66,64 +73,76 @@ private: /* Specialized optimizations */ - // recip (half or native) - bool fold_recip(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - - // divide (half or native) - bool fold_divide(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - // pow/powr/pown - bool fold_pow(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo); // rootn - bool fold_rootn(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - - // fma/mad - bool fold_fma_mad(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo); // -fuse-native for sincos bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo); // evaluate calls if calls' arguments are constants. - bool evaluateScalarMathFunc(const FuncInfo &FInfo, double& Res0, - double& Res1, Constant *copr0, Constant *copr1, Constant *copr2); + bool evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, double &Res1, + Constant *copr0, Constant *copr1); bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo); // sqrt - bool fold_sqrt(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + bool fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo); + + /// Insert a value to sincos function \p Fsincos. Returns (value of sin, value + /// of cos, sincos call). + std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg, + FastMathFlags FMF, + IRBuilder<> &B, + FunctionCallee Fsincos); // sin/cos - bool fold_sincos(CallInst * CI, IRBuilder<> &B, AliasAnalysis * AA); + bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo); // __read_pipe/__write_pipe bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - // llvm.amdgcn.wavefrontsize - bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B); - - // Get insertion point at entry. - BasicBlock::iterator getEntryIns(CallInst * UI); - // Insert an Alloc instruction. 
- AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix); // Get a scalar native builtin single argument FP function FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo); + /// Substitute a call to a known libcall with an intrinsic call. If \p + /// AllowMinSize is true, allow the replacement in a minsize function. + bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI, + bool AllowMinSizeF32 = false, + bool AllowF64 = false, + bool AllowStrictFP = false); + void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI, + Intrinsic::ID IntrID); + + bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI, + Intrinsic::ID IntrID, + bool AllowMinSizeF32 = false, + bool AllowF64 = false, + bool AllowStrictFP = false); + protected: - CallInst *CI; + bool isUnsafeMath(const FPMathOperator *FPOp) const; + bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const; + + bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const; - bool isUnsafeMath(const CallInst *CI) const; + static void replaceCall(Instruction *I, Value *With) { + I->replaceAllUsesWith(With); + I->eraseFromParent(); + } - void replaceCall(Value *With) { - CI->replaceAllUsesWith(With); - CI->eraseFromParent(); + static void replaceCall(FPMathOperator *I, Value *With) { + replaceCall(cast<Instruction>(I), With); } public: - AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {} + AMDGPULibCalls() {} - bool fold(CallInst *CI, AliasAnalysis *AA = nullptr); + bool fold(CallInst *CI); + void initFunction(Function &F, FunctionAnalysisManager &FAM); void initNativeFuncs(); // Replace a normal math function call with that native version @@ -132,57 +151,6 @@ public: } // end llvm namespace -namespace { - - class AMDGPUSimplifyLibCalls : public FunctionPass { - - AMDGPULibCalls Simplifier; - - public: - static char ID; // Pass identification - - AMDGPUSimplifyLibCalls(const TargetMachine *TM = nullptr) - : FunctionPass(ID), Simplifier(TM) { - initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AAResultsWrapperPass>(); - } - - bool runOnFunction(Function &M) override; - }; - - class AMDGPUUseNativeCalls : public FunctionPass { - - AMDGPULibCalls Simplifier; - - public: - static char ID; // Pass identification - - AMDGPUUseNativeCalls() : FunctionPass(ID) { - initializeAMDGPUUseNativeCallsPass(*PassRegistry::getPassRegistry()); - Simplifier.initNativeFuncs(); - } - - bool runOnFunction(Function &F) override; - }; - -} // end anonymous namespace. 
- -char AMDGPUSimplifyLibCalls::ID = 0; -char AMDGPUUseNativeCalls::ID = 0; - -INITIALIZE_PASS_BEGIN(AMDGPUSimplifyLibCalls, "amdgpu-simplifylib", - "Simplify well-known AMD library calls", false, false) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(AMDGPUSimplifyLibCalls, "amdgpu-simplifylib", - "Simplify well-known AMD library calls", false, false) - -INITIALIZE_PASS(AMDGPUUseNativeCalls, "amdgpu-usenative", - "Replace builtin math calls with that native versions.", - false, false) - template <typename IRB> static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg, const Twine &Name = "") { @@ -201,6 +169,15 @@ static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1, return R; } +static FunctionType *getPownType(FunctionType *FT) { + Type *PowNExpTy = Type::getInt32Ty(FT->getContext()); + if (VectorType *VecTy = dyn_cast<VectorType>(FT->getReturnType())) + PowNExpTy = VectorType::get(PowNExpTy, VecTy->getElementCount()); + + return FunctionType::get(FT->getReturnType(), + {FT->getParamType(0), PowNExpTy}, false); +} + // Data structures for table-driven optimizations. // FuncTbl works for both f32 and f64 functions with 1 input argument @@ -444,13 +421,26 @@ bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName, return AMDGPULibFunc::parse(FMangledName, FInfo); } -bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const { - if (auto Op = dyn_cast<FPMathOperator>(CI)) - if (Op->isFast()) - return true; - const Function *F = CI->getParent()->getParent(); - Attribute Attr = F->getFnAttribute("unsafe-fp-math"); - return Attr.getValueAsBool(); +bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const { + return UnsafeFPMath || FPOp->isFast(); +} + +bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const { + return UnsafeFPMath || + (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs()); +} + +bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold( + const FPMathOperator *FPOp) const { + // TODO: Refine to approxFunc or contract + return isUnsafeMath(FPOp); +} + +void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) { + UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool(); + AC = &FAM.getResult<AssumptionAnalysis>(F); + TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F); + DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); } bool AMDGPULibCalls::useNativeFunc(const StringRef F) const { @@ -490,7 +480,7 @@ bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) { DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI << " with native version of sin/cos"); - replaceCall(sinval); + replaceCall(aCI, sinval); return true; } } @@ -498,8 +488,9 @@ bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) { } bool AMDGPULibCalls::useNative(CallInst *aCI) { - CI = aCI; Function *Callee = aCI->getCalledFunction(); + if (!Callee || aCI->isNoBuiltin()) + return false; FuncInfo FInfo; if (!parseFunctionName(Callee->getName(), FInfo) || !FInfo.isMangled() || @@ -538,29 +529,25 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, assert(Callee->hasName() && "Invalid read_pipe/write_pipe function"); auto *M = Callee->getParent(); - auto &Ctx = M->getContext(); std::string Name = std::string(Callee->getName()); auto NumArg = CI->arg_size(); if (NumArg != 4 && NumArg != 6) return false; - auto *PacketSize = CI->getArgOperand(NumArg - 2); - auto *PacketAlign = CI->getArgOperand(NumArg - 
1); - if (!isa<ConstantInt>(PacketSize) || !isa<ConstantInt>(PacketAlign)) + ConstantInt *PacketSize = + dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 2)); + ConstantInt *PacketAlign = + dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 1)); + if (!PacketSize || !PacketAlign) return false; - unsigned Size = cast<ConstantInt>(PacketSize)->getZExtValue(); - Align Alignment = cast<ConstantInt>(PacketAlign)->getAlignValue(); + + unsigned Size = PacketSize->getZExtValue(); + Align Alignment = PacketAlign->getAlignValue(); if (Alignment != Size) return false; - Type *PtrElemTy; - if (Size <= 8) - PtrElemTy = Type::getIntNTy(Ctx, Size * 8); - else - PtrElemTy = FixedVectorType::get(Type::getInt64Ty(Ctx), Size / 8); unsigned PtrArgLoc = CI->arg_size() - 3; - auto PtrArg = CI->getArgOperand(PtrArgLoc); - unsigned PtrArgAS = PtrArg->getType()->getPointerAddressSpace(); - auto *PtrTy = llvm::PointerType::get(PtrElemTy, PtrArgAS); + Value *PtrArg = CI->getArgOperand(PtrArgLoc); + Type *PtrTy = PtrArg->getType(); SmallVector<llvm::Type *, 6> ArgTys; for (unsigned I = 0; I != PtrArgLoc; ++I) @@ -575,11 +562,10 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, if (!F) return false; - auto *BCast = B.CreatePointerCast(PtrArg, PtrTy); SmallVector<Value *, 6> Args; for (unsigned I = 0; I != PtrArgLoc; ++I) Args.push_back(CI->getArgOperand(I)); - Args.push_back(BCast); + Args.push_back(PtrArg); auto *NCI = B.CreateCall(F, Args); NCI->setAttributes(CI->getAttributes()); @@ -590,99 +576,242 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, return true; } -// This function returns false if no change; return true otherwise. -bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) { - this->CI = CI; - Function *Callee = CI->getCalledFunction(); +static bool isKnownIntegral(const Value *V, const DataLayout &DL, + FastMathFlags FMF) { + if (isa<UndefValue>(V)) + return true; - // Ignore indirect calls. - if (Callee == nullptr) - return false; + if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) + return CF->getValueAPF().isInteger(); - BasicBlock *BB = CI->getParent(); - LLVMContext &Context = CI->getParent()->getContext(); - IRBuilder<> B(Context); + if (const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(V)) { + for (unsigned i = 0, e = CDV->getNumElements(); i != e; ++i) { + Constant *ConstElt = CDV->getElementAsConstant(i); + if (isa<UndefValue>(ConstElt)) + continue; + const ConstantFP *CFP = dyn_cast<ConstantFP>(ConstElt); + if (!CFP || !CFP->getValue().isInteger()) + return false; + } - // Set the builder to the instruction after the call. - B.SetInsertPoint(BB, CI->getIterator()); + return true; + } - // Copy fast flags from the original call. - if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI)) - B.setFastMathFlags(FPOp->getFastMathFlags()); + const Instruction *I = dyn_cast<Instruction>(V); + if (!I) + return false; + + switch (I->getOpcode()) { + case Instruction::SIToFP: + case Instruction::UIToFP: + // TODO: Could check nofpclass(inf) on incoming argument + if (FMF.noInfs()) + return true; - switch (Callee->getIntrinsicID()) { + // Need to check int size cannot produce infinity, which computeKnownFPClass + // knows how to do already. 
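// Editor's aside (not part of the patch): a small sketch of the constant case
// that isKnownIntegral handles above. APFloat::isInteger() is the check that
// lets a literal exponent be treated as integral; FltTy here stands for an f32
// Type and is only illustrative.
auto *Four = cast<ConstantFP>(ConstantFP::get(FltTy, 4.0));
auto *Half = cast<ConstantFP>(ConstantFP::get(FltTy, 0.5));
assert(Four->getValueAPF().isInteger());  // 4.0 is integral, so pow(x, 4.0) may become pown(x, 4)
assert(!Half->getValueAPF().isInteger()); // 0.5 is not, so that rewrite is skipped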
+ return isKnownNeverInfinity(I, DL); + case Instruction::Call: { + const CallInst *CI = cast<CallInst>(I); + switch (CI->getIntrinsicID()) { + case Intrinsic::trunc: + case Intrinsic::floor: + case Intrinsic::ceil: + case Intrinsic::rint: + case Intrinsic::nearbyint: + case Intrinsic::round: + case Intrinsic::roundeven: + return (FMF.noInfs() && FMF.noNaNs()) || + isKnownNeverInfOrNaN(I, DL, nullptr); + default: + break; + } + + break; + } default: break; - case Intrinsic::amdgcn_wavefrontsize: - return !EnablePreLink && fold_wavefrontsize(CI, B); } + return false; +} + +// This function returns false if no change; return true otherwise. +bool AMDGPULibCalls::fold(CallInst *CI) { + Function *Callee = CI->getCalledFunction(); + // Ignore indirect calls. + if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin()) + return false; + FuncInfo FInfo; if (!parseFunctionName(Callee->getName(), FInfo)) return false; // Further check the number of arguments to see if they match. - if (CI->arg_size() != FInfo.getNumArgs()) + // TODO: Check calling convention matches too + if (!FInfo.isCompatibleSignature(CI->getFunctionType())) return false; - if (TDOFold(CI, FInfo)) - return true; + LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n'); - // Under unsafe-math, evaluate calls if possible. - // According to Brian Sumner, we can do this for all f32 function calls - // using host's double function calls. - if (isUnsafeMath(CI) && evaluateCall(CI, FInfo)) + if (TDOFold(CI, FInfo)) return true; - // Specialized optimizations for each function call - switch (FInfo.getId()) { - case AMDGPULibFunc::EI_RECIP: - // skip vector function - assert ((FInfo.getPrefix() == AMDGPULibFunc::NATIVE || - FInfo.getPrefix() == AMDGPULibFunc::HALF) && - "recip must be an either native or half function"); - return (getVecSize(FInfo) != 1) ? false : fold_recip(CI, B, FInfo); + IRBuilder<> B(CI); - case AMDGPULibFunc::EI_DIVIDE: - // skip vector function - assert ((FInfo.getPrefix() == AMDGPULibFunc::NATIVE || - FInfo.getPrefix() == AMDGPULibFunc::HALF) && - "divide must be an either native or half function"); - return (getVecSize(FInfo) != 1) ? false : fold_divide(CI, B, FInfo); - - case AMDGPULibFunc::EI_POW: - case AMDGPULibFunc::EI_POWR: - case AMDGPULibFunc::EI_POWN: - return fold_pow(CI, B, FInfo); + if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI)) { + // Under unsafe-math, evaluate calls if possible. + // According to Brian Sumner, we can do this for all f32 function calls + // using host's double function calls. + if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(CI, FInfo)) + return true; - case AMDGPULibFunc::EI_ROOTN: - // skip vector function - return (getVecSize(FInfo) != 1) ? false : fold_rootn(CI, B, FInfo); + // Copy fast flags from the original call. + FastMathFlags FMF = FPOp->getFastMathFlags(); + B.setFastMathFlags(FMF); + + // Specialized optimizations for each function call. + // + // TODO: Handle other simple intrinsic wrappers. Sqrt. 
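// Editor's aside (not part of the patch): the tryReplaceLibcallWithSimpleIntrinsic
// cases below amount to a call-site rewrite rather than new instructions, e.g.
// for a scalar f32 fmin (mangled name illustrative):
//   %r = call float @_Z4fminff(float %a, float %b)
//     ==>
//   %r = call float @llvm.minnum.f32(float %a, float %b)
// with the original operands and fast-math flags left in place.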
+ // + // TODO: Handle native functions + switch (FInfo.getId()) { + case AMDGPULibFunc::EI_EXP: + if (FMF.none()) + return false; + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp, + FMF.approxFunc()); + case AMDGPULibFunc::EI_EXP2: + if (FMF.none()) + return false; + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp2, + FMF.approxFunc()); + case AMDGPULibFunc::EI_LOG: + if (FMF.none()) + return false; + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log, + FMF.approxFunc()); + case AMDGPULibFunc::EI_LOG2: + if (FMF.none()) + return false; + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log2, + FMF.approxFunc()); + case AMDGPULibFunc::EI_LOG10: + if (FMF.none()) + return false; + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log10, + FMF.approxFunc()); + case AMDGPULibFunc::EI_FMIN: + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::minnum, + true, true); + case AMDGPULibFunc::EI_FMAX: + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::maxnum, + true, true); + case AMDGPULibFunc::EI_FMA: + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fma, true, + true); + case AMDGPULibFunc::EI_MAD: + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fmuladd, + true, true); + case AMDGPULibFunc::EI_FABS: + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fabs, true, + true, true); + case AMDGPULibFunc::EI_COPYSIGN: + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::copysign, + true, true, true); + case AMDGPULibFunc::EI_FLOOR: + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::floor, true, + true); + case AMDGPULibFunc::EI_CEIL: + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::ceil, true, + true); + case AMDGPULibFunc::EI_TRUNC: + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::trunc, true, + true); + case AMDGPULibFunc::EI_RINT: + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::rint, true, + true); + case AMDGPULibFunc::EI_ROUND: + return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::round, true, + true); + case AMDGPULibFunc::EI_LDEXP: { + if (!shouldReplaceLibcallWithIntrinsic(CI, true, true)) + return false; - case AMDGPULibFunc::EI_FMA: - case AMDGPULibFunc::EI_MAD: - case AMDGPULibFunc::EI_NFMA: - // skip vector function - return (getVecSize(FInfo) != 1) ? 
false : fold_fma_mad(CI, B, FInfo); + Value *Arg1 = CI->getArgOperand(1); + if (VectorType *VecTy = dyn_cast<VectorType>(CI->getType()); + VecTy && !isa<VectorType>(Arg1->getType())) { + Value *SplatArg1 = B.CreateVectorSplat(VecTy->getElementCount(), Arg1); + CI->setArgOperand(1, SplatArg1); + } - case AMDGPULibFunc::EI_SQRT: - return isUnsafeMath(CI) && fold_sqrt(CI, B, FInfo); - case AMDGPULibFunc::EI_COS: - case AMDGPULibFunc::EI_SIN: - if ((getArgType(FInfo) == AMDGPULibFunc::F32 || - getArgType(FInfo) == AMDGPULibFunc::F64) - && (FInfo.getPrefix() == AMDGPULibFunc::NOPFX)) - return fold_sincos(CI, B, AA); + CI->setCalledFunction(Intrinsic::getDeclaration( + CI->getModule(), Intrinsic::ldexp, + {CI->getType(), CI->getArgOperand(1)->getType()})); + return true; + } + case AMDGPULibFunc::EI_POW: { + Module *M = Callee->getParent(); + AMDGPULibFunc PowrInfo(AMDGPULibFunc::EI_POWR, FInfo); + FunctionCallee PowrFunc = getFunction(M, PowrInfo); + CallInst *Call = cast<CallInst>(FPOp); + + // pow(x, y) -> powr(x, y) for x >= -0.0 + // TODO: Account for flags on current call + if (PowrFunc && + cannotBeOrderedLessThanZero(FPOp->getOperand(0), M->getDataLayout(), + TLInfo, 0, AC, Call, DT)) { + Call->setCalledFunction(PowrFunc); + return fold_pow(FPOp, B, PowrInfo) || true; + } - break; - case AMDGPULibFunc::EI_READ_PIPE_2: - case AMDGPULibFunc::EI_READ_PIPE_4: - case AMDGPULibFunc::EI_WRITE_PIPE_2: - case AMDGPULibFunc::EI_WRITE_PIPE_4: - return fold_read_write_pipe(CI, B, FInfo); + // pow(x, y) -> pown(x, y) for known integral y + if (isKnownIntegral(FPOp->getOperand(1), M->getDataLayout(), + FPOp->getFastMathFlags())) { + FunctionType *PownType = getPownType(CI->getFunctionType()); + AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true); + FunctionCallee PownFunc = getFunction(M, PownInfo); + if (PownFunc) { + // TODO: If the incoming integral value is an sitofp/uitofp, it won't + // fold out without a known range. We can probably take the source + // value directly. + Value *CastedArg = + B.CreateFPToSI(FPOp->getOperand(1), PownType->getParamType(1)); + // Have to drop any nofpclass attributes on the original call site. 
+ Call->removeParamAttrs( + 1, AttributeFuncs::typeIncompatible(CastedArg->getType())); + Call->setCalledFunction(PownFunc); + Call->setArgOperand(1, CastedArg); + return fold_pow(FPOp, B, PownInfo) || true; + } + } - default: - break; + return fold_pow(FPOp, B, FInfo); + } + case AMDGPULibFunc::EI_POWR: + case AMDGPULibFunc::EI_POWN: + return fold_pow(FPOp, B, FInfo); + case AMDGPULibFunc::EI_ROOTN: + return fold_rootn(FPOp, B, FInfo); + case AMDGPULibFunc::EI_SQRT: + return fold_sqrt(FPOp, B, FInfo); + case AMDGPULibFunc::EI_COS: + case AMDGPULibFunc::EI_SIN: + return fold_sincos(FPOp, B, FInfo); + default: + break; + } + } else { + // Specialized optimizations for each function call + switch (FInfo.getId()) { + case AMDGPULibFunc::EI_READ_PIPE_2: + case AMDGPULibFunc::EI_READ_PIPE_4: + case AMDGPULibFunc::EI_WRITE_PIPE_2: + case AMDGPULibFunc::EI_WRITE_PIPE_4: + return fold_read_write_pipe(CI, B, FInfo); + default: + break; + } } return false; @@ -731,7 +860,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { nval = ConstantDataVector::get(context, tmp); } LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n"); - replaceCall(nval); + replaceCall(CI, nval); return true; } } else { @@ -741,7 +870,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { if (CF->isExactlyValue(tr[i].input)) { Value *nval = ConstantFP::get(CF->getType(), tr[i].result); LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n"); - replaceCall(nval); + replaceCall(CI, nval); return true; } } @@ -751,45 +880,6 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { return false; } -// [native_]half_recip(c) ==> 1.0/c -bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B, - const FuncInfo &FInfo) { - Value *opr0 = CI->getArgOperand(0); - if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) { - // Just create a normal div. Later, InstCombine will be able - // to compute the divide into a constant (avoid check float infinity - // or subnormal at this point). 
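// Editor's aside (not part of the patch): the net effect of the pow -> pown
// rewrite above on a scalar f32 call whose exponent is known integral (mangled
// names are illustrative; the real ones come from AMDGPULibFunc::mangle()):
//   %r = call fast float @_Z3powff(float %x, float %y)
//     ==>
//   %yi = fptosi float %y to i32
//   %r  = call fast float @_Z4pownfi(float %x, i32 %yi)
// fold_pow() then gets another look at the pown form, which is why the call
// site returns "fold_pow(FPOp, B, PownInfo) || true".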
- Value *nval = B.CreateFDiv(ConstantFP::get(CF->getType(), 1.0), - opr0, - "recip2div"); - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n"); - replaceCall(nval); - return true; - } - return false; -} - -// [native_]half_divide(x, c) ==> x/c -bool AMDGPULibCalls::fold_divide(CallInst *CI, IRBuilder<> &B, - const FuncInfo &FInfo) { - Value *opr0 = CI->getArgOperand(0); - Value *opr1 = CI->getArgOperand(1); - ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0); - ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1); - - if ((CF0 && CF1) || // both are constants - (CF1 && (getArgType(FInfo) == AMDGPULibFunc::F32))) - // CF1 is constant && f32 divide - { - Value *nval1 = B.CreateFDiv(ConstantFP::get(opr1->getType(), 1.0), - opr1, "__div2recip"); - Value *nval = B.CreateFMul(opr0, nval1, "__div2mul"); - replaceCall(nval); - return true; - } - return false; -} - namespace llvm { static double log2(double V) { #if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L @@ -800,81 +890,62 @@ static double log2(double V) { } } -bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, +bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo) { assert((FInfo.getId() == AMDGPULibFunc::EI_POW || FInfo.getId() == AMDGPULibFunc::EI_POWR || FInfo.getId() == AMDGPULibFunc::EI_POWN) && "fold_pow: encounter a wrong function call"); - Value *opr0, *opr1; - ConstantFP *CF; - ConstantInt *CINT; - ConstantAggregateZero *CZero; - Type *eltType; + Module *M = B.GetInsertBlock()->getModule(); + Type *eltType = FPOp->getType()->getScalarType(); + Value *opr0 = FPOp->getOperand(0); + Value *opr1 = FPOp->getOperand(1); - opr0 = CI->getArgOperand(0); - opr1 = CI->getArgOperand(1); - CZero = dyn_cast<ConstantAggregateZero>(opr1); - if (getVecSize(FInfo) == 1) { - eltType = opr0->getType(); - CF = dyn_cast<ConstantFP>(opr1); - CINT = dyn_cast<ConstantInt>(opr1); - } else { - VectorType *VTy = dyn_cast<VectorType>(opr0->getType()); - assert(VTy && "Oprand of vector function should be of vectortype"); - eltType = VTy->getElementType(); - ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr1); - - // Now, only Handle vector const whose elements have the same value. - CF = CDV ? dyn_cast_or_null<ConstantFP>(CDV->getSplatValue()) : nullptr; - CINT = CDV ? dyn_cast_or_null<ConstantInt>(CDV->getSplatValue()) : nullptr; - } - - // No unsafe math , no constant argument, do nothing - if (!isUnsafeMath(CI) && !CF && !CINT && !CZero) - return false; + const APFloat *CF = nullptr; + const APInt *CINT = nullptr; + if (!match(opr1, m_APFloatAllowUndef(CF))) + match(opr1, m_APIntAllowUndef(CINT)); // 0x1111111 means that we don't do anything for this call. int ci_opr1 = (CINT ? 
(int)CINT->getSExtValue() : 0x1111111); - if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0) || CZero) { + if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) { // pow/powr/pown(x, 0) == 1 - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n"); Constant *cnval = ConstantFP::get(eltType, 1.0); if (getVecSize(FInfo) > 1) { cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval); } - replaceCall(cnval); + replaceCall(FPOp, cnval); return true; } if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) { // pow/powr/pown(x, 1.0) = x - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n"); - replaceCall(opr0); + LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n"); + replaceCall(FPOp, opr0); return true; } if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) { // pow/powr/pown(x, 2.0) = x*x - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * " << *opr0 - << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * " + << *opr0 << "\n"); Value *nval = B.CreateFMul(opr0, opr0, "__pow2"); - replaceCall(nval); + replaceCall(FPOp, nval); return true; } if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) { // pow/powr/pown(x, -1.0) = 1.0/x - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1 / " << *opr0 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n"); Constant *cnval = ConstantFP::get(eltType, 1.0); if (getVecSize(FInfo) > 1) { cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval); } Value *nval = B.CreateFDiv(cnval, opr0, "__powrecip"); - replaceCall(nval); + replaceCall(FPOp, nval); return true; } - Module *M = CI->getModule(); if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) { // pow[r](x, [-]0.5) = sqrt(x) bool issqrt = CF->isExactlyValue(0.5); @@ -882,16 +953,16 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, getFunction(M, AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT : AMDGPULibFunc::EI_RSQRT, FInfo))) { - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << FInfo.getName().c_str() << "(" << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName() + << '(' << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt" : "__pow2rsqrt"); - replaceCall(nval); + replaceCall(FPOp, nval); return true; } } - if (!isUnsafeMath(CI)) + if (!isUnsafeFiniteOnlyMath(FPOp)) return false; // Unsafe Math optimization @@ -899,8 +970,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, // Remember that ci_opr1 is set if opr1 is integral if (CF) { double dval = (getArgType(FInfo) == AMDGPULibFunc::F32) - ? (double)CF->getValueAPF().convertToFloat() - : CF->getValueAPF().convertToDouble(); + ? (double)CF->convertToFloat() + : CF->convertToDouble(); int ival = (int)dval; if ((double)ival == dval) { ci_opr1 = ival; @@ -939,31 +1010,39 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, } nval = B.CreateFDiv(cnval, nval, "__1powprod"); } - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " + LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << ((ci_opr1 < 0) ? 
"1/prod(" : "prod(") << *opr0 << ")\n"); - replaceCall(nval); + replaceCall(FPOp, nval); return true; } + // If we should use the generic intrinsic instead of emitting a libcall + const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy(); + // powr ---> exp2(y * log2(x)) // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31)) - FunctionCallee ExpExpr = - getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo)); - if (!ExpExpr) - return false; + FunctionCallee ExpExpr; + if (ShouldUseIntrinsic) + ExpExpr = Intrinsic::getDeclaration(M, Intrinsic::exp2, {FPOp->getType()}); + else { + ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo)); + if (!ExpExpr) + return false; + } bool needlog = false; bool needabs = false; bool needcopysign = false; Constant *cnval = nullptr; if (getVecSize(FInfo) == 1) { - CF = dyn_cast<ConstantFP>(opr0); + CF = nullptr; + match(opr0, m_APFloatAllowUndef(CF)); if (CF) { double V = (getArgType(FInfo) == AMDGPULibFunc::F32) - ? (double)CF->getValueAPF().convertToFloat() - : CF->getValueAPF().convertToDouble(); + ? (double)CF->convertToFloat() + : CF->convertToDouble(); V = log2(std::abs(V)); cnval = ConstantFP::get(eltType, V); @@ -986,9 +1065,7 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, SmallVector<double, 0> DVal; for (int i=0; i < getVecSize(FInfo); ++i) { - double V = (getArgType(FInfo) == AMDGPULibFunc::F32) - ? (double)CDV->getElementAsFloat(i) - : CDV->getElementAsDouble(i); + double V = CDV->getElementAsAPFloat(i).convertToDouble(); if (V < 0.0) needcopysign = true; V = log2(std::abs(V)); DVal.push_back(V); @@ -1010,44 +1087,27 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) { // We cannot handle corner cases for a general pow() function, give up // unless y is a constant integral value. Then proceed as if it were pown. - if (getVecSize(FInfo) == 1) { - if (const ConstantFP *CF = dyn_cast<ConstantFP>(opr1)) { - double y = (getArgType(FInfo) == AMDGPULibFunc::F32) - ? (double)CF->getValueAPF().convertToFloat() - : CF->getValueAPF().convertToDouble(); - if (y != (double)(int64_t)y) - return false; - } else - return false; - } else { - if (const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr1)) { - for (int i=0; i < getVecSize(FInfo); ++i) { - double y = (getArgType(FInfo) == AMDGPULibFunc::F32) - ? (double)CDV->getElementAsFloat(i) - : CDV->getElementAsDouble(i); - if (y != (double)(int64_t)y) - return false; - } - } else - return false; - } + if (!isKnownIntegral(opr1, M->getDataLayout(), FPOp->getFastMathFlags())) + return false; } Value *nval; if (needabs) { - FunctionCallee AbsExpr = - getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_FABS, FInfo)); - if (!AbsExpr) - return false; - nval = CreateCallEx(B, AbsExpr, opr0, "__fabs"); + nval = B.CreateUnaryIntrinsic(Intrinsic::fabs, opr0, nullptr, "__fabs"); } else { nval = cnval ? 
cnval : opr0; } if (needlog) { - FunctionCallee LogExpr = - getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo)); - if (!LogExpr) - return false; + FunctionCallee LogExpr; + if (ShouldUseIntrinsic) { + LogExpr = + Intrinsic::getDeclaration(M, Intrinsic::log2, {FPOp->getType()}); + } else { + LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo)); + if (!LogExpr) + return false; + } + nval = CreateCallEx(B,LogExpr, nval, "__log2"); } @@ -1061,14 +1121,14 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, if (needcopysign) { Value *opr_n; Type* rTy = opr0->getType(); - Type* nTyS = eltType->isDoubleTy() ? B.getInt64Ty() : B.getInt32Ty(); + Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits()); Type *nTy = nTyS; if (const auto *vTy = dyn_cast<FixedVectorType>(rTy)) nTy = FixedVectorType::get(nTyS, vTy); unsigned size = nTy->getScalarSizeInBits(); - opr_n = CI->getArgOperand(1); + opr_n = FPOp->getOperand(1); if (opr_n->getType()->isIntegerTy()) - opr_n = B.CreateZExtOrBitCast(opr_n, nTy, "__ytou"); + opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou"); else opr_n = B.CreateFPToSI(opr1, nTy, "__ytou"); @@ -1078,17 +1138,21 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, nval = B.CreateBitCast(nval, opr0->getType()); } - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " + LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n"); - replaceCall(nval); + replaceCall(FPOp, nval); return true; } -bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B, +bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo) { - Value *opr0 = CI->getArgOperand(0); - Value *opr1 = CI->getArgOperand(1); + // skip vector function + if (getVecSize(FInfo) != 1) + return false; + + Value *opr0 = FPOp->getOperand(0); + Value *opr1 = FPOp->getOperand(1); ConstantInt *CINT = dyn_cast<ConstantInt>(opr1); if (!CINT) { @@ -1096,90 +1160,47 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B, } int ci_opr1 = (int)CINT->getSExtValue(); if (ci_opr1 == 1) { // rootn(x, 1) = x - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n"); - replaceCall(opr0); + LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n"); + replaceCall(FPOp, opr0); return true; } - if (ci_opr1 == 2) { // rootn(x, 2) = sqrt(x) - Module *M = CI->getModule(); + + Module *M = B.GetInsertBlock()->getModule(); + if (ci_opr1 == 2) { // rootn(x, 2) = sqrt(x) if (FunctionCallee FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) { - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *opr0 + << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt"); - replaceCall(nval); + replaceCall(FPOp, nval); return true; } } else if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x) - Module *M = CI->getModule(); if (FunctionCallee FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) { - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *opr0 + << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt"); - replaceCall(nval); + replaceCall(FPOp, nval); return true; } } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *opr0 << "\n"); Value 
*nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0), opr0, "__rootn2div"); - replaceCall(nval); + replaceCall(FPOp, nval); return true; - } else if (ci_opr1 == -2) { // rootn(x, -2) = rsqrt(x) - Module *M = CI->getModule(); + } else if (ci_opr1 == -2) { // rootn(x, -2) = rsqrt(x) if (FunctionCallee FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, FInfo))) { - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0 + LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt"); - replaceCall(nval); - return true; - } - } - return false; -} - -bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B, - const FuncInfo &FInfo) { - Value *opr0 = CI->getArgOperand(0); - Value *opr1 = CI->getArgOperand(1); - Value *opr2 = CI->getArgOperand(2); - - ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0); - ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1); - if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) { - // fma/mad(a, b, c) = c if a=0 || b=0 - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n"); - replaceCall(opr2); - return true; - } - if (CF0 && CF0->isExactlyValue(1.0f)) { - // fma/mad(a, b, c) = b+c if a=1 - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr1 << " + " << *opr2 - << "\n"); - Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd"); - replaceCall(nval); - return true; - } - if (CF1 && CF1->isExactlyValue(1.0f)) { - // fma/mad(a, b, c) = a+c if b=1 - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " + " << *opr2 - << "\n"); - Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd"); - replaceCall(nval); - return true; - } - if (ConstantFP *CF = dyn_cast<ConstantFP>(opr2)) { - if (CF->isZero()) { - // fma/mad(a, b, c) = a*b if c=0 - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * " - << *opr1 << "\n"); - Value *nval = B.CreateFMul(opr0, opr1, "fmamul"); - replaceCall(nval); + replaceCall(FPOp, nval); return true; } } - return false; } @@ -1193,185 +1214,243 @@ FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M, return getFunction(M, nf); } +// Some library calls are just wrappers around llvm intrinsics, but compiled +// conservatively. Preserve the flags from the original call site by +// substituting them with direct calls with all the flags. +bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI, + bool AllowMinSizeF32, + bool AllowF64, + bool AllowStrictFP) { + Type *FltTy = CI->getType()->getScalarType(); + const bool IsF32 = FltTy->isFloatTy(); + + // f64 intrinsics aren't implemented for most operations. + if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy())) + return false; + + // We're implicitly inlining by replacing the libcall with the intrinsic, so + // don't do it for noinline call sites. 
+ if (CI->isNoInline()) + return false; + + const Function *ParentF = CI->getFunction(); + // TODO: Handle strictfp + if (!AllowStrictFP && ParentF->hasFnAttribute(Attribute::StrictFP)) + return false; + + if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize()) + return false; + return true; +} + +void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, + CallInst *CI, + Intrinsic::ID IntrID) { + if (CI->arg_size() == 2) { + Value *Arg0 = CI->getArgOperand(0); + Value *Arg1 = CI->getArgOperand(1); + VectorType *Arg0VecTy = dyn_cast<VectorType>(Arg0->getType()); + VectorType *Arg1VecTy = dyn_cast<VectorType>(Arg1->getType()); + if (Arg0VecTy && !Arg1VecTy) { + Value *SplatRHS = B.CreateVectorSplat(Arg0VecTy->getElementCount(), Arg1); + CI->setArgOperand(1, SplatRHS); + } else if (!Arg0VecTy && Arg1VecTy) { + Value *SplatLHS = B.CreateVectorSplat(Arg1VecTy->getElementCount(), Arg0); + CI->setArgOperand(0, SplatLHS); + } + } + + CI->setCalledFunction( + Intrinsic::getDeclaration(CI->getModule(), IntrID, {CI->getType()})); +} + +bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic( + IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32, + bool AllowF64, bool AllowStrictFP) { + if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64, + AllowStrictFP)) + return false; + replaceLibCallWithSimpleIntrinsic(B, CI, IntrID); + return true; +} + // fold sqrt -> native_sqrt (x) -bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B, +bool AMDGPULibCalls::fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo) { + if (!isUnsafeMath(FPOp)) + return false; + if (getArgType(FInfo) == AMDGPULibFunc::F32 && (getVecSize(FInfo) == 1) && (FInfo.getPrefix() != AMDGPULibFunc::NATIVE)) { + Module *M = B.GetInsertBlock()->getModule(); + if (FunctionCallee FPExpr = getNativeFunction( - CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) { - Value *opr0 = CI->getArgOperand(0); - LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " + M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) { + Value *opr0 = FPOp->getOperand(0); + LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << "sqrt(" << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__sqrt"); - replaceCall(nval); + replaceCall(FPOp, nval); return true; } } return false; } -// fold sin, cos -> sincos. -bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B, - AliasAnalysis *AA) { - AMDGPULibFunc fInfo; - if (!AMDGPULibFunc::parse(CI->getCalledFunction()->getName(), fInfo)) - return false; +std::tuple<Value *, Value *, Value *> +AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B, + FunctionCallee Fsincos) { + DebugLoc DL = B.getCurrentDebugLocation(); + Function *F = B.GetInsertBlock()->getParent(); + B.SetInsertPointPastAllocas(F); + AllocaInst *Alloc = B.CreateAlloca(Arg->getType(), nullptr, "__sincos_"); + + if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) { + // If the argument is an instruction, it must dominate all uses so put our + // sincos call there. Otherwise, right after the allocas works well enough + // if it's an argument or constant. + + B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator()); + + // SetInsertPoint unwelcomely always tries to set the debug loc. + B.SetCurrentDebugLocation(DL); + } + + Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(1); + + // The allocaInst allocates the memory in private address space. This need + // to be addrspacecasted to point to the address space of cos pointer type. 
+ // In OpenCL 2.0 this is generic, while in 1.2 that is private. + Value *CastAlloc = B.CreateAddrSpaceCast(Alloc, CosPtrTy); + + CallInst *SinCos = CreateCallEx2(B, Fsincos, Arg, CastAlloc); + + // TODO: Is it worth trying to preserve the location for the cos calls for the + // load? + + LoadInst *LoadCos = B.CreateLoad(Alloc->getAllocatedType(), Alloc); + return {SinCos, LoadCos, SinCos}; +} + +// fold sin, cos -> sincos. +bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, + const FuncInfo &fInfo) { assert(fInfo.getId() == AMDGPULibFunc::EI_SIN || fInfo.getId() == AMDGPULibFunc::EI_COS); + + if ((getArgType(fInfo) != AMDGPULibFunc::F32 && + getArgType(fInfo) != AMDGPULibFunc::F64) || + fInfo.getPrefix() != AMDGPULibFunc::NOPFX) + return false; + bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN; - Value *CArgVal = CI->getArgOperand(0); - BasicBlock * const CBB = CI->getParent(); + Value *CArgVal = FPOp->getOperand(0); + CallInst *CI = cast<CallInst>(FPOp); - int const MaxScan = 30; - bool Changed = false; + Function *F = B.GetInsertBlock()->getParent(); + Module *M = F->getParent(); - { // fold in load value. - LoadInst *LI = dyn_cast<LoadInst>(CArgVal); - if (LI && LI->getParent() == CBB) { - BasicBlock::iterator BBI = LI->getIterator(); - Value *AvailableVal = FindAvailableLoadedValue(LI, CBB, BBI, MaxScan, AA); - if (AvailableVal) { - Changed = true; - CArgVal->replaceAllUsesWith(AvailableVal); - if (CArgVal->getNumUses() == 0) - LI->eraseFromParent(); - CArgVal = CI->getArgOperand(0); - } - } - } + // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer + // implementation. Prefer the private form if available. + AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo); + SinCosLibFuncPrivate.getLeads()[0].PtrKind = + AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::PRIVATE_ADDRESS); - Module *M = CI->getModule(); - fInfo.setId(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN); - std::string const PairName = fInfo.mangle(); + AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo); + SinCosLibFuncGeneric.getLeads()[0].PtrKind = + AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS); + + FunctionCallee FSinCosPrivate = getFunction(M, SinCosLibFuncPrivate); + FunctionCallee FSinCosGeneric = getFunction(M, SinCosLibFuncGeneric); + FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric; + if (!FSinCos) + return false; + + SmallVector<CallInst *> SinCalls; + SmallVector<CallInst *> CosCalls; + SmallVector<CallInst *> SinCosCalls; + FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN, + fInfo); + const std::string PairName = PartnerInfo.mangle(); + + StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName; + StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName(); + const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle(); + const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle(); + + // Intersect the two sets of flags. 
+ FastMathFlags FMF = FPOp->getFastMathFlags(); + MDNode *FPMath = CI->getMetadata(LLVMContext::MD_fpmath); + + SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()}; - CallInst *UI = nullptr; for (User* U : CArgVal->users()) { - CallInst *XI = dyn_cast_or_null<CallInst>(U); - if (!XI || XI == CI || XI->getParent() != CBB) + CallInst *XI = dyn_cast<CallInst>(U); + if (!XI || XI->getFunction() != F || XI->isNoBuiltin()) continue; Function *UCallee = XI->getCalledFunction(); - if (!UCallee || !UCallee->getName().equals(PairName)) + if (!UCallee) continue; - BasicBlock::iterator BBI = CI->getIterator(); - if (BBI == CI->getParent()->begin()) - break; - --BBI; - for (int I = MaxScan; I > 0 && BBI != CBB->begin(); --BBI, --I) { - if (cast<Instruction>(BBI) == XI) { - UI = XI; - break; - } + bool Handled = true; + + if (UCallee->getName() == SinName) + SinCalls.push_back(XI); + else if (UCallee->getName() == CosName) + CosCalls.push_back(XI); + else if (UCallee->getName() == SinCosPrivateName || + UCallee->getName() == SinCosGenericName) + SinCosCalls.push_back(XI); + else + Handled = false; + + if (Handled) { + MergeDbgLocs.push_back(XI->getDebugLoc()); + auto *OtherOp = cast<FPMathOperator>(XI); + FMF &= OtherOp->getFastMathFlags(); + FPMath = MDNode::getMostGenericFPMath( + FPMath, XI->getMetadata(LLVMContext::MD_fpmath)); } - if (UI) break; } - if (!UI) - return Changed; - - // Merge the sin and cos. + if (SinCalls.empty() || CosCalls.empty()) + return false; - // for OpenCL 2.0 we have only generic implementation of sincos - // function. - AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo); - nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS); - FunctionCallee Fsincos = getFunction(M, nf); - if (!Fsincos) - return Changed; + B.setFastMathFlags(FMF); + B.setDefaultFPMathTag(FPMath); + DILocation *DbgLoc = DILocation::getMergedLocations(MergeDbgLocs); + B.SetCurrentDebugLocation(DbgLoc); - BasicBlock::iterator ItOld = B.GetInsertPoint(); - AllocaInst *Alloc = insertAlloca(UI, B, "__sincos_"); - B.SetInsertPoint(UI); + auto [Sin, Cos, SinCos] = insertSinCos(CArgVal, FMF, B, FSinCos); - Value *P = Alloc; - Type *PTy = Fsincos.getFunctionType()->getParamType(1); - // The allocaInst allocates the memory in private address space. This need - // to be bitcasted to point to the address space of cos pointer type. - // In OpenCL 2.0 this is generic, while in 1.2 that is private. 
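// Editor's aside (not part of the patch): approximate shape of the merged
// result produced by insertSinCos/fold_sincos above for paired f32 sin and cos
// calls on the same argument (mangled names and exact IR are illustrative):
//   %s = call fast float @_Z3sinf(float %x)
//   %c = call fast float @_Z3cosf(float %x)
//     ==>
//   %cosptr = alloca float, addrspace(5)
//   %s = call fast float @sincos(float %x, ptr addrspace(5) %cosptr)
//   %c = load float, ptr addrspace(5) %cosptr
// with the fast-math flags intersected across all of the merged calls.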
- if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) - P = B.CreateAddrSpaceCast(Alloc, PTy); - CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P); - - LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI << ") with " - << *Call << "\n"); - - if (!isSin) { // CI->cos, UI->sin - B.SetInsertPoint(&*ItOld); - UI->replaceAllUsesWith(&*Call); - Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc); - CI->replaceAllUsesWith(Reload); - UI->eraseFromParent(); - CI->eraseFromParent(); - } else { // CI->sin, UI->cos - Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc); - UI->replaceAllUsesWith(Reload); - CI->replaceAllUsesWith(Call); - UI->eraseFromParent(); - CI->eraseFromParent(); - } - return true; -} - -bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) { - if (!TM) - return false; + auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) { + for (CallInst *C : Calls) + C->replaceAllUsesWith(Res); - StringRef CPU = TM->getTargetCPU(); - StringRef Features = TM->getTargetFeatureString(); - if ((CPU.empty() || CPU.equals_insensitive("generic")) && - (Features.empty() || !Features.contains_insensitive("wavefrontsize"))) - return false; - - Function *F = CI->getParent()->getParent(); - const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(*F); - unsigned N = ST.getWavefrontSize(); + // Leave the other dead instructions to avoid clobbering iterators. + }; - LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with " - << N << "\n"); + replaceTrigInsts(SinCalls, Sin); + replaceTrigInsts(CosCalls, Cos); + replaceTrigInsts(SinCosCalls, SinCos); - CI->replaceAllUsesWith(ConstantInt::get(B.getInt32Ty(), N)); + // It's safe to delete the original now. CI->eraseFromParent(); return true; } -// Get insertion point at entry. -BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) { - Function * Func = UI->getParent()->getParent(); - BasicBlock * BB = &Func->getEntryBlock(); - assert(BB && "Entry block not found!"); - BasicBlock::iterator ItNew = BB->begin(); - return ItNew; -} - -// Insert a AllocsInst at the beginning of function entry block. -AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B, - const char *prefix) { - BasicBlock::iterator ItNew = getEntryIns(UI); - Function *UCallee = UI->getCalledFunction(); - Type *RetType = UCallee->getReturnType(); - B.SetInsertPoint(&*ItNew); - AllocaInst *Alloc = - B.CreateAlloca(RetType, nullptr, std::string(prefix) + UI->getName()); - Alloc->setAlignment( - Align(UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType))); - return Alloc; -} - -bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, - double& Res0, double& Res1, - Constant *copr0, Constant *copr1, - Constant *copr2) { +bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, + double &Res1, Constant *copr0, + Constant *copr1) { // By default, opr0/opr1/opr3 holds values of float/double type. // If they are not float/double, each function has to its // operand separately. - double opr0=0.0, opr1=0.0, opr2=0.0; + double opr0 = 0.0, opr1 = 0.0; ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0); ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1); - ConstantFP *fpopr2 = dyn_cast_or_null<ConstantFP>(copr2); if (fpopr0) { opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64) ? 
fpopr0->getValueAPF().convertToDouble() @@ -1384,12 +1463,6 @@ bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, : (double)fpopr1->getValueAPF().convertToFloat(); } - if (fpopr2) { - opr2 = (getArgType(FInfo) == AMDGPULibFunc::F64) - ? fpopr2->getValueAPF().convertToDouble() - : (double)fpopr2->getValueAPF().convertToFloat(); - } - switch (FInfo.getId()) { default : return false; @@ -1460,10 +1533,6 @@ bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, Res0 = pow(10.0, opr0); return true; - case AMDGPULibFunc::EI_EXPM1: - Res0 = exp(opr0) - 1.0; - return true; - case AMDGPULibFunc::EI_LOG: Res0 = log(opr0); return true; @@ -1492,10 +1561,6 @@ bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, Res0 = sin(MATH_PI * opr0); return true; - case AMDGPULibFunc::EI_SQRT: - Res0 = sqrt(opr0); - return true; - case AMDGPULibFunc::EI_TAN: Res0 = tan(opr0); return true; @@ -1508,15 +1573,7 @@ bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, Res0 = tan(MATH_PI * opr0); return true; - case AMDGPULibFunc::EI_RECIP: - Res0 = 1.0 / opr0; - return true; - // two-arg functions - case AMDGPULibFunc::EI_DIVIDE: - Res0 = opr0 / opr1; - return true; - case AMDGPULibFunc::EI_POW: case AMDGPULibFunc::EI_POWR: Res0 = pow(opr0, opr1); @@ -1545,12 +1602,6 @@ bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, Res0 = sin(opr0); Res1 = cos(opr0); return true; - - // three-arg functions - case AMDGPULibFunc::EI_FMA: - case AMDGPULibFunc::EI_MAD: - Res0 = opr0 * opr1 + opr2; - return true; } return false; @@ -1563,7 +1614,6 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { Constant *copr0 = nullptr; Constant *copr1 = nullptr; - Constant *copr2 = nullptr; if (numArgs > 0) { if ((copr0 = dyn_cast<Constant>(aCI->getArgOperand(0))) == nullptr) return false; @@ -1576,11 +1626,6 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { } } - if (numArgs > 2) { - if ((copr2 = dyn_cast<Constant>(aCI->getArgOperand(2))) == nullptr) - return false; - } - // At this point, all arguments to aCI are constants. // max vector size is 16, and sincos will generate two results. @@ -1588,31 +1633,27 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { int FuncVecSize = getVecSize(FInfo); bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS); if (FuncVecSize == 1) { - if (!evaluateScalarMathFunc(FInfo, DVal0[0], - DVal1[0], copr0, copr1, copr2)) { + if (!evaluateScalarMathFunc(FInfo, DVal0[0], DVal1[0], copr0, copr1)) { return false; } } else { ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(copr0); ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(copr1); - ConstantDataVector *CDV2 = dyn_cast_or_null<ConstantDataVector>(copr2); for (int i = 0; i < FuncVecSize; ++i) { Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr; Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr; - Constant *celt2 = CDV2 ? 
CDV2->getElementAsConstant(i) : nullptr; - if (!evaluateScalarMathFunc(FInfo, DVal0[i], - DVal1[i], celt0, celt1, celt2)) { + if (!evaluateScalarMathFunc(FInfo, DVal0[i], DVal1[i], celt0, celt1)) { return false; } } } - LLVMContext &context = CI->getParent()->getParent()->getContext(); + LLVMContext &context = aCI->getContext(); Constant *nval0, *nval1; if (FuncVecSize == 1) { - nval0 = ConstantFP::get(CI->getType(), DVal0[0]); + nval0 = ConstantFP::get(aCI->getType(), DVal0[0]); if (hasTwoResults) - nval1 = ConstantFP::get(CI->getType(), DVal1[0]); + nval1 = ConstantFP::get(aCI->getType(), DVal1[0]); } else { if (getArgType(FInfo) == AMDGPULibFunc::F32) { SmallVector <float, 0> FVal0, FVal1; @@ -1643,59 +1684,17 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { new StoreInst(nval1, aCI->getArgOperand(1), aCI); } - replaceCall(nval0); + replaceCall(aCI, nval0); return true; } -// Public interface to the Simplify LibCalls pass. -FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetMachine *TM) { - return new AMDGPUSimplifyLibCalls(TM); -} - -FunctionPass *llvm::createAMDGPUUseNativeCallsPass() { - return new AMDGPUUseNativeCalls(); -} - -bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - bool Changed = false; - auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - - LLVM_DEBUG(dbgs() << "AMDIC: process function "; - F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';); - - for (auto &BB : F) { - for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) { - // Ignore non-calls. - CallInst *CI = dyn_cast<CallInst>(I); - ++I; - // Ignore intrinsics that do not become real instructions. - if (!CI || isa<DbgInfoIntrinsic>(CI) || CI->isLifetimeStartOrEnd()) - continue; - - // Ignore indirect calls. - Function *Callee = CI->getCalledFunction(); - if (Callee == nullptr) - continue; - - LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n"; - dbgs().flush()); - if(Simplifier.fold(CI, AA)) - Changed = true; - } - } - return Changed; -} - PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F, FunctionAnalysisManager &AM) { - AMDGPULibCalls Simplifier(&TM); + AMDGPULibCalls Simplifier; Simplifier.initNativeFuncs(); + Simplifier.initFunction(F, AM); bool Changed = false; - auto AA = &AM.getResult<AAManager>(F); LLVM_DEBUG(dbgs() << "AMDIC: process function "; F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';); @@ -1705,48 +1704,16 @@ PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F, // Ignore non-calls. CallInst *CI = dyn_cast<CallInst>(I); ++I; - // Ignore intrinsics that do not become real instructions. - if (!CI || isa<DbgInfoIntrinsic>(CI) || CI->isLifetimeStartOrEnd()) - continue; - // Ignore indirect calls. - Function *Callee = CI->getCalledFunction(); - if (Callee == nullptr) - continue; - - LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n"; - dbgs().flush()); - if (Simplifier.fold(CI, AA)) - Changed = true; + if (CI) { + if (Simplifier.fold(CI)) + Changed = true; + } } } return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } -bool AMDGPUUseNativeCalls::runOnFunction(Function &F) { - if (skipFunction(F) || UseNative.empty()) - return false; - - bool Changed = false; - for (auto &BB : F) { - for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) { - // Ignore non-calls. - CallInst *CI = dyn_cast<CallInst>(I); - ++I; - if (!CI) continue; - - // Ignore indirect calls. 
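// Editor's aside (not part of the patch): with these legacy wrappers deleted,
// the folds are exercised through the new pass manager only; assuming the pass
// names registered for the target are unchanged, that is roughly:
//   opt -passes=amdgpu-simplifylib -S in.ll -o out.ll
//   opt -passes=amdgpu-usenative   -S in.ll -o out.ll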
- Function *Callee = CI->getCalledFunction(); - if (Callee == nullptr) - continue; - - if (Simplifier.useNative(CI)) - Changed = true; - } - } - return Changed; -} - PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F, FunctionAnalysisManager &AM) { if (UseNative.empty()) @@ -1754,6 +1721,7 @@ PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F, AMDGPULibCalls Simplifier; Simplifier.initNativeFuncs(); + Simplifier.initFunction(F, AM); bool Changed = false; for (auto &BB : F) { @@ -1761,15 +1729,7 @@ PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F, // Ignore non-calls. CallInst *CI = dyn_cast<CallInst>(I); ++I; - if (!CI) - continue; - - // Ignore indirect calls. - Function *Callee = CI->getCalledFunction(); - if (Callee == nullptr) - continue; - - if (Simplifier.useNative(CI)) + if (CI && Simplifier.useNative(CI)) Changed = true; } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp index 169a242d74e4..3437b6dc8ae0 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -478,7 +478,7 @@ static bool eatTerm(StringRef& mangledName, const char c) { template <size_t N> static bool eatTerm(StringRef& mangledName, const char (&str)[N]) { - if (mangledName.startswith(StringRef(str, N-1))) { + if (mangledName.starts_with(StringRef(str, N - 1))) { drop_front(mangledName, N-1); return true; } @@ -527,6 +527,16 @@ AMDGPUMangledLibFunc::AMDGPUMangledLibFunc( Leads[1] = copyFrom.Leads[1]; } +AMDGPUMangledLibFunc::AMDGPUMangledLibFunc(EFuncId id, FunctionType *FT, + bool SignedInts) { + FuncId = id; + unsigned NumArgs = FT->getNumParams(); + if (NumArgs >= 1) + Leads[0] = Param::getFromTy(FT->getParamType(0), SignedInts); + if (NumArgs >= 2) + Leads[1] = Param::getFromTy(FT->getParamType(1), SignedInts); +} + /////////////////////////////////////////////////////////////////////////////// // Demangling @@ -875,6 +885,50 @@ std::string AMDGPUMangledLibFunc::mangleNameItanium() const { /////////////////////////////////////////////////////////////////////////////// // Misc +AMDGPULibFuncBase::Param AMDGPULibFuncBase::Param::getFromTy(Type *Ty, + bool Signed) { + Param P; + if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) { + P.VectorSize = VT->getNumElements(); + Ty = VT->getElementType(); + } + + switch (Ty->getTypeID()) { + case Type::FloatTyID: + P.ArgType = AMDGPULibFunc::F32; + break; + case Type::DoubleTyID: + P.ArgType = AMDGPULibFunc::F64; + break; + case Type::HalfTyID: + P.ArgType = AMDGPULibFunc::F16; + break; + case Type::IntegerTyID: + switch (cast<IntegerType>(Ty)->getBitWidth()) { + case 8: + P.ArgType = Signed ? AMDGPULibFunc::I8 : AMDGPULibFunc::U8; + break; + case 16: + P.ArgType = Signed ? AMDGPULibFunc::I16 : AMDGPULibFunc::U16; + break; + case 32: + P.ArgType = Signed ? AMDGPULibFunc::I32 : AMDGPULibFunc::U32; + break; + case 64: + P.ArgType = Signed ? 
AMDGPULibFunc::I64 : AMDGPULibFunc::U64; + break; + default: + llvm_unreachable("unhandled libcall argument type"); + } + + break; + default: + llvm_unreachable("unhandled libcall argument type"); + } + + return P; +} + static Type* getIntrinsicParamType( LLVMContext& C, const AMDGPULibFunc::Param& P, @@ -945,18 +999,25 @@ std::string AMDGPUMangledLibFunc::getName() const { return std::string(OS.str()); } +bool AMDGPULibFunc::isCompatibleSignature(const FunctionType *FuncTy) const { + // TODO: Validate types make sense + return !FuncTy->isVarArg() && FuncTy->getNumParams() == getNumArgs(); +} + Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc &fInfo) { std::string FuncName = fInfo.mangle(); Function *F = dyn_cast_or_null<Function>( M->getValueSymbolTable().lookup(FuncName)); + if (!F || F->isDeclaration()) + return nullptr; - // check formal with actual types conformance - if (F && !F->isDeclaration() - && !F->isVarArg() - && F->arg_size() == fInfo.getNumArgs()) { - return F; - } - return nullptr; + if (F->hasFnAttribute(Attribute::NoBuiltin)) + return nullptr; + + if (!fInfo.isCompatibleSignature(F->getFunctionType())) + return nullptr; + + return F; } FunctionCallee AMDGPULibFunc::getOrInsertFunction(Module *M, @@ -965,11 +1026,12 @@ FunctionCallee AMDGPULibFunc::getOrInsertFunction(Module *M, Function *F = dyn_cast_or_null<Function>( M->getValueSymbolTable().lookup(FuncName)); - // check formal with actual types conformance - if (F && !F->isDeclaration() - && !F->isVarArg() - && F->arg_size() == fInfo.getNumArgs()) { - return F; + if (F) { + if (F->hasFnAttribute(Attribute::NoBuiltin)) + return nullptr; + if (!F->isDeclaration() && + fInfo.isCompatibleSignature(F->getFunctionType())) + return F; } FunctionType *FuncTy = fInfo.getFunctionType(*M); @@ -1043,6 +1105,10 @@ AMDGPULibFunc::AMDGPULibFunc(EFuncId Id, const AMDGPULibFunc &CopyFrom) { Id, *cast<AMDGPUMangledLibFunc>(CopyFrom.Impl.get()))); } +AMDGPULibFunc::AMDGPULibFunc(EFuncId Id, FunctionType *FT, bool SignedInts) { + Impl.reset(new AMDGPUMangledLibFunc(Id, FT, SignedInts)); +} + AMDGPULibFunc::AMDGPULibFunc(StringRef Name, FunctionType *FT) { Impl.reset(new AMDGPUUnmangledLibFunc(Name, FT)); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h index bf0fda25b2c0..10551bee3fa8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h @@ -18,6 +18,7 @@ class FunctionCallee; class FunctionType; class Function; class Module; +class Type; class AMDGPULibFuncBase { public: @@ -290,18 +291,23 @@ public: }; struct Param { - unsigned char ArgType; - unsigned char VectorSize; - unsigned char PtrKind; + unsigned char ArgType = 0; + unsigned char VectorSize = 1; + unsigned char PtrKind = 0; - unsigned char Reserved; + unsigned char Reserved = 0; void reset() { ArgType = 0; VectorSize = 1; PtrKind = 0; } - Param() { reset(); } + + static Param getIntN(unsigned char NumElts) { + return Param{I32, NumElts, 0, 0}; + } + + static Param getFromTy(Type *Ty, bool Signed); template <typename Stream> void mangleItanium(Stream& os); @@ -351,7 +357,7 @@ public: protected: EFuncId FuncId; std::string Name; - ENamePrefix FKind; + ENamePrefix FKind = NOPFX; }; /// Wrapper class for AMDGPULIbFuncImpl @@ -362,6 +368,8 @@ public: /// Clone a mangled library func with the Id \p Id and argument info from \p /// CopyFrom. 
   explicit AMDGPULibFunc(EFuncId Id, const AMDGPULibFunc &CopyFrom);
+  explicit AMDGPULibFunc(EFuncId Id, FunctionType *FT, bool SignedInts);
+
   /// Construct an unmangled library function on the fly.
   explicit AMDGPULibFunc(StringRef FName, FunctionType *FT);
 
@@ -383,6 +391,9 @@ public:
     return Impl->parseFuncName(MangledName);
   }
 
+  // Validate the call type matches the expected libfunc type.
+  bool isCompatibleSignature(const FunctionType *FuncTy) const;
+
   /// \return The mangled function name for mangled library functions
   /// and unmangled function name for unmangled library functions.
   std::string mangle() const { return Impl->mangle(); }
@@ -412,6 +423,8 @@ public:
   explicit AMDGPUMangledLibFunc();
   explicit AMDGPUMangledLibFunc(EFuncId id,
                                 const AMDGPUMangledLibFunc &copyFrom);
+  explicit AMDGPUMangledLibFunc(EFuncId id, FunctionType *FT,
+                                bool SignedInts = true);
 
   std::string getName() const override;
   unsigned getNumArgs() const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index f5323725250f..c32303defe7f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -14,17 +14,59 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Target/TargetMachine.h"
+
 #define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
 
 using namespace llvm;
 
 namespace {
 
-class AMDGPULowerKernelArguments : public FunctionPass{
+class PreloadKernelArgInfo {
+private:
+  Function &F;
+  const GCNSubtarget &ST;
+  unsigned NumFreeUserSGPRs;
+
+public:
+  SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
+
+  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
+    setInitialFreeUserSGPRsCount();
+  }
+
+  // Returns the maximum number of user SGPRs that we have available to preload
+  // arguments.
+  void setInitialFreeUserSGPRsCount() {
+    const unsigned MaxUserSGPRs = ST.getMaxNumUserSGPRs();
+    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
+
+    NumFreeUserSGPRs = MaxUserSGPRs - UserSGPRInfo.getNumUsedUserSGPRs();
+  }
+
+  bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
+                            uint64_t LastExplicitArgOffset) {
+    // Check if this argument may be loaded into the same register as the
+    // previous argument.
+    if (!isAligned(Align(4), ArgOffset) && AllocSize < 4)
+      return true;
+
+    // Pad SGPRs for kernarg alignment.
+ unsigned Padding = ArgOffset - LastExplicitArgOffset; + unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; + unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4; + if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs) + return false; + + NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs); + return true; + } +}; + +class AMDGPULowerKernelArguments : public FunctionPass { public: static char ID; @@ -55,14 +97,11 @@ static BasicBlock::iterator getInsertPt(BasicBlock &BB) { return InsPt; } -bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { +static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty()) return false; - auto &TPC = getAnalysis<TargetPassConfig>(); - - const TargetMachine &TM = TPC.getTM<TargetMachine>(); const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); LLVMContext &Ctx = F.getParent()->getContext(); const DataLayout &DL = F.getParent()->getDataLayout(); @@ -87,6 +126,9 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); uint64_t ExplicitArgOffset = 0; + // Preloaded kernel arguments must be sequential. + bool InPreloadSequence = true; + PreloadKernelArgInfo PreloadInfo(F, ST); for (Argument &Arg : F.args()) { const bool IsByRef = Arg.hasByRefAttr(); @@ -98,8 +140,19 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset; + uint64_t LastExplicitArgOffset = ExplicitArgOffset; ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; + // Try to preload this argument into user SGPRs. + if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() && + !ST.needsKernargPreloadBackwardsCompatibility() && + !Arg.getType()->isAggregateType()) + if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset, + LastExplicitArgOffset)) + continue; + + InPreloadSequence = false; + if (Arg.use_empty()) continue; @@ -232,6 +285,12 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { return true; } +bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { + auto &TPC = getAnalysis<TargetPassConfig>(); + const TargetMachine &TM = TPC.getTM<TargetMachine>(); + return lowerKernelArguments(F, TM); +} + INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments", false, false) INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments", @@ -242,3 +301,16 @@ char AMDGPULowerKernelArguments::ID = 0; FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() { return new AMDGPULowerKernelArguments(); } + +PreservedAnalyses +AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) { + bool Changed = lowerKernelArguments(F, TM); + if (Changed) { + // TODO: Preserves a lot more. 
+ PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; + } + + return PreservedAnalyses::all(); +} diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index 26074cf06071..097722157d41 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -14,6 +14,7 @@ #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -286,8 +287,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { if (HasReqdWorkGroupSize) { ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I)); - UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast( - KnownSize, UMin->getType(), false)); + UMin->replaceAllUsesWith(ConstantFoldIntegerCast( + KnownSize, UMin->getType(), false, DL)); } else { UMin->replaceAllUsesWith(ZextGroupSize); } @@ -310,7 +311,7 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I)); GroupSize->replaceAllUsesWith( - ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false)); + ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL)); MadeChange = true; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index e3a645977f92..d2a02143e4e7 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -177,6 +177,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDGPUMemoryUtils.h" #include "llvm/ADT/BitVector.h" @@ -184,8 +185,8 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" -#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/CallGraph.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" @@ -204,7 +205,6 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include <tuple> #include <vector> #include <cstdio> @@ -252,7 +252,8 @@ template <typename T> std::vector<T> sortByName(std::vector<T> &&V) { return {std::move(V)}; } -class AMDGPULowerModuleLDS : public ModulePass { +class AMDGPULowerModuleLDS { + const AMDGPUTargetMachine &TM; static void removeLocalVarsFromUsedLists(Module &M, @@ -291,7 +292,8 @@ class AMDGPULowerModuleLDS : public ModulePass { // equivalent target specific intrinsic which lasts until immediately after // codegen would suffice for that, but one would still need to ensure that // the variables are allocated in the anticpated order. 
- IRBuilder<> Builder(Func->getEntryBlock().getFirstNonPHI()); + BasicBlock *Entry = &Func->getEntryBlock(); + IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt()); Function *Decl = Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {}); @@ -326,11 +328,7 @@ class AMDGPULowerModuleLDS : public ModulePass { } public: - static char ID; - - AMDGPULowerModuleLDS() : ModulePass(ID) { - initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry()); - } + AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {} using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>; @@ -854,7 +852,7 @@ public: appendToCompilerUsed(M, {static_cast<GlobalValue *>( ConstantExpr::getPointerBitCastOrAddrSpaceCast( cast<Constant>(ModuleScopeReplacement.SGV), - Type::getInt8PtrTy(Ctx)))}); + PointerType::getUnqual(Ctx)))}); // module.lds will be allocated at zero in any kernel that allocates it recordLDSAbsoluteAddress(&M, ModuleScopeReplacement.SGV, 0); @@ -1089,7 +1087,7 @@ public: return KernelToCreatedDynamicLDS; } - bool runOnModule(Module &M) override { + bool runOnModule(Module &M) { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -1241,6 +1239,7 @@ public: } if (Offset != 0) { + (void)TM; // TODO: Account for target maximum LDS std::string Buffer; raw_string_ostream SS{Buffer}; SS << format("%u", Offset); @@ -1367,9 +1366,9 @@ private: Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding); LocalVars.push_back(new GlobalVariable( - M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy), - "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, - false)); + M, ATy, false, GlobalValue::InternalLinkage, + PoisonValue::get(ATy), "", nullptr, GlobalValue::NotThreadLocal, + AMDGPUAS::LOCAL_ADDRESS, false)); IsPaddingField.push_back(true); CurrentOffset += Padding; } @@ -1391,7 +1390,7 @@ private: Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]); GlobalVariable *SGV = new GlobalVariable( - M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy), + M, LDSTy, false, GlobalValue::InternalLinkage, PoisonValue::get(LDSTy), VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false); SGV->setAlignment(StructAlign); @@ -1530,21 +1529,51 @@ private: } }; +class AMDGPULowerModuleLDSLegacy : public ModulePass { +public: + const AMDGPUTargetMachine *TM; + static char ID; + + AMDGPULowerModuleLDSLegacy(const AMDGPUTargetMachine *TM_ = nullptr) + : ModulePass(ID), TM(TM_) { + initializeAMDGPULowerModuleLDSLegacyPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + if (!TM) + AU.addRequired<TargetPassConfig>(); + } + + bool runOnModule(Module &M) override { + if (!TM) { + auto &TPC = getAnalysis<TargetPassConfig>(); + TM = &TPC.getTM<AMDGPUTargetMachine>(); + } + + return AMDGPULowerModuleLDS(*TM).runOnModule(M); + } +}; + } // namespace -char AMDGPULowerModuleLDS::ID = 0; +char AMDGPULowerModuleLDSLegacy::ID = 0; -char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID; +char &llvm::AMDGPULowerModuleLDSLegacyPassID = AMDGPULowerModuleLDSLegacy::ID; -INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE, - "Lower uses of LDS variables from non-kernel functions", false, - false) +INITIALIZE_PASS_BEGIN(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE, + "Lower uses of LDS variables from non-kernel functions", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE, + "Lower uses of LDS 
variables from non-kernel functions", + false, false) -ModulePass *llvm::createAMDGPULowerModuleLDSPass() { - return new AMDGPULowerModuleLDS(); +ModulePass * +llvm::createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM) { + return new AMDGPULowerModuleLDSLegacy(TM); } PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M, ModuleAnalysisManager &) { - return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none() - : PreservedAnalyses::all(); + return AMDGPULowerModuleLDS(TM).runOnModule(M) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 44bbfe6f13d9..323462e60a29 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -24,6 +24,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F, : IsEntryFunction(AMDGPU::isEntryFunctionCC(F.getCallingConv())), IsModuleEntryFunction( AMDGPU::isModuleEntryFunctionCC(F.getCallingConv())), + IsChainFunction(AMDGPU::isChainCC(F.getCallingConv())), NoSignedZerosFPMath(false) { // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 5780fa64a7e4..248ee26a47eb 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -20,7 +20,6 @@ namespace llvm { class AMDGPUSubtarget; -class GCNSubtarget; class AMDGPUMachineFunction : public MachineFunctionInfo { /// A map to keep track of local memory objects and their offsets within the @@ -54,6 +53,9 @@ protected: // Entry points called by other functions instead of directly by the hardware. bool IsModuleEntryFunction = false; + // Functions with the amdgpu_cs_chain or amdgpu_cs_chain_preserve CC. + bool IsChainFunction = false; + bool NoSignedZerosFPMath = false; // Function may be memory bound. @@ -85,6 +87,13 @@ public: bool isModuleEntryFunction() const { return IsModuleEntryFunction; } + bool isChainFunction() const { return IsChainFunction; } + + // The stack is empty upon entry to this function. 
+ bool isBottomOfStack() const { + return isEntryFunction() || isChainFunction(); + } + bool hasNoSignedZerosFPMath() const { return NoSignedZerosFPMath; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp index c15c94ee17f8..0cbabf3895a6 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp @@ -59,7 +59,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_, namespace llvm { -std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation () { +std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation() { return createMacroFusionDAGMutation(shouldScheduleAdjacent); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index 2092707c8a3f..4f5ca08b46c1 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -92,9 +92,9 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { auto RuntimeHandle = (F.getName() + ".runtime_handle").str(); if (!HandleTy) { Type *Int32 = Type::getInt32Ty(C); - HandleTy = StructType::create( - C, {Type::getInt8Ty(C)->getPointerTo(0), Int32, Int32}, - "block.runtime.handle.t"); + HandleTy = + StructType::create(C, {PointerType::getUnqual(C), Int32, Int32}, + "block.runtime.handle.t"); } auto *GV = new GlobalVariable( diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index 536fb02cb4ec..7b18e1f805d8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -19,9 +19,9 @@ #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" -#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h" #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -42,27 +42,26 @@ namespace { #include "AMDGPUGenPostLegalizeGICombiner.inc" #undef GET_GICOMBINER_TYPES -class AMDGPUPostLegalizerCombinerImpl : public GIMatchTableExecutor { +class AMDGPUPostLegalizerCombinerImpl : public Combiner { protected: const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig; - - MachineIRBuilder &B; - MachineFunction &MF; - MachineRegisterInfo &MRI; const GCNSubtarget &STI; const SIInstrInfo &TII; - AMDGPUCombinerHelper &Helper; - GISelChangeObserver &Observer; + // TODO: Make CombinerHelper methods const. 
+ mutable AMDGPUCombinerHelper Helper; public: AMDGPUPostLegalizerCombinerImpl( + MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, + GISelKnownBits &KB, GISelCSEInfo *CSEInfo, const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig, - MachineIRBuilder &B, AMDGPUCombinerHelper &Helper, - GISelChangeObserver &Observer); + const GCNSubtarget &STI, MachineDominatorTree *MDT, + const LegalizerInfo *LI); static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; } - bool tryCombineAll(MachineInstr &I) const; + bool tryCombineAllImpl(MachineInstr &I) const; + bool tryCombineAll(MachineInstr &I) const override; struct FMinFMaxLegacyInfo { Register LHS; @@ -120,18 +119,36 @@ private: #undef GET_GICOMBINER_IMPL AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl( + MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, + GISelKnownBits &KB, GISelCSEInfo *CSEInfo, const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig, - MachineIRBuilder &B, AMDGPUCombinerHelper &Helper, - GISelChangeObserver &Observer) - : RuleConfig(RuleConfig), B(B), MF(B.getMF()), MRI(*B.getMRI()), - STI(MF.getSubtarget<GCNSubtarget>()), TII(*STI.getInstrInfo()), - Helper(Helper), Observer(Observer), + const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) + : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI), + TII(*STI.getInstrInfo()), + Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI), #define GET_GICOMBINER_CONSTRUCTOR_INITS #include "AMDGPUGenPostLegalizeGICombiner.inc" #undef GET_GICOMBINER_CONSTRUCTOR_INITS { } +bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { + if (tryCombineAllImpl(MI)) + return true; + + switch (MI.getOpcode()) { + case TargetOpcode::G_SHL: + case TargetOpcode::G_LSHR: + case TargetOpcode::G_ASHR: + // On some subtargets, 64-bit shift is a quarter rate instruction. In the + // common case, splitting this into a move and a 32-bit shift is faster and + // the same code size. 
+ return Helper.tryCombineShiftToUnmerge(MI, 32); + } + + return false; +} + bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy( MachineInstr &MI, FMinFMaxLegacyInfo &Info) const { // FIXME: Type predicate on pattern @@ -265,17 +282,20 @@ void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat( bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq( MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) const { - - auto getRcpSrc = [=](const MachineInstr &MI) { - MachineInstr *ResMI = nullptr; - if (MI.getOpcode() == TargetOpcode::G_INTRINSIC && - MI.getIntrinsicID() == Intrinsic::amdgcn_rcp) - ResMI = MRI.getVRegDef(MI.getOperand(2).getReg()); - - return ResMI; + auto getRcpSrc = [=](const MachineInstr &MI) -> MachineInstr * { + if (!MI.getFlag(MachineInstr::FmContract)) + return nullptr; + + if (auto *GI = dyn_cast<GIntrinsic>(&MI)) { + if (GI->is(Intrinsic::amdgcn_rcp)) + return MRI.getVRegDef(MI.getOperand(2).getReg()); + } + return nullptr; }; - auto getSqrtSrc = [=](const MachineInstr &MI) { + auto getSqrtSrc = [=](const MachineInstr &MI) -> MachineInstr * { + if (!MI.getFlag(MachineInstr::FmContract)) + return nullptr; MachineInstr *SqrtSrcMI = nullptr; auto Match = mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI))); @@ -287,7 +307,7 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq( // rcp(sqrt(x)) if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) { MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) { - B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false) + B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}) .addUse(SqrtSrcMI->getOperand(0).getReg()) .setMIFlags(MI.getFlags()); }; @@ -297,13 +317,12 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq( // sqrt(rcp(x)) if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) { MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) { - B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false) + B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}) .addUse(RcpSrcMI->getOperand(0).getReg()) .setMIFlags(MI.getFlags()); }; return true; } - return false; } @@ -400,51 +419,6 @@ void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg( MI.eraseFromParent(); } -class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo { - GISelKnownBits *KB; - MachineDominatorTree *MDT; - AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig; - -public: - AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, - const AMDGPULegalizerInfo *LI, - GISelKnownBits *KB, MachineDominatorTree *MDT) - : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, - /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize), - KB(KB), MDT(MDT) { - if (!RuleConfig.parseCommandLineOption()) - report_fatal_error("Invalid rule identifier"); - } - - bool combine(GISelChangeObserver &Observer, MachineInstr &MI, - MachineIRBuilder &B) const override; -}; - -bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, - MachineInstr &MI, - MachineIRBuilder &B) const { - AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ false, KB, MDT, - LInfo); - // TODO: Do not re-create the Impl on every inst, it should be per function. 
- AMDGPUPostLegalizerCombinerImpl Impl(RuleConfig, B, Helper, Observer); - Impl.setupMF(*MI.getMF(), KB); - - if (Impl.tryCombineAll(MI)) - return true; - - switch (MI.getOpcode()) { - case TargetOpcode::G_SHL: - case TargetOpcode::G_LSHR: - case TargetOpcode::G_ASHR: - // On some subtargets, 64-bit shift is a quarter rate instruction. In the - // common case, splitting this into a move and a 32-bit shift is faster and - // the same code size. - return Helper.tryCombineShiftToUnmerge(MI, 32); - } - - return false; -} - // Pass boilerplate // ================ @@ -461,8 +435,10 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; + private: bool IsOptNone; + AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig; }; } // end anonymous namespace @@ -482,6 +458,9 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone) : MachineFunctionPass(ID), IsOptNone(IsOptNone) { initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry()); + + if (!RuleConfig.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); } bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { @@ -491,7 +470,7 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { auto *TPC = &getAnalysis<TargetPassConfig>(); const Function &F = MF.getFunction(); bool EnableOpt = - MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const AMDGPULegalizerInfo *LI = @@ -500,10 +479,13 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); MachineDominatorTree *MDT = IsOptNone ? 
nullptr : &getAnalysis<MachineDominatorTree>(); - AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), - F.hasMinSize(), LI, KB, MDT); - Combiner C(PCInfo, TPC); - return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); + + CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, + LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); + + AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr, + RuleConfig, ST, MDT, LI); + return Impl.combineMachineInstrs(); } char AMDGPUPostLegalizerCombiner::ID = 0; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index 936ca54fcf2e..0c7e198810da 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -20,7 +20,6 @@ #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" -#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h" #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" @@ -42,26 +41,25 @@ namespace { #include "AMDGPUGenPreLegalizeGICombiner.inc" #undef GET_GICOMBINER_TYPES -class AMDGPUPreLegalizerCombinerImpl : public GIMatchTableExecutor { +class AMDGPUPreLegalizerCombinerImpl : public Combiner { protected: const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig; const GCNSubtarget &STI; - - GISelChangeObserver &Observer; - MachineIRBuilder &B; - MachineFunction &MF; - MachineRegisterInfo &MRI; - AMDGPUCombinerHelper &Helper; + // TODO: Make CombinerHelper methods const. 
+ mutable AMDGPUCombinerHelper Helper; public: AMDGPUPreLegalizerCombinerImpl( + MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, + GISelKnownBits &KB, GISelCSEInfo *CSEInfo, const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig, - const GCNSubtarget &STI, GISelChangeObserver &Observer, - MachineIRBuilder &B, AMDGPUCombinerHelper &Helper); + const GCNSubtarget &STI, MachineDominatorTree *MDT, + const LegalizerInfo *LI); static const char *getName() { return "AMDGPUPreLegalizerCombinerImpl"; } - bool tryCombineAll(MachineInstr &I) const; + bool tryCombineAllImpl(MachineInstr &MI) const; + bool tryCombineAll(MachineInstr &I) const override; struct ClampI64ToI16MatchInfo { int64_t Cmp1 = 0; @@ -91,17 +89,32 @@ private: #undef GET_GICOMBINER_IMPL AMDGPUPreLegalizerCombinerImpl::AMDGPUPreLegalizerCombinerImpl( + MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, + GISelKnownBits &KB, GISelCSEInfo *CSEInfo, const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig, - const GCNSubtarget &STI, GISelChangeObserver &Observer, MachineIRBuilder &B, - AMDGPUCombinerHelper &Helper) - : RuleConfig(RuleConfig), STI(STI), Observer(Observer), B(B), MF(B.getMF()), - MRI(*B.getMRI()), Helper(Helper), + const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) + : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI), + Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI), #define GET_GICOMBINER_CONSTRUCTOR_INITS #include "AMDGPUGenPreLegalizeGICombiner.inc" #undef GET_GICOMBINER_CONSTRUCTOR_INITS { } +bool AMDGPUPreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { + if (tryCombineAllImpl(MI)) + return true; + + switch (MI.getOpcode()) { + case TargetOpcode::G_CONCAT_VECTORS: + return Helper.tryCombineConcatVectors(MI); + case TargetOpcode::G_SHUFFLE_VECTOR: + return Helper.tryCombineShuffleVector(MI); + } + + return false; +} + bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16( MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineFunction &MF, ClampI64ToI16MatchInfo &MatchInfo) const { @@ -199,49 +212,6 @@ void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16( MI.eraseFromParent(); } -class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo { - GISelKnownBits *KB; - MachineDominatorTree *MDT; - AMDGPUPreLegalizerCombinerImplRuleConfig RuleConfig; - -public: - AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, - GISelKnownBits *KB, MachineDominatorTree *MDT) - : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, - /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), - KB(KB), MDT(MDT) { - if (!RuleConfig.parseCommandLineOption()) - report_fatal_error("Invalid rule identifier"); - } - - bool combine(GISelChangeObserver &Observer, MachineInstr &MI, - MachineIRBuilder &B) const override; -}; - -bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, - MachineInstr &MI, - MachineIRBuilder &B) const { - const auto *LI = MI.getMF()->getSubtarget().getLegalizerInfo(); - AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ true, KB, MDT, LI); - - const GCNSubtarget &STI = MI.getMF()->getSubtarget<GCNSubtarget>(); - // TODO: Do not re-create the Impl on every inst, it should be per function. 
- AMDGPUPreLegalizerCombinerImpl Impl(RuleConfig, STI, Observer, B, Helper); - Impl.setupMF(*MI.getMF(), KB); - - if (Impl.tryCombineAll(MI)) - return true; - - switch (MI.getOpcode()) { - case TargetOpcode::G_CONCAT_VECTORS: - return Helper.tryCombineConcatVectors(MI); - case TargetOpcode::G_SHUFFLE_VECTOR: - return Helper.tryCombineShuffleVector(MI); - } - - return false; -} - // Pass boilerplate // ================ @@ -261,6 +231,7 @@ public: private: bool IsOptNone; + AMDGPUPreLegalizerCombinerImplRuleConfig RuleConfig; }; } // end anonymous namespace @@ -283,6 +254,9 @@ void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone) : MachineFunctionPass(ID), IsOptNone(IsOptNone) { initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); + + if (!RuleConfig.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); } bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { @@ -292,19 +266,22 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { auto *TPC = &getAnalysis<TargetPassConfig>(); const Function &F = MF.getFunction(); bool EnableOpt = - MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); - MachineDominatorTree *MDT = - IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); - AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), - F.hasMinSize(), KB, MDT); + // Enable CSE. GISelCSEAnalysisWrapper &Wrapper = getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper(); auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig()); - Combiner C(PCInfo, TPC); - return C.combineMachineInstrs(MF, CSEInfo); + const GCNSubtarget &STI = MF.getSubtarget<GCNSubtarget>(); + MachineDominatorTree *MDT = + IsOptNone ? 
nullptr : &getAnalysis<MachineDominatorTree>(); + CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, + nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize()); + AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, RuleConfig, + STI, MDT, STI.getLegalizerInfo()); + return Impl.combineMachineInstrs(); } char AMDGPUPreLegalizerCombiner::ID = 0; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index 13f83e298cf4..7b5dc3795b02 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -102,7 +102,7 @@ void AMDGPUPrintfRuntimeBindingImpl::getConversionSpecifiers( bool ArgDump = false; StringRef CurFmt = Fmt.substr(PrevFmtSpecifierIdx, CurFmtSpecifierIdx - PrevFmtSpecifierIdx); - size_t pTag = CurFmt.find_last_of("%"); + size_t pTag = CurFmt.find_last_of('%'); if (pTag != StringRef::npos) { ArgDump = true; while (pTag && CurFmt[--pTag] == '%') { @@ -439,7 +439,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::run(Module &M) { for (auto &U : PrintfFunction->uses()) { if (auto *CI = dyn_cast<CallInst>(U.getUser())) { - if (CI->isCallee(&U)) + if (CI->isCallee(&U) && !CI->isNoBuiltin()) Printfs.push_back(CI); } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 17025867c1da..1bed516fb5c7 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -185,7 +185,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE, "AMDGPU promote alloca to vector or LDS", false, false) // Move LDS uses from functions to kernels before promote alloca for accurate // estimation of LDS available -INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS) +INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy) INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE, "AMDGPU promote alloca to vector or LDS", false, false) @@ -386,7 +386,6 @@ static Value *promoteAllocaUserToVector( }; Type *VecEltTy = VectorTy->getElementType(); - const unsigned NumVecElts = VectorTy->getNumElements(); switch (Inst->getOpcode()) { case Instruction::Load: { @@ -419,11 +418,12 @@ static Value *promoteAllocaUserToVector( auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); - unsigned IndexVal = cast<ConstantInt>(Index)->getZExtValue(); Value *SubVec = PoisonValue::get(SubVecTy); for (unsigned K = 0; K < NumLoadedElts; ++K) { + Value *CurIdx = + Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K)); SubVec = Builder.CreateInsertElement( - SubVec, Builder.CreateExtractElement(CurVal, IndexVal + K), K); + SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K); } if (AccessTy->isPtrOrPtrVectorTy()) @@ -469,6 +469,7 @@ static Value *promoteAllocaUserToVector( assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); const unsigned NumWrittenElts = AccessSize / DL.getTypeStoreSize(VecEltTy); + const unsigned NumVecElts = VectorTy->getNumElements(); auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); @@ -479,12 +480,13 @@ static Value *promoteAllocaUserToVector( Val = Builder.CreateBitOrPointerCast(Val, SubVecTy); - 
unsigned IndexVal = cast<ConstantInt>(Index)->getZExtValue(); Value *CurVec = GetOrLoadCurrentVectorValue(); - for (unsigned K = 0; K < NumWrittenElts && ((IndexVal + K) < NumVecElts); - ++K) { + for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts); + K < NumElts; ++K) { + Value *CurIdx = + Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K)); CurVec = Builder.CreateInsertElement( - CurVec, Builder.CreateExtractElement(Val, K), IndexVal + K); + CurVec, Builder.CreateExtractElement(Val, K), CurIdx); } return CurVec; } @@ -679,6 +681,12 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { return RejectUser(Inst, "unsupported load/store as aggregate"); assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy()); + // Check that this is a simple access of a vector element. + bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple() + : cast<StoreInst>(Inst)->isSimple(); + if (!IsSimple) + return RejectUser(Inst, "not a simple load or store"); + Ptr = Ptr->stripPointerCasts(); // Alloca already accessed as vector. @@ -688,11 +696,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { continue; } - // Check that this is a simple access of a vector element. - bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple() - : cast<StoreInst>(Inst)->isSimple(); - if (!IsSimple) - return RejectUser(Inst, "not a simple load or store"); if (!isSupportedAccessType(VectorTy, AccessTy, *DL)) return RejectUser(Inst, "not a supported access type"); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index c935e384da8e..20e1aaa5419a 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -20,7 +20,6 @@ #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" -#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h" #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" @@ -43,29 +42,27 @@ namespace { #include "AMDGPUGenRegBankGICombiner.inc" #undef GET_GICOMBINER_TYPES -class AMDGPURegBankCombinerImpl : public GIMatchTableExecutor { +class AMDGPURegBankCombinerImpl : public Combiner { protected: const AMDGPURegBankCombinerImplRuleConfig &RuleConfig; - - MachineIRBuilder &B; - MachineFunction &MF; - MachineRegisterInfo &MRI; const GCNSubtarget &STI; const RegisterBankInfo &RBI; const TargetRegisterInfo &TRI; const SIInstrInfo &TII; - CombinerHelper &Helper; - GISelChangeObserver &Observer; + // TODO: Make CombinerHelper methods const. 
+ mutable CombinerHelper Helper; public: AMDGPURegBankCombinerImpl( + MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, + GISelKnownBits &KB, GISelCSEInfo *CSEInfo, const AMDGPURegBankCombinerImplRuleConfig &RuleConfig, - MachineIRBuilder &B, CombinerHelper &Helper, - GISelChangeObserver &Observer); + const GCNSubtarget &STI, MachineDominatorTree *MDT, + const LegalizerInfo *LI); static const char *getName() { return "AMDGPURegBankCombinerImpl"; } - bool tryCombineAll(MachineInstr &I) const; + bool tryCombineAll(MachineInstr &I) const override; bool isVgprRegBank(Register Reg) const; Register getAsVgpr(Register Reg) const; @@ -114,12 +111,14 @@ private: #undef GET_GICOMBINER_IMPL AMDGPURegBankCombinerImpl::AMDGPURegBankCombinerImpl( - const AMDGPURegBankCombinerImplRuleConfig &RuleConfig, MachineIRBuilder &B, - CombinerHelper &Helper, GISelChangeObserver &Observer) - : RuleConfig(RuleConfig), B(B), MF(B.getMF()), MRI(*B.getMRI()), - STI(MF.getSubtarget<GCNSubtarget>()), RBI(*STI.getRegBankInfo()), - TRI(*STI.getRegisterInfo()), TII(*STI.getInstrInfo()), Helper(Helper), - Observer(Observer), + MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, + GISelKnownBits &KB, GISelCSEInfo *CSEInfo, + const AMDGPURegBankCombinerImplRuleConfig &RuleConfig, + const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) + : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI), + RBI(*STI.getRegBankInfo()), TRI(*STI.getRegisterInfo()), + TII(*STI.getInstrInfo()), + Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI), #define GET_GICOMBINER_CONSTRUCTOR_INITS #include "AMDGPUGenRegBankGICombiner.inc" #undef GET_GICOMBINER_CONSTRUCTOR_INITS @@ -396,36 +395,6 @@ bool AMDGPURegBankCombinerImpl::isClampZeroToOne(MachineInstr *K0, return false; } -class AMDGPURegBankCombinerInfo final : public CombinerInfo { - GISelKnownBits *KB; - MachineDominatorTree *MDT; - AMDGPURegBankCombinerImplRuleConfig RuleConfig; - -public: - AMDGPURegBankCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, - const AMDGPULegalizerInfo *LI, GISelKnownBits *KB, - MachineDominatorTree *MDT) - : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, - /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize), - KB(KB), MDT(MDT) { - if (!RuleConfig.parseCommandLineOption()) - report_fatal_error("Invalid rule identifier"); - } - - bool combine(GISelChangeObserver &Observer, MachineInstr &MI, - MachineIRBuilder &B) const override; -}; - -bool AMDGPURegBankCombinerInfo::combine(GISelChangeObserver &Observer, - MachineInstr &MI, - MachineIRBuilder &B) const { - CombinerHelper Helper(Observer, B, /* IsPreLegalize*/ false, KB, MDT); - // TODO: Do not re-create the Impl on every inst, it should be per function. 
- AMDGPURegBankCombinerImpl Impl(RuleConfig, B, Helper, Observer); - Impl.setupMF(*MI.getMF(), KB); - return Impl.tryCombineAll(MI); -} - // Pass boilerplate // ================ @@ -440,8 +409,10 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; + private: bool IsOptNone; + AMDGPURegBankCombinerImplRuleConfig RuleConfig; }; } // end anonymous namespace @@ -461,6 +432,9 @@ void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone) : MachineFunctionPass(ID), IsOptNone(IsOptNone) { initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry()); + + if (!RuleConfig.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); } bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) { @@ -470,19 +444,20 @@ bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) { auto *TPC = &getAnalysis<TargetPassConfig>(); const Function &F = MF.getFunction(); bool EnableOpt = - MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const AMDGPULegalizerInfo *LI = - static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo()); - GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); + + const auto *LI = ST.getLegalizerInfo(); MachineDominatorTree *MDT = IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); - AMDGPURegBankCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), F.hasMinSize(), - LI, KB, MDT); - Combiner C(PCInfo, TPC); - return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); + + CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, + LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); + AMDGPURegBankCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr, + RuleConfig, ST, MDT, LI); + return Impl.combineMachineInstrs(); } char AMDGPURegBankCombiner::ID = 0; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 0203af32e389..c9412f720c62 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -97,19 +97,25 @@ namespace { // Observer to apply a register bank to new registers created by LegalizerHelper. class ApplyRegBankMapping final : public GISelChangeObserver { private: + MachineIRBuilder &B; const AMDGPURegisterBankInfo &RBI; MachineRegisterInfo &MRI; const RegisterBank *NewBank; SmallVector<MachineInstr *, 4> NewInsts; public: - ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_, + ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_, MachineRegisterInfo &MRI_, const RegisterBank *RB) - : RBI(RBI_), MRI(MRI_), NewBank(RB) {} + : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) { + assert(!B.isObservingChanges()); + B.setChangeObserver(*this); + } ~ApplyRegBankMapping() { for (MachineInstr *MI : NewInsts) applyBank(*MI); + + B.stopObservingChanges(); } /// Set any registers that don't have a set register class or bank to SALU. @@ -131,7 +137,8 @@ public: // Replace the extension with a select, which really uses the boolean // source. - MachineIRBuilder B(MI); + B.setInsertPt(*MI.getParent(), MI); + auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? 
-1 : 1); auto False = B.buildConstant(S32, 0); B.buildSelect(DstReg, SrcReg, True, False); @@ -193,6 +200,7 @@ public: }; } + AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) : Subtarget(ST), TRI(Subtarget.getRegisterInfo()), TII(Subtarget.getInstrInfo()) { @@ -221,7 +229,7 @@ bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const { unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, const RegisterBank &Src, - unsigned Size) const { + TypeSize Size) const { // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? if (Dst.getID() == AMDGPU::SGPRRegBankID && (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) { @@ -337,7 +345,7 @@ AMDGPURegisterBankInfo::addMappingFromTable( RegisterBankInfo::InstructionMappings AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( const MachineInstr &MI, const MachineRegisterInfo &MRI) const { - switch (MI.getIntrinsicID()) { + switch (cast<GIntrinsic>(MI).getIntrinsicID()) { case Intrinsic::amdgcn_readlane: { static const OpRegBankEntry<3> Table[2] = { // Perfectly legal. @@ -378,7 +386,7 @@ RegisterBankInfo::InstructionMappings AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( const MachineInstr &MI, const MachineRegisterInfo &MRI) const { - switch (MI.getIntrinsicID()) { + switch (cast<GIntrinsic>(MI).getIntrinsicID()) { case Intrinsic::amdgcn_s_buffer_load: { static const OpRegBankEntry<2> Table[4] = { // Perfectly legal. @@ -632,8 +640,10 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( return AltMappings; } case AMDGPU::G_INTRINSIC: + case AMDGPU::G_INTRINSIC_CONVERGENT: return getInstrAlternativeMappingsIntrinsic(MI, MRI); case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: + case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI); default: break; @@ -758,11 +768,8 @@ Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B, /// There is additional complexity to try for compare values to identify the /// unique values used. bool AMDGPURegisterBankInfo::executeInWaterfallLoop( - MachineIRBuilder &B, - iterator_range<MachineBasicBlock::iterator> Range, - SmallSet<Register, 4> &SGPROperandRegs, - MachineRegisterInfo &MRI) const { - + MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range, + SmallSet<Register, 4> &SGPROperandRegs) const { // Track use registers which have already been expanded with a readfirstlane // sequence. This may have multiple uses if moving a sequence. DenseMap<Register, Register> WaterfalledRegMap; @@ -787,6 +794,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( const int OrigRangeSize = std::distance(Range.begin(), Range.end()); #endif + MachineRegisterInfo &MRI = *B.getMRI(); Register SaveExecReg = MRI.createVirtualRegister(WaveRC); Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC); @@ -922,8 +930,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( // The ballot becomes a no-op during instruction selection. CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot, - {LLT::scalar(Subtarget.isWave32() ? 32 : 64)}, - false) + {LLT::scalar(Subtarget.isWave32() ? 
32 : 64)}) .addReg(CondReg) .getReg(0); MRI.setRegClass(CondReg, WaveRC); @@ -986,37 +993,28 @@ bool AMDGPURegisterBankInfo::collectWaterfallOperands( } bool AMDGPURegisterBankInfo::executeInWaterfallLoop( - MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, - ArrayRef<unsigned> OpIndices) const { + MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const { // Use a set to avoid extra readfirstlanes in the case where multiple operands // are the same register. SmallSet<Register, 4> SGPROperandRegs; - if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices)) + if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices)) return false; MachineBasicBlock::iterator I = MI.getIterator(); return executeInWaterfallLoop(B, make_range(I, std::next(I)), - SGPROperandRegs, MRI); -} - -bool AMDGPURegisterBankInfo::executeInWaterfallLoop( - MachineInstr &MI, MachineRegisterInfo &MRI, - ArrayRef<unsigned> OpIndices) const { - MachineIRBuilder B(MI); - return executeInWaterfallLoop(B, MI, MRI, OpIndices); + SGPROperandRegs); } // Legalize an operand that must be an SGPR by inserting a readfirstlane. void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( - MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const { + MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const { Register Reg = MI.getOperand(OpIdx).getReg(); + MachineRegisterInfo &MRI = *B.getMRI(); const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); if (Bank == &AMDGPU::SGPRRegBank) return; - MachineIRBuilder B(MI); - Reg = buildReadFirstLane(B, MRI, Reg); MI.getOperand(OpIdx).setReg(Reg); } @@ -1048,9 +1046,11 @@ static LLT widen96To128(LLT Ty) { return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy); } -bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, - const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, - MachineRegisterInfo &MRI) const { +bool AMDGPURegisterBankInfo::applyMappingLoad( + MachineIRBuilder &B, + const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineInstr &MI) const { + MachineRegisterInfo &MRI = *B.getMRI(); Register DstReg = MI.getOperand(0).getReg(); const LLT LoadTy = MRI.getType(DstReg); unsigned LoadSize = LoadTy.getSizeInBits(); @@ -1061,7 +1061,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, if (DstBank == &AMDGPU::SGPRRegBank) { // There are some special cases that we need to look at for 32 bit and 96 // bit SGPR loads otherwise we have nothing to do. - if (LoadSize != 32 && LoadSize != 96) + if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads())) return false; MachineMemOperand *MMO = *MI.memoperands_begin(); @@ -1076,8 +1076,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, Register PtrReg = MI.getOperand(1).getReg(); - ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank); - MachineIRBuilder B(MI, O); + ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); if (LoadSize == 32) { // This is an extending load from a sub-dword size. Widen the memory @@ -1098,10 +1097,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, // 96-bit loads are only available for vector loads. We need to split this // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). 
if (MMO->getAlign() < Align(16)) { - MachineFunction *MF = MI.getParent()->getParent(); - ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); - MachineIRBuilder B(MI, ApplyBank); - LegalizerHelper Helper(*MF, ApplyBank, B); + LegalizerHelper Helper(B.getMF(), ApplyBank, B); LLT Part64, Part32; std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) != @@ -1144,9 +1140,8 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize; const LLT LoadSplitTy = LoadTy.divide(NumSplitParts); - ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank); - MachineIRBuilder B(MI, Observer); - LegalizerHelper Helper(B.getMF(), Observer, B); + ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); + LegalizerHelper Helper(B.getMF(), O, B); if (LoadTy.isVector()) { if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) @@ -1161,10 +1156,11 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, } bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( - MachineInstr &MI, - const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, - MachineRegisterInfo &MRI) const { - const MachineFunction &MF = *MI.getMF(); + MachineIRBuilder &B, + const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineInstr &MI) const { + MachineRegisterInfo &MRI = *B.getMRI(); + const MachineFunction &MF = B.getMF(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const auto &TFI = *ST.getFrameLowering(); @@ -1188,8 +1184,7 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); Register SPReg = Info->getStackPtrOffsetReg(); - ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank); - MachineIRBuilder B(MI, ApplyBank); + ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank); auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2()); auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize); @@ -1208,8 +1203,9 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( } bool AMDGPURegisterBankInfo::applyMappingImage( - MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, - MachineRegisterInfo &MRI, int RsrcIdx) const { + MachineIRBuilder &B, MachineInstr &MI, + const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + int RsrcIdx) const { const int NumDefs = MI.getNumExplicitDefs(); // The reported argument index is relative to the IR intrinsic call arguments, @@ -1230,7 +1226,7 @@ bool AMDGPURegisterBankInfo::applyMappingImage( SGPRIndexes.push_back(I); } - executeInWaterfallLoop(MI, MRI, SGPRIndexes); + executeInWaterfallLoop(B, MI, SGPRIndexes); return true; } @@ -1320,7 +1316,7 @@ unsigned AMDGPURegisterBankInfo::setBufferOffsets( } bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( - const OperandsMapper &OpdMapper) const { + MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); MachineRegisterInfo &MRI = OpdMapper.getMRI(); @@ -1350,7 +1346,6 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( // immediate offsets. const Align Alignment = NumLoads > 1 ? 
Align(16 * NumLoads) : Align(1); - MachineIRBuilder B(MI); MachineFunction &MF = B.getMF(); Register SOffset; @@ -1421,7 +1416,7 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( OpsToWaterfall.insert(RSrc); executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), - OpsToWaterfall, MRI); + OpsToWaterfall); } if (NumLoads != 1) { @@ -1438,7 +1433,8 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( return true; } -bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper, +bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B, + const OperandsMapper &OpdMapper, bool Signed) const { MachineInstr &MI = OpdMapper.getMI(); MachineRegisterInfo &MRI = OpdMapper.getMRI(); @@ -1451,7 +1447,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper, const LLT S32 = LLT::scalar(32); - unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1; + unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1; Register SrcReg = MI.getOperand(FirstOpnd).getReg(); Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg(); Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg(); @@ -1464,8 +1460,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper, // There is no 64-bit vgpr bitfield extract instructions so the operation // is expanded to a sequence of instructions that implement the operation. - ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank); - MachineIRBuilder B(MI, ApplyBank); + ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); const LLT S64 = LLT::scalar(64); // Shift the source operand so that extracted bits start at bit 0. @@ -1517,8 +1512,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper, // The scalar form packs the offset and width in a single operand. - ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank); - MachineIRBuilder B(MI, ApplyBank); + ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank); // Ensure the high bits are clear to insert the offset. auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6)); @@ -1546,7 +1540,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper, } bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( - const OperandsMapper &OpdMapper) const { + MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); MachineRegisterInfo &MRI = OpdMapper.getMRI(); @@ -1575,8 +1569,6 @@ bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( } // Keep the multiplication on the SALU. 
- MachineIRBuilder B(MI); - Register DstHi; Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0); bool MulHiInVgpr = false; @@ -1792,7 +1784,7 @@ getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { std::pair<Register, unsigned> AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const { - const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); + const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget); Register BaseReg; unsigned ImmOffset; const LLT S32 = LLT::scalar(32); @@ -1916,8 +1908,9 @@ static void extendLow32IntoHigh32(MachineIRBuilder &B, } bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( - MachineInstr &MI, MachineRegisterInfo &MRI, - const OperandsMapper &OpdMapper) const { + MachineIRBuilder &B, MachineInstr &MI, + const OperandsMapper &OpdMapper) const { + MachineRegisterInfo &MRI = *B.getMRI(); Register VecReg = MI.getOperand(1).getReg(); Register Idx = MI.getOperand(2).getReg(); @@ -1935,7 +1928,6 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( IsDivergentIdx, &Subtarget)) return false; - MachineIRBuilder B(MI); LLT S32 = LLT::scalar(32); const RegisterBank &DstBank = @@ -2014,9 +2006,10 @@ static Register constrainRegToBank(MachineRegisterInfo &MRI, } bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( - MachineInstr &MI, MachineRegisterInfo &MRI, - const OperandsMapper &OpdMapper) const { + MachineIRBuilder &B, MachineInstr &MI, + const OperandsMapper &OpdMapper) const { + MachineRegisterInfo &MRI = *B.getMRI(); Register VecReg = MI.getOperand(1).getReg(); Register Idx = MI.getOperand(3).getReg(); @@ -2033,7 +2026,6 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( IsDivergentIdx, &Subtarget)) return false; - MachineIRBuilder B(MI); LLT S32 = LLT::scalar(32); const RegisterBank &DstBank = @@ -2103,8 +2095,9 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( } void AMDGPURegisterBankInfo::applyMappingImpl( - const OperandsMapper &OpdMapper) const { + MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); + B.setInstrAndDebugLoc(MI); unsigned Opc = MI.getOpcode(); MachineRegisterInfo &MRI = OpdMapper.getMRI(); switch (Opc) { @@ -2123,7 +2116,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl( if (DefRegs.empty()) DefRegs.push_back(DstReg); - MachineIRBuilder B(MI); B.setInsertPt(*MI.getParent(), ++MI.getIterator()); Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); @@ -2156,8 +2148,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // produce an invalid copy. We can only copy with some kind of compare to // get a vector boolean result. Insert a register bank copy that will be // correctly lowered to a compare. 
- MachineIRBuilder B(*MI.getParent()->getParent()); - for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { Register SrcReg = MI.getOperand(I).getReg(); const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); @@ -2179,16 +2169,19 @@ void AMDGPURegisterBankInfo::applyMappingImpl( substituteSimpleCopyRegs(OpdMapper, 0); // Promote SGPR/VGPR booleans to s32 - MachineFunction *MF = MI.getParent()->getParent(); - ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); - MachineIRBuilder B(MI, ApplyBank); - LegalizerHelper Helper(*MF, ApplyBank, B); + ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); + B.setInsertPt(B.getMBB(), MI); + LegalizerHelper Helper(B.getMF(), ApplyBank, B); if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) llvm_unreachable("widen scalar should have succeeded"); return; } + case AMDGPU::G_FCMP: + if (!Subtarget.hasSALUFloatInsts()) + break; + LLVM_FALLTHROUGH; case AMDGPU::G_ICMP: case AMDGPU::G_UADDO: case AMDGPU::G_USUBO: @@ -2196,7 +2189,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case AMDGPU::G_SADDE: case AMDGPU::G_USUBE: case AMDGPU::G_SSUBE: { - unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1; + unsigned BoolDstOp = + (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1; Register DstReg = MI.getOperand(BoolDstOp).getReg(); const RegisterBank *DstBank = @@ -2212,7 +2206,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl( Register NewDstReg = MRI.createGenericVirtualRegister(S32); MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); MI.getOperand(BoolDstOp).setReg(NewDstReg); - MachineIRBuilder B(MI); if (HasCarryIn) { Register NewSrcReg = MRI.createGenericVirtualRegister(S32); @@ -2245,7 +2238,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl( const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); if (CondBank == &AMDGPU::SGPRRegBank) { - MachineIRBuilder B(MI); const LLT S32 = LLT::scalar(32); Register NewCondReg = MRI.createGenericVirtualRegister(S32); MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); @@ -2257,7 +2249,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl( if (DstTy.getSizeInBits() != 64) break; - MachineIRBuilder B(MI); LLT HalfTy = getHalfSizedType(DstTy); SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); @@ -2297,7 +2288,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl( OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; if (CondBank == &AMDGPU::SGPRRegBank) { - MachineIRBuilder B(MI); const LLT S32 = LLT::scalar(32); Register NewCondReg = MRI.createGenericVirtualRegister(S32); MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); @@ -2324,8 +2314,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( break; MachineFunction *MF = MI.getParent()->getParent(); - ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); - MachineIRBuilder B(MI, ApplyBank); + ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank); LegalizerHelper Helper(*MF, ApplyBank, B); if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != @@ -2355,7 +2344,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // Depending on where the source registers came from, the generic code may // have decided to split the inputs already or not. If not, we still need to // extract the values. - MachineIRBuilder B(MI); if (Src0Regs.empty()) split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); @@ -2384,8 +2372,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // max combination. 
if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { MachineFunction *MF = MI.getParent()->getParent(); - ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank); - MachineIRBuilder B(MI, Apply); + ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank); LegalizerHelper Helper(*MF, Apply, B); if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) @@ -2420,8 +2407,19 @@ void AMDGPURegisterBankInfo::applyMappingImpl( const LLT S32 = LLT::scalar(32); MachineBasicBlock *MBB = MI.getParent(); MachineFunction *MF = MBB->getParent(); - ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); - MachineIRBuilder B(MI, ApplySALU); + ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank); + + if (DstTy.isVector() && Opc == AMDGPU::G_ABS) { + Register WideSrcLo, WideSrcHi; + + std::tie(WideSrcLo, WideSrcHi) = + unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT); + auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo}); + auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi}); + B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); + MI.eraseFromParent(); + return; + } if (DstTy.isVector()) { Register WideSrc0Lo, WideSrc0Hi; @@ -2459,10 +2457,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( break; // Nothing to repair const LLT S32 = LLT::scalar(32); - MachineIRBuilder B(MI); - ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank); - GISelObserverWrapper Observer(&O); - B.setChangeObserver(Observer); + ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank); // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs // we would need to further expand, and doesn't let us directly set the @@ -2508,8 +2503,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( if (Ty == S32) break; - ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); - MachineIRBuilder B(MI, ApplyVALU); + ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); MachineFunction &MF = B.getMF(); LegalizerHelper Helper(MF, ApplyVALU, B); @@ -2539,8 +2533,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32)) // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo)) - ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); - MachineIRBuilder B(MI, ApplyVALU); + ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank); SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32 @@ -2569,7 +2562,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl( assert(OpdMapper.getVRegs(1).empty()); - MachineIRBuilder B(MI); const RegisterBank *SrcBank = OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; @@ -2654,11 +2646,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( LLT DstTy = MRI.getType(DstReg); LLT SrcTy = MRI.getType(SrcReg); - if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper)) + if (foldExtractEltToCmpSelect(B, MI, OpdMapper)) return; - MachineIRBuilder B(MI); - const ValueMapping &DstMapping = OpdMapper.getInstrMapping().getOperandMapping(0); const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; @@ -2693,7 +2683,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( if (DstRegs.empty()) { applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, { 2 }); + executeInWaterfallLoop(B, MI, {2}); if (NeedCopyToVGPR) { // We don't want a phi for this temporary reg. 
@@ -2752,7 +2742,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( B.setInstr(*Span.begin()); MI.eraseFromParent(); executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), - OpsToWaterfall, MRI); + OpsToWaterfall); if (NeedCopyToVGPR) { MachineBasicBlock *LoopBB = Extract1->getParent(); @@ -2787,7 +2777,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( if (substituteSimpleCopyRegs(OpdMapper, 1)) MRI.setType(MI.getOperand(1).getReg(), VecTy); - if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper)) + if (foldInsertEltToCmpSelect(B, MI, OpdMapper)) return; const RegisterBank *IdxBank = @@ -2817,24 +2807,21 @@ void AMDGPURegisterBankInfo::applyMappingImpl( if (InsRegs.empty()) { - executeInWaterfallLoop(MI, MRI, { 3 }); + executeInWaterfallLoop(B, MI, {3}); // Re-insert the constant offset add inside the waterfall loop. if (ShouldMoveIndexIntoLoop) { - MachineIRBuilder B(MI); reinsertVectorIndexAdd(B, MI, 3, ConstOffset); } return; } - assert(InsTy.getSizeInBits() == 64); const LLT S32 = LLT::scalar(32); LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32); - MachineIRBuilder B(MI); auto CastSrc = B.buildBitcast(Vec32, SrcReg); auto One = B.buildConstant(S32, 1); @@ -2881,7 +2868,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // Figure out the point after the waterfall loop before mangling the control // flow. executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), - OpsToWaterfall, MRI); + OpsToWaterfall); // The insertion point is now right after the original instruction. // @@ -2913,7 +2900,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, {1, 4}); + executeInWaterfallLoop(B, MI, {1, 4}); return; } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: @@ -2929,27 +2916,28 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, {2, 5}); + executeInWaterfallLoop(B, MI, {2, 5}); return; } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, {2, 5}); + executeInWaterfallLoop(B, MI, {2, 5}); return; } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, {3, 6}); + executeInWaterfallLoop(B, MI, {3, 6}); return; } case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { - applyMappingSBufferLoad(OpdMapper); + applyMappingSBufferLoad(B, OpdMapper); return; } - case AMDGPU::G_INTRINSIC: { - switch (MI.getIntrinsicID()) { + case AMDGPU::G_INTRINSIC: + case AMDGPU::G_INTRINSIC_CONVERGENT: { + switch (cast<GIntrinsic>(MI).getIntrinsicID()) { case Intrinsic::amdgcn_readlane: { substituteSimpleCopyRegs(OpdMapper, 2); @@ -2958,7 +2946,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // Make sure the index is an SGPR. It doesn't make sense to run this in a // waterfall loop, so assume it's a uniform value. 
- constrainOpWithReadfirstlane(MI, MRI, 3); // Index + constrainOpWithReadfirstlane(B, MI, 3); // Index return; } case Intrinsic::amdgcn_writelane: { @@ -2967,8 +2955,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( assert(OpdMapper.getVRegs(3).empty()); substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val - constrainOpWithReadfirstlane(MI, MRI, 2); // Source value - constrainOpWithReadfirstlane(MI, MRI, 3); // Index + constrainOpWithReadfirstlane(B, MI, 2); // Source value + constrainOpWithReadfirstlane(B, MI, 3); // Index return; } case Intrinsic::amdgcn_interp_p1: @@ -2981,7 +2969,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // Readlane for m0 value, which is always the last operand. // FIXME: Should this be a waterfall loop instead? - constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index + constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index return; } case Intrinsic::amdgcn_interp_inreg_p10: @@ -2995,19 +2983,22 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // Doing a waterfall loop over these wouldn't make any sense. substituteSimpleCopyRegs(OpdMapper, 2); substituteSimpleCopyRegs(OpdMapper, 3); - constrainOpWithReadfirstlane(MI, MRI, 4); - constrainOpWithReadfirstlane(MI, MRI, 5); + constrainOpWithReadfirstlane(B, MI, 4); + constrainOpWithReadfirstlane(B, MI, 5); return; } case Intrinsic::amdgcn_sbfe: - applyMappingBFE(OpdMapper, true); + applyMappingBFE(B, OpdMapper, true); return; case Intrinsic::amdgcn_ubfe: - applyMappingBFE(OpdMapper, false); + applyMappingBFE(B, OpdMapper, false); return; case Intrinsic::amdgcn_inverse_ballot: + case Intrinsic::amdgcn_s_bitreplicate: + case Intrinsic::amdgcn_s_quadmask: + case Intrinsic::amdgcn_s_wqm: applyDefaultMapping(OpdMapper); - constrainOpWithReadfirstlane(MI, MRI, 2); // Mask + constrainOpWithReadfirstlane(B, MI, 2); // Mask return; case Intrinsic::amdgcn_ballot: // Use default handling and insert copy to vcc source. @@ -3019,30 +3010,31 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { - const AMDGPU::RsrcIntrinsic *RSrcIntrin - = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID()); + const AMDGPU::RsrcIntrinsic *RSrcIntrin = + AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI)); assert(RSrcIntrin && RSrcIntrin->IsImage); // Non-images can have complications from operands that allow both SGPR // and VGPR. For now it's too complicated to figure out the final opcode // to derive the register bank from the MCInstrDesc. - applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); + applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg); return; } case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { unsigned N = MI.getNumExplicitOperands() - 2; applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, { N }); + executeInWaterfallLoop(B, MI, {N}); return; } - case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { - auto IntrID = MI.getIntrinsicID(); + case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: + case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: { + auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); switch (IntrID) { case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: { // This is only allowed to execute with 1 lane, so readfirstlane is safe. 
assert(OpdMapper.getVRegs(0).empty()); substituteSimpleCopyRegs(OpdMapper, 3); - constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + constrainOpWithReadfirstlane(B, MI, 2); // M0 return; } case Intrinsic::amdgcn_ds_gws_init: @@ -3050,62 +3042,85 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_ds_gws_sema_br: { // Only the first lane is executes, so readfirstlane is safe. substituteSimpleCopyRegs(OpdMapper, 1); - constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + constrainOpWithReadfirstlane(B, MI, 2); // M0 return; } case Intrinsic::amdgcn_ds_gws_sema_v: case Intrinsic::amdgcn_ds_gws_sema_p: case Intrinsic::amdgcn_ds_gws_sema_release_all: { // Only the first lane is executes, so readfirstlane is safe. - constrainOpWithReadfirstlane(MI, MRI, 1); // M0 + constrainOpWithReadfirstlane(B, MI, 1); // M0 return; } case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: { - constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + constrainOpWithReadfirstlane(B, MI, 2); // M0 return; } case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // FIXME: Should this use a waterfall loop? - constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + constrainOpWithReadfirstlane(B, MI, 2); // M0 return; } case Intrinsic::amdgcn_s_setreg: { - constrainOpWithReadfirstlane(MI, MRI, 2); + constrainOpWithReadfirstlane(B, MI, 2); return; } + case Intrinsic::amdgcn_s_ttracedata: + constrainOpWithReadfirstlane(B, MI, 1); // M0 + return; case Intrinsic::amdgcn_raw_buffer_load_lds: case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { applyDefaultMapping(OpdMapper); - constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc - constrainOpWithReadfirstlane(MI, MRI, 2); // M0 - constrainOpWithReadfirstlane(MI, MRI, 5); // soffset + constrainOpWithReadfirstlane(B, MI, 1); // rsrc + constrainOpWithReadfirstlane(B, MI, 2); // M0 + constrainOpWithReadfirstlane(B, MI, 5); // soffset return; } case Intrinsic::amdgcn_struct_buffer_load_lds: case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { applyDefaultMapping(OpdMapper); - constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc - constrainOpWithReadfirstlane(MI, MRI, 2); // M0 - constrainOpWithReadfirstlane(MI, MRI, 6); // soffset + constrainOpWithReadfirstlane(B, MI, 1); // rsrc + constrainOpWithReadfirstlane(B, MI, 2); // M0 + constrainOpWithReadfirstlane(B, MI, 6); // soffset return; } case Intrinsic::amdgcn_global_load_lds: { applyDefaultMapping(OpdMapper); - constrainOpWithReadfirstlane(MI, MRI, 2); + constrainOpWithReadfirstlane(B, MI, 2); return; } case Intrinsic::amdgcn_lds_direct_load: { applyDefaultMapping(OpdMapper); // Readlane for m0 value, which is always the last operand. 
- constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index + constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index return; } case Intrinsic::amdgcn_exp_row: applyDefaultMapping(OpdMapper); - constrainOpWithReadfirstlane(MI, MRI, 8); // M0 + constrainOpWithReadfirstlane(B, MI, 8); // M0 + return; + case Intrinsic::amdgcn_s_sleep_var: + assert(OpdMapper.getVRegs(1).empty()); + constrainOpWithReadfirstlane(B, MI, 1); + return; + case Intrinsic::amdgcn_s_barrier_signal_var: + case Intrinsic::amdgcn_s_barrier_join: + case Intrinsic::amdgcn_s_wakeup_barrier: + constrainOpWithReadfirstlane(B, MI, 1); return; + case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: + constrainOpWithReadfirstlane(B, MI, 2); + return; + case Intrinsic::amdgcn_s_barrier_init: + constrainOpWithReadfirstlane(B, MI, 1); + constrainOpWithReadfirstlane(B, MI, 2); + return; + case Intrinsic::amdgcn_s_get_barrier_state: { + constrainOpWithReadfirstlane(B, MI, 2); + return; + } default: { if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID)) { @@ -3113,7 +3128,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // and VGPR. For now it's too complicated to figure out the final opcode // to derive the register bank from the MCInstrDesc. if (RSrcIntrin->IsImage) { - applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); + applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg); return; } } @@ -3214,30 +3229,53 @@ void AMDGPURegisterBankInfo::applyMappingImpl( } ++End; - MachineIRBuilder B(*Start); - executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI); + B.setInsertPt(B.getMBB(), Start); + executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs); break; } case AMDGPU::G_LOAD: case AMDGPU::G_ZEXTLOAD: case AMDGPU::G_SEXTLOAD: { - if (applyMappingLoad(MI, OpdMapper, MRI)) + if (applyMappingLoad(B, OpdMapper, MI)) return; break; } case AMDGPU::G_DYN_STACKALLOC: - applyMappingDynStackAlloc(MI, OpdMapper, MRI); + applyMappingDynStackAlloc(B, OpdMapper, MI); + return; + case AMDGPU::G_STACKRESTORE: { + applyDefaultMapping(OpdMapper); + constrainOpWithReadfirstlane(B, MI, 0); return; + } case AMDGPU::G_SBFX: - applyMappingBFE(OpdMapper, /*Signed*/ true); + applyMappingBFE(B, OpdMapper, /*Signed*/ true); return; case AMDGPU::G_UBFX: - applyMappingBFE(OpdMapper, /*Signed*/ false); + applyMappingBFE(B, OpdMapper, /*Signed*/ false); return; case AMDGPU::G_AMDGPU_MAD_U64_U32: case AMDGPU::G_AMDGPU_MAD_I64_I32: - applyMappingMAD_64_32(OpdMapper); + applyMappingMAD_64_32(B, OpdMapper); return; + case AMDGPU::G_PREFETCH: { + if (!Subtarget.hasPrefetch()) { + MI.eraseFromParent(); + return; + } + unsigned PtrBank = + getRegBankID(MI.getOperand(0).getReg(), MRI, AMDGPU::SGPRRegBankID); + if (PtrBank == AMDGPU::VGPRRegBankID) { + MI.eraseFromParent(); + return; + } + // FIXME: There is currently no support for prefetch in global isel. + // There is no node equivalence and what's worse there is no MMO produced + // for a prefetch on global isel path. + // Prefetch does not affect execution so erase it for now. 
+ MI.eraseFromParent(); + return; + } default: break; } @@ -3542,7 +3580,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); if (MI.getOpcode() != AMDGPU::G_FREEZE && - cannotCopy(*DstBank, *SrcBank, Size)) + cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size))) return getInvalidInstructionMapping(); const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank); @@ -3717,40 +3755,68 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_UBFX: if (isSALUMapping(MI)) return getDefaultMappingSOP(MI); - [[fallthrough]]; - - case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU - case AMDGPU::G_SSUBSAT: - case AMDGPU::G_UADDSAT: - case AMDGPU::G_USUBSAT: + return getDefaultMappingVOP(MI); case AMDGPU::G_FADD: case AMDGPU::G_FSUB: - case AMDGPU::G_FPTOSI: - case AMDGPU::G_FPTOUI: case AMDGPU::G_FMUL: case AMDGPU::G_FMA: - case AMDGPU::G_FMAD: - case AMDGPU::G_FSQRT: case AMDGPU::G_FFLOOR: case AMDGPU::G_FCEIL: - case AMDGPU::G_FRINT: + case AMDGPU::G_INTRINSIC_ROUNDEVEN: + case AMDGPU::G_FMINNUM: + case AMDGPU::G_FMAXNUM: + case AMDGPU::G_FMINIMUM: + case AMDGPU::G_FMAXIMUM: + case AMDGPU::G_INTRINSIC_TRUNC: + case AMDGPU::G_STRICT_FADD: + case AMDGPU::G_STRICT_FSUB: + case AMDGPU::G_STRICT_FMUL: + case AMDGPU::G_STRICT_FMA: { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + unsigned Size = Ty.getSizeInBits(); + if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() && + (Size == 32 || Size == 16) && isSALUMapping(MI)) + return getDefaultMappingSOP(MI); + return getDefaultMappingVOP(MI); + } + case AMDGPU::G_FPTOSI: + case AMDGPU::G_FPTOUI: case AMDGPU::G_SITOFP: - case AMDGPU::G_UITOFP: + case AMDGPU::G_UITOFP: { + unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 && + isSALUMapping(MI)) + return getDefaultMappingSOP(MI); + return getDefaultMappingVOP(MI); + } case AMDGPU::G_FPTRUNC: - case AMDGPU::G_FPEXT: + case AMDGPU::G_FPEXT: { + unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 && + isSALUMapping(MI)) + return getDefaultMappingSOP(MI); + return getDefaultMappingVOP(MI); + } + case AMDGPU::G_FSQRT: case AMDGPU::G_FEXP2: - case AMDGPU::G_FLOG2: + case AMDGPU::G_FLOG2: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && + isSALUMapping(MI)) + return getDefaultMappingSOP(MI); + return getDefaultMappingVOP(MI); + } + case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU + case AMDGPU::G_SSUBSAT: + case AMDGPU::G_UADDSAT: + case AMDGPU::G_USUBSAT: + case AMDGPU::G_FMAD: case AMDGPU::G_FLDEXP: - case AMDGPU::G_FMINNUM: - case AMDGPU::G_FMAXNUM: case AMDGPU::G_FMINNUM_IEEE: case AMDGPU::G_FMAXNUM_IEEE: case AMDGPU::G_FCANONICALIZE: - case AMDGPU::G_INTRINSIC_TRUNC: - case AMDGPU::G_STRICT_FADD: - case AMDGPU::G_STRICT_FSUB: - case AMDGPU::G_STRICT_FMUL: - case AMDGPU::G_STRICT_FMA: case AMDGPU::G_STRICT_FLDEXP: case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 
case AMDGPU::G_FSHR: // TODO: Expand for scalar @@ -3845,9 +3911,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // This case is weird because we expect a physical register in the source, // but need to set a bank anyway. // - // We could select the result to SGPR or VGPR, but for the one current use - // it's more practical to always use VGPR. - OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + // TODO: We could select the result to SGPR or VGPR + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); break; } @@ -3971,14 +4036,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { SrcSize); break; } - case AMDGPU::G_FCMP: { - unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); - OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); - OpdsMapping[1] = nullptr; // Predicate Operand. - OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); - OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); - break; - } case AMDGPU::G_IS_FPCLASS: { Register SrcReg = MI.getOperand(1).getReg(); unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); @@ -3999,8 +4056,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); break; } - case AMDGPU::G_ICMP: { - auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); + case AMDGPU::G_ICMP: + case AMDGPU::G_FCMP: { unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); // See if the result register has already been constrained to vcc, which may @@ -4010,12 +4067,23 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI); + auto canUseSCCICMP = [&]() { + auto Pred = + static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); + return Size == 32 || + (Size == 64 && + (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && + Subtarget.hasScalarCompareEq64()); + }; + auto canUseSCCFCMP = [&]() { + return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16); + }; + + bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP; bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && Op2Bank == AMDGPU::SGPRRegBankID && Op3Bank == AMDGPU::SGPRRegBankID && - (Size == 32 || (Size == 64 && - (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && - Subtarget.hasScalarCompareEq64())); + (isICMP ? canUseSCCICMP() : canUseSCCFCMP()); DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; @@ -4025,6 +4093,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const unsigned ResultSize = 1; OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize); + OpdsMapping[1] = nullptr; // Predicate Operand. 
OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size); OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size); break; @@ -4197,8 +4266,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0); break; } - case AMDGPU::G_INTRINSIC: { - switch (MI.getIntrinsicID()) { + case AMDGPU::G_INTRINSIC: + case AMDGPU::G_INTRINSIC_CONVERGENT: { + switch (cast<GIntrinsic>(MI).getIntrinsicID()) { default: return getInvalidInstructionMapping(); case Intrinsic::amdgcn_div_fmas: @@ -4207,12 +4277,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_sin: case Intrinsic::amdgcn_cos: case Intrinsic::amdgcn_log_clamp: - case Intrinsic::amdgcn_log: - case Intrinsic::amdgcn_exp2: - case Intrinsic::amdgcn_rcp: case Intrinsic::amdgcn_rcp_legacy: - case Intrinsic::amdgcn_sqrt: - case Intrinsic::amdgcn_rsq: case Intrinsic::amdgcn_rsq_legacy: case Intrinsic::amdgcn_rsq_clamp: case Intrinsic::amdgcn_fmul_legacy: @@ -4220,7 +4285,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_frexp_mant: case Intrinsic::amdgcn_frexp_exp: case Intrinsic::amdgcn_fract: - case Intrinsic::amdgcn_cvt_pkrtz: case Intrinsic::amdgcn_cvt_pknorm_i16: case Intrinsic::amdgcn_cvt_pknorm_u16: case Intrinsic::amdgcn_cvt_pk_i16: @@ -4263,11 +4327,24 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_sudot8: case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: + case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied: + case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied: case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: return getDefaultMappingVOP(MI); + case Intrinsic::amdgcn_log: + case Intrinsic::amdgcn_exp2: + case Intrinsic::amdgcn_rcp: + case Intrinsic::amdgcn_rsq: + case Intrinsic::amdgcn_sqrt: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) && + isSALUMapping(MI)) + return getDefaultMappingSOP(MI); + return getDefaultMappingVOP(MI); + } case Intrinsic::amdgcn_sbfe: case Intrinsic::amdgcn_ubfe: if (isSALUMapping(MI)) @@ -4285,8 +4362,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_wqm: case Intrinsic::amdgcn_softwqm: case Intrinsic::amdgcn_set_inactive: + case Intrinsic::amdgcn_set_inactive_chain_arg: case Intrinsic::amdgcn_permlane64: return getDefaultMappingAllVGPR(MI); + case Intrinsic::amdgcn_cvt_pkrtz: + if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI)) + return getDefaultMappingSOP(MI); + return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_kernarg_segment_ptr: case Intrinsic::amdgcn_s_getpc: case Intrinsic::amdgcn_groupstaticsize: @@ -4387,6 +4469,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_permlane16_var: + case Intrinsic::amdgcn_permlanex16_var: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[4] = 
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + break; + } case Intrinsic::amdgcn_mfma_f32_4x4x1f32: case Intrinsic::amdgcn_mfma_f32_4x4x4f16: case Intrinsic::amdgcn_mfma_i32_4x4x4i8: @@ -4514,6 +4605,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize); break; } + case Intrinsic::amdgcn_s_quadmask: + case Intrinsic::amdgcn_s_wqm: { + Register MaskReg = MI.getOperand(2).getReg(); + unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits(); + unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize); + OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize); + break; + } case Intrinsic::amdgcn_wave_reduce_umin: case Intrinsic::amdgcn_wave_reduce_umax: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); @@ -4524,6 +4624,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize); break; } + case Intrinsic::amdgcn_s_bitreplicate: + Register MaskReg = MI.getOperand(2).getReg(); + unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64); + OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32); } break; } @@ -4531,7 +4636,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { - auto IntrID = MI.getIntrinsicID(); + auto IntrID = AMDGPU::getIntrinsicID(MI); const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID); assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); // Non-images can have complications from operands that allow both SGPR @@ -4559,8 +4664,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } break; } - case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { - auto IntrID = MI.getIntrinsicID(); + case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: + case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: { + auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); switch (IntrID) { case Intrinsic::amdgcn_s_getreg: case Intrinsic::amdgcn_s_memtime: @@ -4575,9 +4681,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_global_atomic_fmin_num: + case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fmin_num: + case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: return getDefaultMappingAllVGPR(MI); @@ -4632,6 +4742,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_s_ttracedata: { + // This must be an SGPR, but accept a VGPR. 
+ unsigned Bank = + getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID); + OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); + break; + } case Intrinsic::amdgcn_end_cf: { unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); @@ -4779,7 +4896,37 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1 break; } - + case Intrinsic::amdgcn_s_sleep_var: + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + break; + case Intrinsic::amdgcn_s_barrier_signal_var: + case Intrinsic::amdgcn_s_barrier_join: + case Intrinsic::amdgcn_s_wakeup_barrier: + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + break; + case Intrinsic::amdgcn_s_barrier_init: + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; + case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: { + const unsigned ResultSize = 1; + OpdsMapping[0] = + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_s_barrier_signal_isfirst: + case Intrinsic::amdgcn_s_barrier_leave: { + const unsigned ResultSize = 1; + OpdsMapping[0] = + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize); + break; + } + case Intrinsic::amdgcn_s_get_barrier_state: { + OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; + } default: return getInvalidInstructionMapping(); } @@ -4887,6 +5034,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_FPTRUNC_ROUND_UPWARD: case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD: return getDefaultMappingVOP(MI); + case AMDGPU::G_PREFETCH: + OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + break; } return getInstructionMapping(/*ID*/1, /*Cost*/1, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index 78214d7a1058..b5d16e70ab23 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -53,43 +53,36 @@ public: MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const; - bool executeInWaterfallLoop( - MachineIRBuilder &B, - iterator_range<MachineBasicBlock::iterator> Range, - SmallSet<Register, 4> &SGPROperandRegs, - MachineRegisterInfo &MRI) const; + bool executeInWaterfallLoop(MachineIRBuilder &B, + iterator_range<MachineBasicBlock::iterator> Range, + SmallSet<Register, 4> &SGPROperandRegs) const; Register buildReadFirstLane(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Src) const; - bool executeInWaterfallLoop(MachineIRBuilder &B, - MachineInstr &MI, - MachineRegisterInfo &MRI, - ArrayRef<unsigned> OpIndices) const; - bool executeInWaterfallLoop(MachineInstr &MI, - MachineRegisterInfo &MRI, + bool executeInWaterfallLoop(MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const; - void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI, + void constrainOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const; - bool 
applyMappingDynStackAlloc(MachineInstr &MI, + bool applyMappingDynStackAlloc(MachineIRBuilder &B, const OperandsMapper &OpdMapper, - MachineRegisterInfo &MRI) const; - bool applyMappingLoad(MachineInstr &MI, - const OperandsMapper &OpdMapper, - MachineRegisterInfo &MRI) const; - bool - applyMappingImage(MachineInstr &MI, - const OperandsMapper &OpdMapper, - MachineRegisterInfo &MRI, int RSrcIdx) const; + MachineInstr &MI) const; + bool applyMappingLoad(MachineIRBuilder &B, const OperandsMapper &OpdMapper, + MachineInstr &MI) const; + bool applyMappingImage(MachineIRBuilder &B, MachineInstr &MI, + const OperandsMapper &OpdMapper, int RSrcIdx) const; unsigned setBufferOffsets(MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg, Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const; - bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const; + bool applyMappingSBufferLoad(MachineIRBuilder &B, + const OperandsMapper &OpdMapper) const; - bool applyMappingBFE(const OperandsMapper &OpdMapper, bool Signed) const; + bool applyMappingBFE(MachineIRBuilder &B, const OperandsMapper &OpdMapper, + bool Signed) const; - bool applyMappingMAD_64_32(const OperandsMapper &OpdMapper) const; + bool applyMappingMAD_64_32(MachineIRBuilder &B, + const OperandsMapper &OpdMapper) const; Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg) const; @@ -98,7 +91,8 @@ public: splitBufferOffsets(MachineIRBuilder &B, Register Offset) const; /// See RegisterBankInfo::applyMapping. - void applyMappingImpl(const OperandsMapper &OpdMapper) const override; + void applyMappingImpl(MachineIRBuilder &Builder, + const OperandsMapper &OpdMapper) const override; const ValueMapping *getValueMappingForPtr(const MachineRegisterInfo &MRI, Register Ptr) const; @@ -171,7 +165,7 @@ public: bool isDivergentRegBank(const RegisterBank *RB) const override; unsigned copyCost(const RegisterBank &A, const RegisterBank &B, - unsigned Size) const override; + TypeSize Size) const override; unsigned getBreakDownCost(const ValueMapping &ValMapping, const RegisterBank *CurBank = nullptr) const override; @@ -186,12 +180,9 @@ public: getInstrMapping(const MachineInstr &MI) const override; private: - - bool foldExtractEltToCmpSelect(MachineInstr &MI, - MachineRegisterInfo &MRI, + bool foldExtractEltToCmpSelect(MachineIRBuilder &B, MachineInstr &MI, const OperandsMapper &OpdMapper) const; - bool foldInsertEltToCmpSelect(MachineInstr &MI, - MachineRegisterInfo &MRI, + bool foldInsertEltToCmpSelect(MachineIRBuilder &B, MachineInstr &MI, const OperandsMapper &OpdMapper) const; }; } // End llvm namespace. 
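The recurring change in the two AMDGPURegisterBankInfo files above is a plumbing refactor: applyMappingImpl and its helpers (applyMappingLoad, applyMappingDynStackAlloc, applyMappingImage, applyMappingSBufferLoad, applyMappingBFE, applyMappingMAD_64_32, constrainOpWithReadfirstlane, executeInWaterfallLoop, foldExtractEltToCmpSelect, foldInsertEltToCmpSelect) now receive a single MachineIRBuilder &B from the caller instead of constructing a local MachineIRBuilder at each site, and ApplyRegBankMapping is built from that shared builder. MachineRegisterInfo and the MachineFunction are recovered through B.getMRI() and B.getMF(), and the insertion point plus debug location are set once at the top of applyMappingImpl via B.setInstrAndDebugLoc(MI). The sketch below shows only the shape of that refactor with simplified, hypothetical stand-in types; it is not the real LLVM API.

// Minimal sketch of the builder-threading pattern. Builder and Observer are
// hypothetical stand-ins for MachineIRBuilder and ApplyRegBankMapping.
#include <iostream>
#include <string>

struct RegInfo  { };                       // stand-in for MachineRegisterInfo
struct Function { };                       // stand-in for MachineFunction

struct Builder {                           // stand-in for MachineIRBuilder
  RegInfo MRI;
  Function MF;
  RegInfo  &getMRI() { return MRI; }
  Function &getMF()  { return MF; }
  void setInsertPt(const std::string &Where) {
    std::cout << "insert point: " << Where << '\n';
  }
};

struct Observer {                          // stand-in for ApplyRegBankMapping
  explicit Observer(Builder &B) : B(B) {}  // constructed from the shared builder
  Builder &B;
};

// Old shape (removed by the patch): every helper made its own builder and
// took MachineRegisterInfo explicitly.
static void applyMappingHelperOld(RegInfo &MRI) {
  Builder LocalB;                          // fresh builder per call site
  Observer O(LocalB);
  LocalB.setInsertPt("recomputed in each helper");
  (void)MRI; (void)O;
}

// New shape (what the patch introduces): one builder threaded through,
// with MRI and MF recovered from it rather than passed separately.
static void applyMappingHelperNew(Builder &B) {
  Observer O(B);                           // reuse the caller's builder
  RegInfo  &MRI = B.getMRI();
  Function &MF  = B.getMF();
  (void)O; (void)MRI; (void)MF;
}

int main() {
  Builder B;
  B.setInsertPt("set once at the top of applyMappingImpl");
  applyMappingHelperNew(B);
  RegInfo MRI;
  applyMappingHelperOld(MRI);              // kept only for contrast
  return 0;
}

Threading one builder keeps the insertion point and the change observer consistent across nested helpers, which is why the patch calls B.setInstrAndDebugLoc(MI) once in applyMappingImpl and only adjusts the point locally (for example with B.setInsertPt) where a helper genuinely needs a different one.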
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp index 580352fb8cf4..552380d54dfd 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp @@ -89,15 +89,23 @@ const SubtargetSubTypeKV *getGPUInfo(const GCNSubtarget &ST, return nullptr; } -constexpr unsigned FeaturesToCheck[] = { - AMDGPU::FeatureGFX11Insts, AMDGPU::FeatureGFX10Insts, - AMDGPU::FeatureGFX9Insts, AMDGPU::FeatureGFX8Insts, - AMDGPU::FeatureDPP, AMDGPU::Feature16BitInsts, - AMDGPU::FeatureDot1Insts, AMDGPU::FeatureDot2Insts, - AMDGPU::FeatureDot3Insts, AMDGPU::FeatureDot4Insts, - AMDGPU::FeatureDot5Insts, AMDGPU::FeatureDot6Insts, - AMDGPU::FeatureDot7Insts, AMDGPU::FeatureDot8Insts, -}; +constexpr unsigned FeaturesToCheck[] = {AMDGPU::FeatureGFX11Insts, + AMDGPU::FeatureGFX10Insts, + AMDGPU::FeatureGFX9Insts, + AMDGPU::FeatureGFX8Insts, + AMDGPU::FeatureDPP, + AMDGPU::Feature16BitInsts, + AMDGPU::FeatureDot1Insts, + AMDGPU::FeatureDot2Insts, + AMDGPU::FeatureDot3Insts, + AMDGPU::FeatureDot4Insts, + AMDGPU::FeatureDot5Insts, + AMDGPU::FeatureDot6Insts, + AMDGPU::FeatureDot7Insts, + AMDGPU::FeatureDot8Insts, + AMDGPU::FeatureExtendedImageInsts, + AMDGPU::FeatureSMemRealTime, + AMDGPU::FeatureSMemTimeInst}; FeatureBitset expandImpliedFeatures(const FeatureBitset &Features) { FeatureBitset Result = Features; @@ -120,7 +128,6 @@ void reportFunctionRemoved(Function &F, unsigned Feature) { << getFeatureName(Feature) << " is not supported on the current target"; }); - return; } } // end anonymous namespace diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 804bf503e4f9..db5d2bbcf5bb 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -185,7 +185,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( // // If we only have implicit uses of flat_scr on flat instructions, it is not // really needed. - if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() && + if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() && (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) && !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) && !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index 2fde7afc0c14..5087f1a90245 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -43,7 +43,6 @@ #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/IR/AttributeMask.h" @@ -331,6 +330,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { NewFunc->removeRetAttrs(RetAttrs); // TODO: How to preserve metadata? + NewFunc->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat); + // Move the body of the function into the new rewritten function, and replace // this function with a stub. 
NewFunc->splice(NewFunc->begin(), &F); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp index 9c07851243c9..459400e3359c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp @@ -69,11 +69,11 @@ using namespace llvm; namespace { -class AMDGPURewriteUndefForPHI : public FunctionPass { +class AMDGPURewriteUndefForPHILegacy : public FunctionPass { public: static char ID; - AMDGPURewriteUndefForPHI() : FunctionPass(ID) { - initializeAMDGPURewriteUndefForPHIPass(*PassRegistry::getPassRegistry()); + AMDGPURewriteUndefForPHILegacy() : FunctionPass(ID) { + initializeAMDGPURewriteUndefForPHILegacyPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; StringRef getPassName() const override { @@ -91,13 +91,13 @@ public: }; } // end anonymous namespace -char AMDGPURewriteUndefForPHI::ID = 0; +char AMDGPURewriteUndefForPHILegacy::ID = 0; -INITIALIZE_PASS_BEGIN(AMDGPURewriteUndefForPHI, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(AMDGPURewriteUndefForPHILegacy, DEBUG_TYPE, "Rewrite undef for PHI", false, false) INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(AMDGPURewriteUndefForPHI, DEBUG_TYPE, +INITIALIZE_PASS_END(AMDGPURewriteUndefForPHILegacy, DEBUG_TYPE, "Rewrite undef for PHI", false, false) bool rewritePHIs(Function &F, UniformityInfo &UA, DominatorTree *DT) { @@ -170,13 +170,27 @@ bool rewritePHIs(Function &F, UniformityInfo &UA, DominatorTree *DT) { return Changed; } -bool AMDGPURewriteUndefForPHI::runOnFunction(Function &F) { +bool AMDGPURewriteUndefForPHILegacy::runOnFunction(Function &F) { UniformityInfo &UA = getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); return rewritePHIs(F, UA, DT); } -FunctionPass *llvm::createAMDGPURewriteUndefForPHIPass() { - return new AMDGPURewriteUndefForPHI(); +PreservedAnalyses +AMDGPURewriteUndefForPHIPass::run(Function &F, FunctionAnalysisManager &AM) { + UniformityInfo &UA = AM.getResult<UniformityInfoAnalysis>(F); + DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F); + bool Changed = rewritePHIs(F, UA, DT); + if (Changed) { + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; + } + + return PreservedAnalyses::all(); +} + +FunctionPass *llvm::createAMDGPURewriteUndefForPHILegacyPass() { + return new AMDGPURewriteUndefForPHILegacy(); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 317f3f21d240..beb670669581 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -241,9 +241,13 @@ def : SourceOfDivergence<int_amdgcn_global_atomic_csub>; def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>; +def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>; +def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>; +def : 
SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>; +def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>; def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_ds_fadd>; @@ -333,6 +337,8 @@ def : SourceOfDivergence<int_amdgcn_ds_ordered_add>; def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>; def : SourceOfDivergence<int_amdgcn_permlane16>; def : SourceOfDivergence<int_amdgcn_permlanex16>; +def : SourceOfDivergence<int_amdgcn_permlane16_var>; +def : SourceOfDivergence<int_amdgcn_permlanex16_var>; def : SourceOfDivergence<int_amdgcn_mov_dpp>; def : SourceOfDivergence<int_amdgcn_mov_dpp8>; def : SourceOfDivergence<int_amdgcn_update_dpp>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 9b50f4fa53ac..f19c57668564 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -17,6 +17,7 @@ #include "AMDGPULegalizerInfo.h" #include "AMDGPURegisterBankInfo.h" #include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" #include "R600Subtarget.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" @@ -166,6 +167,10 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {} +bool AMDGPUSubtarget::useRealTrue16Insts() const { + return hasTrue16BitInsts() && EnableRealTrue16Insts; +} + GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM) : // clang-format off @@ -196,14 +201,18 @@ unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { case AMDGPU::V_LSHLREV_B64_e64: case AMDGPU::V_LSHLREV_B64_gfx10: case AMDGPU::V_LSHLREV_B64_e64_gfx11: + case AMDGPU::V_LSHLREV_B64_e32_gfx12: + case AMDGPU::V_LSHLREV_B64_e64_gfx12: case AMDGPU::V_LSHL_B64_e64: case AMDGPU::V_LSHRREV_B64_e64: case AMDGPU::V_LSHRREV_B64_gfx10: case AMDGPU::V_LSHRREV_B64_e64_gfx11: + case AMDGPU::V_LSHRREV_B64_e64_gfx12: case AMDGPU::V_LSHR_B64_e64: case AMDGPU::V_ASHRREV_I64_e64: case AMDGPU::V_ASHRREV_I64_gfx10: case AMDGPU::V_ASHRREV_I64_e64_gfx11: + case AMDGPU::V_ASHRREV_I64_e64_gfx12: case AMDGPU::V_ASHR_I64_e64: return 1; } @@ -692,7 +701,7 @@ GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const { unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit()); + return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit()); } unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { @@ -771,24 +780,26 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { } static unsigned getMaxNumPreloadedSGPRs() { + using USI = GCNUserSGPRUsageInfo; // Max number of user SGPRs - unsigned MaxUserSGPRs = 4 + // private segment buffer - 2 + // Dispatch ptr - 2 + // queue ptr - 2 + // kernel segment ptr - 2 + // dispatch ID - 2 + // flat scratch init - 2; // Implicit buffer ptr + const unsigned MaxUserSGPRs = + USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) + + USI::getNumUserSGPRForField(USI::DispatchPtrID) + + USI::getNumUserSGPRForField(USI::QueuePtrID) + + USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) + + USI::getNumUserSGPRForField(USI::DispatchIdID) + + 
USI::getNumUserSGPRForField(USI::FlatScratchInitID) + + USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID); // Max number of system SGPRs - unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX - 1 + // WorkGroupIDY - 1 + // WorkGroupIDZ - 1 + // WorkGroupInfo - 1; // private segment wave byte offset + const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX + 1 + // WorkGroupIDY + 1 + // WorkGroupIDZ + 1 + // WorkGroupInfo + 1; // private segment wave byte offset // Max number of synthetic SGPRs - unsigned SyntheticSGPRs = 1; // LDSKernelId + const unsigned SyntheticSGPRs = 1; // LDSKernelId return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs; } @@ -994,6 +1005,9 @@ GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const { } unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { + if (getGeneration() >= AMDGPUSubtarget::GFX12) + return 0; // Not MIMG encoding. + if (NSAThreshold.getNumOccurrences() > 0) return std::max(NSAThreshold.getValue(), 2u); @@ -1018,3 +1032,79 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct else return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F)); } + +GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, + const GCNSubtarget &ST) + : ST(ST) { + const CallingConv::ID CC = F.getCallingConv(); + const bool IsKernel = + CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; + // FIXME: Should have analysis or something rather than attribute to detect + // calls. + const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); + // FIXME: This attribute is a hack, we just need an analysis on the function + // to look for allocas. + const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); + + if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)) + KernargSegmentPtr = true; + + bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); + if (IsAmdHsaOrMesa && !ST.enableFlatScratch()) + PrivateSegmentBuffer = true; + else if (ST.isMesaGfxShader(F)) + ImplicitBufferPtr = true; + + if (!AMDGPU::isGraphics(CC)) { + if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) + DispatchPtr = true; + + // FIXME: Can this always be disabled with < COv5? + if (!F.hasFnAttribute("amdgpu-no-queue-ptr")) + QueuePtr = true; + + if (!F.hasFnAttribute("amdgpu-no-dispatch-id")) + DispatchID = true; + } + + // TODO: This could be refined a lot. The attribute is a poor way of + // detecting calls or stack objects that may require it before argument + // lowering. 
+ if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) && + (IsAmdHsaOrMesa || ST.enableFlatScratch()) && + (HasCalls || HasStackObjects || ST.enableFlatScratch()) && + !ST.flatScratchIsArchitected()) { + FlatScratchInit = true; + } + + if (hasImplicitBufferPtr()) + NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID); + + if (hasPrivateSegmentBuffer()) + NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID); + + if (hasDispatchPtr()) + NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID); + + if (hasQueuePtr()) + NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID); + + if (hasKernargSegmentPtr()) + NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID); + + if (hasDispatchID()) + NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID); + + if (hasFlatScratchInit()) + NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID); +} + +void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) { + assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST)); + NumKernargPreloadSGPRs += NumSGPRs; + NumUsedUserSGPRs += NumSGPRs; +} + +unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() { + return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs; +} diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 10ce00fe68ca..b72697973be7 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -39,7 +39,8 @@ public: VOLCANIC_ISLANDS = 7, GFX9 = 8, GFX10 = 9, - GFX11 = 10 + GFX11 = 10, + GFX12 = 11, }; private: @@ -49,6 +50,7 @@ protected: bool GCN3Encoding = false; bool Has16BitInsts = false; bool HasTrue16BitInsts = false; + bool EnableRealTrue16Insts = false; bool HasMadMixInsts = false; bool HasMadMacF32Insts = false; bool HasDsSrc2Insts = false; @@ -153,8 +155,17 @@ public: return Has16BitInsts; } + /// Return true if the subtarget supports True16 instructions. bool hasTrue16BitInsts() const { return HasTrue16BitInsts; } + /// Return true if real (non-fake) variants of True16 instructions using + /// 16-bit registers should be code-generated. Fake True16 instructions are + /// identical to non-fake ones except that they take 32-bit registers as + /// operands and always use their low halves. + // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully + // supported and the support for fake True16 instructions is removed. 
+ bool useRealTrue16Insts() const; + bool hasMadMixInsts() const { return HasMadMixInsts; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 87ef2333e2ea..e8c04ecf39ba 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -50,6 +50,7 @@ #include "llvm/InitializePasses.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/Transforms/HipStdPar/HipStdPar.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/GlobalDCE.h" @@ -173,12 +174,6 @@ static VGPRRegisterRegAlloc fastRegAllocVGPR( "fast", "fast register allocator", createFastVGPRRegisterAllocator); } -static cl::opt<bool> EnableSROA( - "amdgpu-sroa", - cl::desc("Run SROA after promote alloca pass"), - cl::ReallyHidden, - cl::init(true)); - static cl::opt<bool> EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), @@ -291,6 +286,12 @@ static cl::opt<bool> EnableSIModeRegisterPass( cl::init(true), cl::Hidden); +// Enable GFX11.5+ s_singleuse_vdst insertion +static cl::opt<bool> + EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst", + cl::desc("Enable s_singleuse_vdst insertion"), + cl::init(false), cl::Hidden); + // Enable GFX11+ s_delay_alu insertion static cl::opt<bool> EnableInsertDelayAlu("amdgpu-enable-delay-alu", @@ -339,6 +340,11 @@ static cl::opt<bool> EnablePromoteKernelArguments( cl::desc("Enable promotion of flat kernel pointer arguments to global"), cl::Hidden, cl::init(true)); +static cl::opt<bool> EnableImageIntrinsicOptimizer( + "amdgpu-enable-image-intrinsic-optimizer", + cl::desc("Enable image intrinsic optimizer pass"), cl::init(true), + cl::Hidden); + static cl::opt<bool> EnableMaxIlpSchedStrategy( "amdgpu-enable-max-ilp-scheduling-strategy", cl::desc("Enable scheduling strategy to maximize ILP for a single wave."), @@ -346,9 +352,14 @@ static cl::opt<bool> EnableMaxIlpSchedStrategy( static cl::opt<bool> EnableRewritePartialRegUses( "amdgpu-enable-rewrite-partial-reg-uses", - cl::desc("Enable rewrite partial reg uses pass"), cl::init(false), + cl::desc("Enable rewrite partial reg uses pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> EnableHipStdPar( + "amdgpu-enable-hipstdpar", + cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false), + cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(getTheR600Target()); @@ -364,6 +375,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUDAGToDAGISelPass(*PR); initializeGCNDPPCombinePass(*PR); initializeSILowerI1CopiesPass(*PR); + initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR); + initializeSILowerWWMCopiesPass(*PR); initializeSILowerSGPRSpillsPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); @@ -375,7 +388,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSILoadStoreOptimizerPass(*PR); initializeAMDGPUCtorDtorLoweringLegacyPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); - initializeAMDGPUAttributorPass(*PR); + initializeAMDGPUAttributorLegacyPass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); 
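// --- Illustrative sketch (standalone C++, not from the LLVM sources) ---
// The command-line toggles added above (amdgpu-enable-single-use-vdst,
// amdgpu-enable-hipstdpar, amdgpu-enable-image-intrinsic-optimizer, ...) are
// typically consulted through the isPassEnabled() helper that appears later
// in this patch: an explicit command-line occurrence always wins, otherwise
// the pass only runs at or above a minimum optimization level and falls back
// to the option's default. The code below restates that policy with plain
// types; the names are illustrative, not LLVM's.
#include <iostream>
#include <optional>

enum class OptLevel { None = 0, Less = 1, Default = 2, Aggressive = 3 };

struct Toggle {
  bool DefaultValue;            // what cl::init(...) would set
  std::optional<bool> Explicit; // set iff the flag appeared on the command line
};

bool isPassEnabled(const Toggle &Opt, OptLevel Current,
                   OptLevel Minimum = OptLevel::Default) {
  if (Opt.Explicit)        // explicit -amdgpu-enable-... beats everything
    return *Opt.Explicit;
  if (Current < Minimum)   // not worth running below the minimum opt level
    return false;
  return Opt.DefaultValue;
}

int main() {
  Toggle SingleUseVDST{/*DefaultValue=*/false, std::nullopt};
  Toggle DelayAlu{/*DefaultValue=*/true, std::nullopt};

  std::cout << isPassEnabled(SingleUseVDST, OptLevel::Aggressive) << '\n'; // 0
  std::cout << isPassEnabled(DelayAlu, OptLevel::None, OptLevel::Less) << '\n'; // 0
  std::cout << isPassEnabled(DelayAlu, OptLevel::Less, OptLevel::Less) << '\n'; // 1
  return 0;
}
// --- end of sketch ---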
initializeAMDGPUArgumentUsageInfoPass(*PR); @@ -393,11 +406,12 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUCodeGenPreparePass(*PR); initializeAMDGPULateCodeGenPreparePass(*PR); initializeAMDGPURemoveIncompatibleFunctionsPass(*PR); - initializeAMDGPULowerModuleLDSPass(*PR); + initializeAMDGPULowerModuleLDSLegacyPass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); - initializeAMDGPURewriteUndefForPHIPass(*PR); + initializeAMDGPURewriteUndefForPHILegacyPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); + initializeAMDGPUInsertSingleUseVDSTPass(*PR); initializeAMDGPUInsertDelayAluPass(*PR); initializeSIInsertHardClausesPass(*PR); initializeSIInsertWaitcntsPass(*PR); @@ -415,14 +429,14 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUExternalAAWrapperPass(*PR); - initializeAMDGPUUseNativeCallsPass(*PR); - initializeAMDGPUSimplifyLibCallsPass(*PR); + initializeAMDGPUImageIntrinsicOptimizerPass(*PR); initializeAMDGPUPrintfRuntimeBindingPass(*PR); initializeAMDGPUResourceUsageAnalysisPass(*PR); initializeGCNNSAReassignPass(*PR); initializeGCNPreRAOptimizationsPass(*PR); initializeGCNPreRALongBranchRegPass(*PR); initializeGCNRewritePartialRegUsesPass(*PR); + initializeGCNRegPressurePrinterPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -441,7 +455,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createIGroupLPDAGMutation()); + DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; @@ -451,7 +465,7 @@ static ScheduleDAGInstrs * createGCNMaxILPMachineScheduler(MachineSchedContext *C) { ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C)); - DAG->addMutation(createIGroupLPDAGMutation()); + DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false)); return DAG; } @@ -525,9 +539,10 @@ static StringRef computeDataLayout(const Triple &TT) { // space 8) which cannot be non-trivilally accessed by LLVM memory operations // like getelementptr. return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" - "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:" + "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-" + "v32:32-v48:64-v96:" "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-" - "G1-ni:7:8"; + "G1-ni:7:8:9"; } LLVM_READNONE @@ -553,7 +568,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, TargetOptions Options, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, - CodeGenOpt::Level OptLevel) + CodeGenOptLevel OptLevel) : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), FS, Options, getEffectiveRelocModel(RM), getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), @@ -588,8 +603,8 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { /// Predicate for Internalize pass. 
static bool mustPreserveGV(const GlobalValue &GV) { if (const Function *F = dyn_cast<Function>(&GV)) - return F->isDeclaration() || F->getName().startswith("__asan_") || - F->getName().startswith("__sanitizer_") || + return F->isDeclaration() || F->getName().starts_with("__asan_") || + F->getName().starts_with("__sanitizer_") || AMDGPU::isEntryFunctionCC(F->getCallingConv()); GV.removeDeadConstantUsers(); @@ -602,8 +617,12 @@ void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerPipelineParsingCallback( - [](StringRef PassName, ModulePassManager &PM, - ArrayRef<PassBuilder::PipelineElement>) { + [this](StringRef PassName, ModulePassManager &PM, + ArrayRef<PassBuilder::PipelineElement>) { + if (PassName == "amdgpu-attributor") { + PM.addPass(AMDGPUAttributorPass(*this)); + return true; + } if (PassName == "amdgpu-unify-metadata") { PM.addPass(AMDGPUUnifyMetadataPass()); return true; @@ -617,7 +636,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { return true; } if (PassName == "amdgpu-lower-module-lds") { - PM.addPass(AMDGPULowerModuleLDSPass()); + PM.addPass(AMDGPULowerModuleLDSPass(*this)); return true; } if (PassName == "amdgpu-lower-ctor-dtor") { @@ -630,7 +649,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { [this](StringRef PassName, FunctionPassManager &PM, ArrayRef<PassBuilder::PipelineElement>) { if (PassName == "amdgpu-simplifylib") { - PM.addPass(AMDGPUSimplifyLibCallsPass(*this)); + PM.addPass(AMDGPUSimplifyLibCallsPass()); + return true; + } + if (PassName == "amdgpu-image-intrinsic-opt") { + PM.addPass(AMDGPUImageIntrinsicOptimizerPass(*this)); return true; } if (PassName == "amdgpu-usenative") { @@ -666,6 +689,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(AMDGPUCodeGenPreparePass(*this)); return true; } + if (PassName == "amdgpu-lower-kernel-arguments") { + PM.addPass(AMDGPULowerKernelArgumentsPass(*this)); + return true; + } + if (PassName == "amdgpu-rewrite-undef-for-phi") { + PM.addPass(AMDGPURewriteUndefForPHIPass()); + return true; + } return false; }); @@ -682,12 +713,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { }); PB.registerPipelineStartEPCallback( - [this](ModulePassManager &PM, OptimizationLevel Level) { + [](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; FPM.addPass(AMDGPUUseNativeCallsPass()); if (EnableLibCallSimplify && Level != OptimizationLevel::O0) - FPM.addPass(AMDGPUSimplifyLibCallsPass(*this)); + FPM.addPass(AMDGPUSimplifyLibCallsPass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + if (EnableHipStdPar) + PM.addPass(HipStdParAcceleratorCodeSelectionPass()); }); PB.registerPipelineEarlySimplificationEPCallback( @@ -826,7 +859,7 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, TargetOptions Options, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, - CodeGenOpt::Level OL, bool JIT) + CodeGenOptLevel OL, bool JIT) : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} const TargetSubtargetInfo * @@ -894,8 +927,8 @@ public: if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); - DAG->addMutation(createIGroupLPDAGMutation()); - if (isPassEnabled(EnableVOPD, CodeGenOpt::Less)) + 
DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true)); + if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) DAG->addMutation(createVOPDPairingMutation()); return DAG; } @@ -942,7 +975,7 @@ AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) } void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { - if (getOptLevel() == CodeGenOpt::Aggressive) + if (getOptLevel() == CodeGenOptLevel::Aggressive) addPass(createGVNPass()); else addPass(createEarlyCSEPass()); @@ -966,6 +999,10 @@ void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { void AMDGPUPassConfig::addIRPasses() { const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); + Triple::ArchType Arch = TM.getTargetTriple().getArch(); + if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn) + addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM)); + // There is no reason to run these. disablePass(&StackMapLivenessID); disablePass(&FuncletLayoutID); @@ -975,12 +1012,15 @@ void AMDGPUPassConfig::addIRPasses() { if (LowerCtorDtor) addPass(createAMDGPUCtorDtorLoweringLegacyPass()); + if (isPassEnabled(EnableImageIntrinsicOptimizer)) + addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); + // Function calls are not supported, so make sure we inline everything. addPass(createAMDGPUAlwaysInlinePass()); addPass(createAlwaysInlinerLegacyPass()); // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. - if (TM.getTargetTriple().getArch() == Triple::r600) + if (Arch == Triple::r600) addPass(createR600OpenCLImageTypeLoweringPass()); // Replace OpenCL enqueued block function pointers with global variables. @@ -988,24 +1028,29 @@ void AMDGPUPassConfig::addIRPasses() { // Runs before PromoteAlloca so the latter can account for function uses if (EnableLowerModuleLDS) { - addPass(createAMDGPULowerModuleLDSPass()); + addPass(createAMDGPULowerModuleLDSLegacyPass(&TM)); } // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run // after their introduction - if (TM.getOptLevel() > CodeGenOpt::None) - addPass(createAMDGPUAttributorPass()); + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(createAMDGPUAttributorLegacyPass()); - if (TM.getOptLevel() > CodeGenOpt::None) + if (TM.getOptLevel() > CodeGenOptLevel::None) addPass(createInferAddressSpacesPass()); + // Run atomic optimizer before Atomic Expand + if ((TM.getTargetTriple().getArch() == Triple::amdgcn) && + (TM.getOptLevel() >= CodeGenOptLevel::Less) && + (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) { + addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy)); + } + addPass(createAtomicExpandPass()); - if (TM.getOptLevel() > CodeGenOpt::None) { + if (TM.getOptLevel() > CodeGenOptLevel::None) { addPass(createAMDGPUPromoteAlloca()); - if (EnableSROA) - addPass(createSROAPass()); if (isPassEnabled(EnableScalarIRPasses)) addStraightLineScalarOptimizationPasses(); @@ -1025,7 +1070,7 @@ void AMDGPUPassConfig::addIRPasses() { // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may // have expanded. - if (TM.getOptLevel() > CodeGenOpt::Less) + if (TM.getOptLevel() > CodeGenOptLevel::Less) addPass(createLICMPass()); } @@ -1049,9 +1094,6 @@ void AMDGPUPassConfig::addIRPasses() { void AMDGPUPassConfig::addCodeGenPrepare() { if (TM->getTargetTriple().getArch() == Triple::amdgcn) { - if (RemoveIncompatibleFunctions) - addPass(createAMDGPURemoveIncompatibleFunctionsPass(TM)); - // FIXME: This pass adds 2 hacky attributes that can be replaced with an // analysis, and should be removed. 
addPass(createAMDGPUAnnotateKernelFeaturesPass()); @@ -1074,7 +1116,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() { } bool AMDGPUPassConfig::addPreISel() { - if (TM->getOptLevel() > CodeGenOpt::None) + if (TM->getOptLevel() > CodeGenOptLevel::None) addPass(createFlattenCFGPass()); return false; } @@ -1125,15 +1167,10 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); - if (TM->getOptLevel() > CodeGenOpt::None) + if (TM->getOptLevel() > CodeGenOptLevel::None) addPass(createAMDGPULateCodeGenPreparePass()); - if ((TM->getOptLevel() >= CodeGenOpt::Less) && - (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) { - addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy)); - } - - if (TM->getOptLevel() > CodeGenOpt::None) + if (TM->getOptLevel() > CodeGenOptLevel::None) addPass(createSinkingPass()); // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit @@ -1152,11 +1189,11 @@ bool GCNPassConfig::addPreISel() { // TODO: Move this right after structurizeCFG to avoid extra divergence // analysis. This depends on stopping SIAnnotateControlFlow from making // control flow modifications. - addPass(createAMDGPURewriteUndefForPHIPass()); + addPass(createAMDGPURewriteUndefForPHILegacyPass()); } addPass(createLCSSAPass()); - if (TM->getOptLevel() > CodeGenOpt::Less) + if (TM->getOptLevel() > CodeGenOptLevel::Less) addPass(&AMDGPUPerfHintAnalysisID); return false; @@ -1207,7 +1244,7 @@ bool GCNPassConfig::addIRTranslator() { } void GCNPassConfig::addPreLegalizeMachineIR() { - bool IsOptNone = getOptLevel() == CodeGenOpt::None; + bool IsOptNone = getOptLevel() == CodeGenOptLevel::None; addPass(createAMDGPUPreLegalizeCombiner(IsOptNone)); addPass(new Localizer()); } @@ -1218,8 +1255,9 @@ bool GCNPassConfig::addLegalizeMachineIR() { } void GCNPassConfig::addPreRegBankSelect() { - bool IsOptNone = getOptLevel() == CodeGenOpt::None; + bool IsOptNone = getOptLevel() == CodeGenOptLevel::None; addPass(createAMDGPUPostLegalizeCombiner(IsOptNone)); + addPass(createAMDGPUGlobalISelDivergenceLoweringPass()); } bool GCNPassConfig::addRegBankSelect() { @@ -1228,7 +1266,7 @@ bool GCNPassConfig::addRegBankSelect() { } void GCNPassConfig::addPreGlobalInstructionSelect() { - bool IsOptNone = getOptLevel() == CodeGenOpt::None; + bool IsOptNone = getOptLevel() == CodeGenOptLevel::None; addPass(createAMDGPURegBankCombiner(IsOptNone)); } @@ -1253,7 +1291,6 @@ void GCNPassConfig::addFastRegAlloc() { insertPass(&PHIEliminationID, &SILowerControlFlowID); insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID); - insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID); TargetPassConfig::addFastRegAlloc(); } @@ -1262,7 +1299,6 @@ void GCNPassConfig::addOptimizedRegAlloc() { // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation // instructions that cause scheduling barriers. insertPass(&MachineSchedulerID, &SIWholeQuadModeID); - insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID); if (OptExecMaskPreRA) insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); @@ -1275,7 +1311,7 @@ void GCNPassConfig::addOptimizedRegAlloc() { // This is not an essential optimization and it has a noticeable impact on // compilation time, so we only enable it from O2. 
- if (TM->getOptLevel() > CodeGenOpt::Less) + if (TM->getOptLevel() > CodeGenOptLevel::Less) insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); // FIXME: when an instruction has a Killed operand, and the instruction is @@ -1296,6 +1332,7 @@ void GCNPassConfig::addOptimizedRegAlloc() { } bool GCNPassConfig::addPreRewrite() { + addPass(&SILowerWWMCopiesID); if (EnableRegReassign) addPass(&GCNNSAReassignID); return true; @@ -1348,8 +1385,11 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() { // Equivalent of PEI for SGPRs. addPass(&SILowerSGPRSpillsID); + addPass(&SIPreAllocateWWMRegsID); addPass(createVGPRAllocPass(false)); + + addPass(&SILowerWWMCopiesID); return true; } @@ -1369,6 +1409,7 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() { // Equivalent of PEI for SGPRs. addPass(&SILowerSGPRSpillsID); + addPass(&SIPreAllocateWWMRegsID); addPass(createVGPRAllocPass(true)); @@ -1380,32 +1421,32 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() { void GCNPassConfig::addPostRegAlloc() { addPass(&SIFixVGPRCopiesID); - if (getOptLevel() > CodeGenOpt::None) + if (getOptLevel() > CodeGenOptLevel::None) addPass(&SIOptimizeExecMaskingID); TargetPassConfig::addPostRegAlloc(); } void GCNPassConfig::addPreSched2() { - if (TM->getOptLevel() > CodeGenOpt::None) + if (TM->getOptLevel() > CodeGenOptLevel::None) addPass(createSIShrinkInstructionsPass()); addPass(&SIPostRABundlerID); } void GCNPassConfig::addPreEmitPass() { - if (isPassEnabled(EnableVOPD, CodeGenOpt::Less)) + if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) addPass(&GCNCreateVOPDID); addPass(createSIMemoryLegalizerPass()); addPass(createSIInsertWaitcntsPass()); addPass(createSIModeRegisterPass()); - if (getOptLevel() > CodeGenOpt::None) + if (getOptLevel() > CodeGenOptLevel::None) addPass(&SIInsertHardClausesID); addPass(&SILateBranchLoweringPassID); - if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less)) + if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less)) addPass(createAMDGPUSetWavePriorityPass()); - if (getOptLevel() > CodeGenOpt::None) + if (getOptLevel() > CodeGenOptLevel::None) addPass(&SIPreEmitPeepholeID); // The hazard recognizer that runs as part of the post-ra scheduler does not // guarantee to be able handle all hazards correctly. This is because if there @@ -1417,7 +1458,10 @@ void GCNPassConfig::addPreEmitPass() { // cases. addPass(&PostRAHazardRecognizerID); - if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less)) + if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less)) + addPass(&AMDGPUInsertSingleUseVDSTID); + + if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) addPass(&AMDGPUInsertDelayAluID); addPass(&BranchRelaxationPassID); @@ -1458,13 +1502,13 @@ bool GCNTargetMachine::parseMachineFunctionInfo( static_cast<const yaml::SIMachineFunctionInfo &>(MFI_); MachineFunction &MF = PFS.MF; SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange)) return true; if (MFI->Occupancy == 0) { // Fixup the subtarget dependent default value. 
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize()); } @@ -1618,8 +1662,10 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->ArgInfo.WorkItemIDZ, 0, 0))) return true; - MFI->Mode.IEEE = YamlMFI.Mode.IEEE; - MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp; + if (ST.hasIEEEMode()) + MFI->Mode.IEEE = YamlMFI.Mode.IEEE; + if (ST.hasDX10ClampMode()) + MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp; // FIXME: Move proper support for denormal-fp-math into base MachineFunction MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 2426be405a65..9051a61e6557 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -41,7 +41,7 @@ public: AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, std::optional<Reloc::Model> RM, - std::optional<CodeModel::Model> CM, CodeGenOpt::Level OL); + std::optional<CodeModel::Model> CM, CodeGenOptLevel OL); ~AMDGPUTargetMachine() override; const TargetSubtargetInfo *getSubtargetImpl() const; @@ -79,7 +79,7 @@ public: GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, std::optional<Reloc::Model> RM, - std::optional<CodeModel::Model> CM, CodeGenOpt::Level OL, + std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool JIT); TargetPassConfig *createPassConfig(PassManagerBase &PM) override; @@ -137,7 +137,7 @@ public: /// be used given that a pass shall work at an optimization \p Level /// minimum. 
bool isPassEnabled(const cl::opt<bool> &Opt, - CodeGenOpt::Level Level = CodeGenOpt::Default) const { + CodeGenOptLevel Level = CodeGenOptLevel::Default) const { if (Opt.getNumOccurrences()) return Opt; if (TM->getOptLevel() < Level) diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp index f854c8c16e5a..584e41bfd546 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -30,7 +30,7 @@ MCSection *AMDGPUTargetObjectFile::getExplicitSectionGlobal( const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const { // Set metadata access for the explicit section StringRef SectionName = GO->getSection(); - if (SectionName.startswith(".AMDGPU.comment.")) + if (SectionName.starts_with(".AMDGPU.comment.")) SK = SectionKind::getMetadata(); return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, SK, TM); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 81d083c1c88a..f1da1a61bf4d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -296,7 +296,7 @@ GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))), TLI(ST->getTargetLowering()), CommonTTI(TM, F), IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) { - SIModeRegisterDefaults Mode(F); + SIModeRegisterDefaults Mode(F, *ST); HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign(); HasFP64FP16Denormals = Mode.FP64FP16Denormals != DenormalMode::getPreserveSign(); @@ -368,7 +368,8 @@ unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT || AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER || - AddrSpace == AMDGPUAS::BUFFER_RESOURCE) { + AddrSpace == AMDGPUAS::BUFFER_RESOURCE || + AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) { return 512; } @@ -650,6 +651,15 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( return LT.first * Cost * NElts; } + if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) || + TLI->getTargetMachine().Options.UnsafeFPMath)) { + // Fast unsafe fdiv lowering: + // f32 rcp + // f32 fmul + int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost(); + return LT.first * Cost * NElts; + } + if (SLT == MVT::f32 || SLT == MVT::f16) { // 4 more v_cvt_* insts without f16 insts support int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() + @@ -883,7 +893,7 @@ bool GCNTTIImpl::isReadRegisterSourceOfDivergence( return true; // Special case scalar registers that start with 'v'. - if (RegName.startswith("vcc") || RegName.empty()) + if (RegName.starts_with("vcc") || RegName.empty()) return false; // VGPR or AGPR is divergent. 
There aren't any specially named vector @@ -1017,6 +1027,8 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax_num: + case Intrinsic::amdgcn_flat_atomic_fmin_num: OpIndexes.push_back(0); return true; default: @@ -1091,7 +1103,9 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, } case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmax: - case Intrinsic::amdgcn_flat_atomic_fmin: { + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax_num: + case Intrinsic::amdgcn_flat_atomic_fmin_num: { Type *DestTy = II->getType(); Type *SrcTy = NewV->getType(); unsigned NewAS = SrcTy->getPointerAddressSpace(); @@ -1114,7 +1128,8 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef<const Value *> Args) { - Kind = improveShuffleKindFromMask(Kind, Mask); + Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp); + if (ST->hasVOP3PInsts()) { if (cast<FixedVectorType>(VT)->getNumElements() == 2 && DL.getTypeSizeInBits(VT->getElementType()) == 16) { @@ -1153,8 +1168,8 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller, // FIXME: dx10_clamp can just take the caller setting, but there seems to be // no way to support merge for backend defined attributes. - SIModeRegisterDefaults CallerMode(*Caller); - SIModeRegisterDefaults CalleeMode(*Callee); + SIModeRegisterDefaults CallerMode(*Caller, *CallerST); + SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST); if (!CallerMode.isInlineCompatible(CalleeMode)) return false; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 9ad841c3c8a5..9bc3ba161c9e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -46,6 +46,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -114,8 +115,6 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const { // We preserve the non-critical-edgeness property AU.addPreservedID(BreakCriticalEdgesID); - // This is a cluster of orthogonal Transforms - AU.addPreservedID(LowerSwitchID); FunctionPass::getAnalysisUsage(AU); AU.addRequired<TargetTransformInfoWrapperPass>(); @@ -192,6 +191,8 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT, const UniformityInfo &UA) { + assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); + if (PDT.root_size() == 0 || (PDT.root_size() == 1 && !isa<BranchInst>(PDT.getRoot()->getTerminator()))) diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index b9443559132f..3b69a37728ea 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -75,6 +75,7 @@ 
public: bool Abs = false; bool Neg = false; bool Sext = false; + bool Lit = false; bool hasFPModifiers() const { return Abs || Neg; } bool hasIntModifiers() const { return Sext; } @@ -273,6 +274,10 @@ public: return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i16); } + bool isRegOrImmWithIntT16InputMods() const { + return isRegOrImmWithInputMods(AMDGPU::VS_16RegClassID, MVT::i16); + } + bool isRegOrImmWithInt32InputMods() const { return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i32); } @@ -293,6 +298,10 @@ public: return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f16); } + bool isRegOrImmWithFPT16InputMods() const { + return isRegOrImmWithInputMods(AMDGPU::VS_16RegClassID, MVT::f16); + } + bool isRegOrImmWithFP32InputMods() const { return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f32); } @@ -347,29 +356,24 @@ public: return isImm() && Imm.Type == ImmT; } + template <ImmTy Ty> bool isImmTy() const { return isImmTy(Ty); } + bool isImmLiteral() const { return isImmTy(ImmTyNone); } bool isImmModifier() const { return isImm() && Imm.Type != ImmTyNone; } - bool isClampSI() const { return isImmTy(ImmTyClampSI); } bool isOModSI() const { return isImmTy(ImmTyOModSI); } bool isDMask() const { return isImmTy(ImmTyDMask); } bool isDim() const { return isImmTy(ImmTyDim); } - bool isUNorm() const { return isImmTy(ImmTyUNorm); } - bool isDA() const { return isImmTy(ImmTyDA); } bool isR128A16() const { return isImmTy(ImmTyR128A16); } - bool isA16() const { return isImmTy(ImmTyA16); } - bool isLWE() const { return isImmTy(ImmTyLWE); } bool isOff() const { return isImmTy(ImmTyOff); } bool isExpTgt() const { return isImmTy(ImmTyExpTgt); } - bool isExpVM() const { return isImmTy(ImmTyExpVM); } - bool isExpCompr() const { return isImmTy(ImmTyExpCompr); } bool isOffen() const { return isImmTy(ImmTyOffen); } bool isIdxen() const { return isImmTy(ImmTyIdxen); } bool isAddr64() const { return isImmTy(ImmTyAddr64); } - bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); } + bool isOffset() const { return isImmTy(ImmTyOffset); } bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<8>(getImm()); } bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); } bool isSMEMOffsetMod() const { return isImmTy(ImmTySMEMOffsetMod); } @@ -378,7 +382,6 @@ public: bool isLDS() const { return isImmTy(ImmTyLDS); } bool isCPol() const { return isImmTy(ImmTyCPol); } bool isTFE() const { return isImmTy(ImmTyTFE); } - bool isD16() const { return isImmTy(ImmTyD16); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); } bool isDppBankMask() const { return isImmTy(ImmTyDppBankMask); } bool isDppRowMask() const { return isImmTy(ImmTyDppRowMask); } @@ -395,7 +398,6 @@ public: bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); } bool isNegLo() const { return isImmTy(ImmTyNegLo); } bool isNegHi() const { return isImmTy(ImmTyNegHi); } - bool isHigh() const { return isImmTy(ImmTyHigh); } bool isRegOrImm() const { return isReg() || isImm(); @@ -512,7 +514,15 @@ public: return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64); } + bool isVCSrcTB16() const { + return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::i16); + } + bool isVCSrcTB16_Lo128() const { + return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::i16); + } + + bool isVCSrcFake16B16_Lo128() const { return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT::i16); } @@ -532,7 +542,15 @@ public: return 
isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64); } + bool isVCSrcTF16() const { + return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::f16); + } + bool isVCSrcTF16_Lo128() const { + return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::f16); + } + + bool isVCSrcFake16F16_Lo128() const { return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT::f16); } @@ -552,10 +570,16 @@ public: return isVCSrcF64() || isLiteralImm(MVT::i64); } + bool isVSrcTB16() const { return isVCSrcTB16() || isLiteralImm(MVT::i16); } + bool isVSrcTB16_Lo128() const { return isVCSrcTB16_Lo128() || isLiteralImm(MVT::i16); } + bool isVSrcFake16B16_Lo128() const { + return isVCSrcFake16B16_Lo128() || isLiteralImm(MVT::i16); + } + bool isVSrcB16() const { return isVCSrcB16() || isLiteralImm(MVT::i16); } @@ -588,10 +612,16 @@ public: return isVCSrcF64() || isLiteralImm(MVT::f64); } + bool isVSrcTF16() const { return isVCSrcTF16() || isLiteralImm(MVT::f16); } + bool isVSrcTF16_Lo128() const { return isVCSrcTF16_Lo128() || isLiteralImm(MVT::f16); } + bool isVSrcFake16F16_Lo128() const { + return isVCSrcFake16F16_Lo128() || isLiteralImm(MVT::f16); + } + bool isVSrcF16() const { return isVCSrcF16() || isLiteralImm(MVT::f16); } @@ -863,6 +893,7 @@ public: bool isSDelayALU() const; bool isHwreg() const; bool isSendMsg() const; + bool isSplitBarrier() const; bool isSwizzle() const; bool isSMRDOffset8() const; bool isSMEMOffset() const; @@ -879,6 +910,10 @@ public: bool isWaitVDST() const; bool isWaitEXP() const; + auto getPredicate(std::function<bool(const AMDGPUOperand &Op)> P) const { + return std::bind(P, *this); + } + StringRef getToken() const { assert(isToken()); return StringRef(Tok.Data, Tok.Length); @@ -1344,7 +1379,7 @@ public: // AsmParser::parseDirectiveSet() cannot be specialized for specific target. 
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); MCContext &Ctx = getContext(); - if (ISA.Major >= 6 && isHsaAbiVersion3AndAbove(&getSTI())) { + if (ISA.Major >= 6 && isHsaAbi(getSTI())) { MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number")); Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); @@ -1361,7 +1396,7 @@ public: Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); } - if (ISA.Major >= 6 && isHsaAbiVersion3AndAbove(&getSTI())) { + if (ISA.Major >= 6 && isHsaAbi(getSTI())) { initializeGprCountSymbol(IS_VGPR); initializeGprCountSymbol(IS_SGPR); } else @@ -1381,6 +1416,8 @@ public: bool hasG16() const { return AMDGPU::hasG16(getSTI()); } + bool hasGDS() const { return AMDGPU::hasGDS(getSTI()); } + bool isSI() const { return AMDGPU::isSI(getSTI()); } @@ -1424,6 +1461,10 @@ public: return AMDGPU::isGFX11Plus(getSTI()); } + bool isGFX12() const { return AMDGPU::isGFX12(getSTI()); } + + bool isGFX12Plus() const { return AMDGPU::isGFX12Plus(getSTI()); } + bool isGFX10_AEncoding() const { return AMDGPU::isGFX10_AEncoding(getSTI()); } bool isGFX10_BEncoding() const { @@ -1456,10 +1497,16 @@ public: return getFeatureBits()[AMDGPU::FeaturePartialNSAEncoding]; } - unsigned getNSAMaxSize() const { - return AMDGPU::getNSAMaxSize(getSTI()); + unsigned getNSAMaxSize(bool HasSampler = false) const { + return AMDGPU::getNSAMaxSize(getSTI(), HasSampler); + } + + unsigned getMaxNumUserSGPRs() const { + return AMDGPU::getMaxNumUserSGPRs(getSTI()); } + bool hasKernargPreload() const { return AMDGPU::hasKernargPreload(getSTI()); } + AMDGPUTargetStreamer &getTargetStreamer() { MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); return static_cast<AMDGPUTargetStreamer &>(TS); @@ -1493,10 +1540,9 @@ public: std::unique_ptr<AMDGPUOperand> parseRegister(bool RestoreOnFailure = false); bool ParseRegister(MCRegister &RegNo, SMLoc &StartLoc, SMLoc &EndLoc, bool RestoreOnFailure); - bool parseRegister(MCRegister &RegNo, SMLoc &StartLoc, - SMLoc &EndLoc) override; - OperandMatchResultTy tryParseRegister(MCRegister &RegNo, SMLoc &StartLoc, - SMLoc &EndLoc) override; + bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; + ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, + SMLoc &EndLoc) override; unsigned checkTargetMatchPredicate(MCInst &Inst) override; unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; @@ -1531,6 +1577,8 @@ public: AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone); unsigned getCPolKind(StringRef Id, StringRef Mnemo, bool &Disabling) const; ParseStatus parseCPol(OperandVector &Operands); + ParseStatus parseScope(OperandVector &Operands, int64_t &Scope); + ParseStatus parseTH(OperandVector &Operands, int64_t &TH); ParseStatus parseStringWithPrefix(StringRef Prefix, StringRef &Value, SMLoc &StringLoc); @@ -1540,9 +1588,11 @@ public: bool isNamedOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const; bool isOpcodeModifierWithVal(const AsmToken &Token, const AsmToken &NextToken) const; bool parseSP3NegModifier(); - ParseStatus parseImm(OperandVector &Operands, bool HasSP3AbsModifier = false); + ParseStatus parseImm(OperandVector &Operands, bool HasSP3AbsModifier = false, + bool HasLit = false); ParseStatus parseReg(OperandVector &Operands); - ParseStatus parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod = false); + ParseStatus 
parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod = false, + bool HasLit = false); ParseStatus parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true); ParseStatus parseRegOrImmWithIntInputMods(OperandVector &Operands, @@ -1616,6 +1666,7 @@ private: SMLoc getInstLoc(const OperandVector &Operands) const; bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands); + bool validateOffset(const MCInst &Inst, const OperandVector &Operands); bool validateFlatOffset(const MCInst &Inst, const OperandVector &Operands); bool validateSMEMOffset(const MCInst &Inst, const OperandVector &Operands); bool validateSOPLiteral(const MCInst &Inst) const; @@ -1640,11 +1691,14 @@ private: bool validateAGPRLdSt(const MCInst &Inst) const; bool validateVGPRAlign(const MCInst &Inst) const; bool validateBLGP(const MCInst &Inst, const OperandVector &Operands); + bool validateDS(const MCInst &Inst, const OperandVector &Operands); bool validateGWS(const MCInst &Inst, const OperandVector &Operands); bool validateDivScale(const MCInst &Inst); bool validateWaitCnt(const MCInst &Inst, const OperandVector &Operands); bool validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands, const SMLoc &IDLoc); + bool validateTHAndScopeBits(const MCInst &Inst, const OperandVector &Operands, + const unsigned CPol); bool validateExeczVcczOperands(const OperandVector &Operands); bool validateTFE(const MCInst &Inst, const OperandVector &Operands); std::optional<StringRef> validateLdsDirect(const MCInst &Inst); @@ -1733,7 +1787,6 @@ public: void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands); void cvtVINTERP(MCInst &Inst, const OperandVector &Operands); - void cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands); bool parseDimId(unsigned &Encoding); ParseStatus parseDim(OperandVector &Operands); @@ -1805,6 +1858,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_KIMM32: + case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: return &APFloat::IEEEsingle(); case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_FP64: @@ -1987,7 +2041,7 @@ bool AMDGPUOperand::isVRegWithInputMods() const { return isRegClass(AMDGPU::VGPR_32RegClassID) || // GFX90A allows DPP on 64-bit operands. (isRegClass(AMDGPU::VReg_64RegClassID) && - AsmParser->getFeatureBits()[AMDGPU::Feature64BitDPP]); + AsmParser->getFeatureBits()[AMDGPU::FeatureDPALU_DPP]); } bool AMDGPUOperand::isT16VRegWithInputMods() const { @@ -2096,9 +2150,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo const_cast<AMDGPUAsmParser *>(AsmParser)->Warning(Inst.getLoc(), "Can't encode literal as exact 64-bit floating-point operand. 
" "Low 32-bits will be set to zero"); + Val &= 0xffffffff00000000u; } - Inst.addOperand(MCOperand::createImm(Literal.lshr(32).getZExtValue())); + Inst.addOperand(MCOperand::createImm(Val)); setImmKindLiteral(); return; } @@ -2133,7 +2188,8 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_KIMM32: - case AMDGPU::OPERAND_KIMM16: { + case AMDGPU::OPERAND_KIMM16: + case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: { bool lost; APFloat FPLiteral(APFloat::IEEEdouble(), Literal); // Convert literal to single precision @@ -2174,6 +2230,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: + case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: if (isSafeTruncation(Val, 32) && AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val), AsmParser->hasInv2PiInlineImm())) { @@ -2197,7 +2254,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo return; } - Inst.addOperand(MCOperand::createImm(Lo_32(Val))); + Val = AMDGPU::isSISrcFPOperand(InstDesc, OpNum) ? (uint64_t)Val << 32 + : Lo_32(Val); + + Inst.addOperand(MCOperand::createImm(Val)); setImmKindLiteral(); return; @@ -2424,23 +2484,21 @@ bool AMDGPUAsmParser::ParseRegister(MCRegister &RegNo, SMLoc &StartLoc, return false; } -bool AMDGPUAsmParser::parseRegister(MCRegister &RegNo, SMLoc &StartLoc, +bool AMDGPUAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) { - return ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/false); + return ParseRegister(Reg, StartLoc, EndLoc, /*RestoreOnFailure=*/false); } -OperandMatchResultTy AMDGPUAsmParser::tryParseRegister(MCRegister &RegNo, - SMLoc &StartLoc, - SMLoc &EndLoc) { - bool Result = - ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/true); +ParseStatus AMDGPUAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, + SMLoc &EndLoc) { + bool Result = ParseRegister(Reg, StartLoc, EndLoc, /*RestoreOnFailure=*/true); bool PendingErrors = getParser().hasPendingError(); getParser().clearPendingErrors(); if (PendingErrors) - return MatchOperand_ParseFail; + return ParseStatus::Failure; if (Result) - return MatchOperand_NoMatch; - return MatchOperand_Success; + return ParseStatus::NoMatch; + return ParseStatus::Success; } bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, @@ -2517,7 +2575,7 @@ static bool isRegularReg(RegisterKind Kind) { static const RegInfo* getRegularRegInfo(StringRef Str) { for (const RegInfo &Reg : RegularRegisters) - if (Str.startswith(Reg.Name)) + if (Str.starts_with(Reg.Name)) return &Reg; return nullptr; } @@ -2577,7 +2635,7 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, if (RegKind == IS_SGPR || RegKind == IS_TTMP) { // SGPR and TTMP registers must be aligned. // Max required alignment is 4 dwords. 
- AlignSize = std::min(RegWidth / 32, 4u); + AlignSize = std::min(llvm::bit_ceil(RegWidth / 32), 4u); } if (RegNum % AlignSize != 0) { @@ -2855,7 +2913,7 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) { if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) { return nullptr; } - if (isHsaAbiVersion3AndAbove(&getSTI())) { + if (isHsaAbi(getSTI())) { if (!updateGprCountSymbols(RegKind, RegNum, RegWidth)) return nullptr; } else @@ -2864,13 +2922,26 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) { } ParseStatus AMDGPUAsmParser::parseImm(OperandVector &Operands, - bool HasSP3AbsModifier) { + bool HasSP3AbsModifier, bool HasLit) { // TODO: add syntactic sugar for 1/(2*PI) if (isRegister()) return ParseStatus::NoMatch; assert(!isModifier()); + if (!HasLit) { + HasLit = trySkipId("lit"); + if (HasLit) { + if (!skipToken(AsmToken::LParen, "expected left paren after lit")) + return ParseStatus::Failure; + ParseStatus S = parseImm(Operands, HasSP3AbsModifier, HasLit); + if (S.isSuccess() && + !skipToken(AsmToken::RParen, "expected closing parentheses")) + return ParseStatus::Failure; + return S; + } + } + const auto& Tok = getToken(); const auto& NextTok = peekToken(); bool IsReal = Tok.is(AsmToken::Real); @@ -2883,6 +2954,9 @@ ParseStatus AMDGPUAsmParser::parseImm(OperandVector &Operands, Negate = true; } + AMDGPUOperand::Modifiers Mods; + Mods.Lit = HasLit; + if (IsReal) { // Floating-point expressions are not supported. // Can only allow floating-point literals with an @@ -2901,6 +2975,8 @@ ParseStatus AMDGPUAsmParser::parseImm(OperandVector &Operands, Operands.push_back( AMDGPUOperand::CreateImm(this, RealVal.bitcastToAPInt().getZExtValue(), S, AMDGPUOperand::ImmTyNone, true)); + AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); + Op.setModifiers(Mods); return ParseStatus::Success; @@ -2927,7 +3003,11 @@ ParseStatus AMDGPUAsmParser::parseImm(OperandVector &Operands, if (Expr->evaluateAsAbsolute(IntVal)) { Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S)); + AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); + Op.setModifiers(Mods); } else { + if (HasLit) + return ParseStatus::NoMatch; Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); } @@ -2950,13 +3030,13 @@ ParseStatus AMDGPUAsmParser::parseReg(OperandVector &Operands) { } ParseStatus AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, - bool HasSP3AbsMod) { + bool HasSP3AbsMod, bool HasLit) { ParseStatus Res = parseReg(Operands); if (!Res.isNoMatch()) return Res; if (isModifier()) return ParseStatus::NoMatch; - return parseImm(Operands, HasSP3AbsMod); + return parseImm(Operands, HasSP3AbsMod, HasLit); } bool @@ -3052,6 +3132,7 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm) { bool Neg, SP3Neg; bool Abs, SP3Abs; + bool Lit; SMLoc Loc; // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead. 
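// --- Illustrative sketch (standalone C++, not from the LLVM sources) ---
// parseImm() above now accepts a lit(...) wrapper that forces an operand to
// be encoded as a literal constant: skip the "lit" identifier, require '(',
// parse the immediate as usual, then require ')'. The toy parser below covers
// only that surface syntax for decimal integers; it is not the AMDGPU
// assembler's tokenizer.
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <optional>
#include <string_view>

struct ParsedImm {
  int64_t Value;
  bool ForcedLiteral; // true when written as lit(<imm>)
};

std::optional<ParsedImm> parseImm(std::string_view S) {
  const bool HasLit = S.substr(0, 4) == "lit(";
  if (HasLit)
    S.remove_prefix(4);

  const bool Neg = !S.empty() && S.front() == '-';
  if (Neg)
    S.remove_prefix(1);

  if (S.empty() || !std::isdigit(static_cast<unsigned char>(S.front())))
    return std::nullopt;

  int64_t Value = 0;
  while (!S.empty() && std::isdigit(static_cast<unsigned char>(S.front()))) {
    Value = Value * 10 + (S.front() - '0');
    S.remove_prefix(1);
  }
  if (Neg)
    Value = -Value;

  if (HasLit) { // "expected closing parentheses"
    if (S.empty() || S.front() != ')')
      return std::nullopt;
    S.remove_prefix(1);
  }
  if (!S.empty()) // trailing junk
    return std::nullopt;

  return ParsedImm{Value, HasLit};
}

int main() {
  for (std::string_view Src : {"lit(123)", "-7", "lit(5"}) {
    if (auto P = parseImm(Src))
      std::printf("%-10s -> value %lld, lit=%d\n", Src.data(),
                  static_cast<long long>(P->Value), P->ForcedLiteral);
    else
      std::printf("%-10s -> parse error\n", Src.data());
  }
  return 0;
}
// --- end of sketch ---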
@@ -3071,6 +3152,10 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, if (Abs && !skipToken(AsmToken::LParen, "expected left paren after abs")) return ParseStatus::Failure; + Lit = trySkipId("lit"); + if (Lit && !skipToken(AsmToken::LParen, "expected left paren after lit")) + return ParseStatus::Failure; + Loc = getLoc(); SP3Abs = trySkipToken(AsmToken::Pipe); if (Abs && SP3Abs) @@ -3078,12 +3163,15 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, ParseStatus Res; if (AllowImm) { - Res = parseRegOrImm(Operands, SP3Abs); + Res = parseRegOrImm(Operands, SP3Abs, Lit); } else { Res = parseReg(Operands); } if (!Res.isSuccess()) - return (SP3Neg || Neg || SP3Abs || Abs) ? ParseStatus::Failure : Res; + return (SP3Neg || Neg || SP3Abs || Abs || Lit) ? ParseStatus::Failure : Res; + + if (Lit && !Operands.back()->isImm()) + Error(Loc, "expected immediate with lit modifier"); if (SP3Abs && !skipToken(AsmToken::Pipe, "expected vertical bar")) return ParseStatus::Failure; @@ -3091,12 +3179,15 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, return ParseStatus::Failure; if (Neg && !skipToken(AsmToken::RParen, "expected closing parentheses")) return ParseStatus::Failure; + if (Lit && !skipToken(AsmToken::RParen, "expected closing parentheses")) + return ParseStatus::Failure; AMDGPUOperand::Modifiers Mods; Mods.Abs = Abs || SP3Abs; Mods.Neg = Neg || SP3Neg; + Mods.Lit = Lit; - if (Mods.hasFPModifiers()) { + if (Mods.hasFPModifiers() || Lit) { AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); if (Op.isExpr()) return Error(Op.getStartLoc(), "expected an absolute expression"); @@ -3325,12 +3416,16 @@ unsigned AMDGPUAsmParser::getConstantBusLimit(unsigned Opcode) const { case AMDGPU::V_LSHLREV_B64_e64: case AMDGPU::V_LSHLREV_B64_gfx10: case AMDGPU::V_LSHLREV_B64_e64_gfx11: + case AMDGPU::V_LSHLREV_B64_e32_gfx12: + case AMDGPU::V_LSHLREV_B64_e64_gfx12: case AMDGPU::V_LSHRREV_B64_e64: case AMDGPU::V_LSHRREV_B64_gfx10: case AMDGPU::V_LSHRREV_B64_e64_gfx11: + case AMDGPU::V_LSHRREV_B64_e64_gfx12: case AMDGPU::V_ASHRREV_I64_e64: case AMDGPU::V_ASHRREV_I64_gfx10: case AMDGPU::V_ASHRREV_I64_e64_gfx11: + case AMDGPU::V_ASHRREV_I64_e64_gfx12: case AMDGPU::V_LSHL_B64_e64: case AMDGPU::V_LSHR_B64_e64: case AMDGPU::V_ASHR_I64_e64: @@ -3485,8 +3580,12 @@ bool AMDGPUAsmParser::validateVOPDRegBankConstraints( : MCRegister::NoRegister; }; + // On GFX12 if both OpX and OpY are V_MOV_B32 then OPY uses SRC2 source-cache. 
+ bool SkipSrc = Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12; + const auto &InstInfo = getVOPDInstInfo(Opcode, &MII); - auto InvalidCompOprIdx = InstInfo.getInvalidCompOperandIndex(getVRegIdx); + auto InvalidCompOprIdx = + InstInfo.getInvalidCompOperandIndex(getVRegIdx, SkipSrc); if (!InvalidCompOprIdx) return true; @@ -3522,13 +3621,16 @@ bool AMDGPUAsmParser::validateIntClampSupported(const MCInst &Inst) { return true; } +constexpr uint64_t MIMGFlags = + SIInstrFlags::MIMG | SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE; + bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst, const SMLoc &IDLoc) { const unsigned Opc = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opc); - if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + if ((Desc.TSFlags & MIMGFlags) == 0) return true; int VDataIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); @@ -3574,7 +3676,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, const unsigned Opc = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opc); - if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0 || !isGFX10Plus()) + if ((Desc.TSFlags & MIMGFlags) == 0 || !isGFX10Plus()) return true; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); @@ -3582,7 +3684,9 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); - int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); + int RSrcOpName = Desc.TSFlags & SIInstrFlags::MIMG ? AMDGPU::OpName::srsrc + : AMDGPU::OpName::rsrc; + int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RSrcOpName); int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim); int A16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::a16); @@ -3590,7 +3694,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, assert(SrsrcIdx != -1); assert(SrsrcIdx > VAddr0Idx); - bool IsA16 = Inst.getOperand(A16Idx).getImm(); + bool IsA16 = (A16Idx != -1 && Inst.getOperand(A16Idx).getImm()); if (BaseOpcode->BVH) { if (IsA16 == BaseOpcode->A16) return true; @@ -3609,7 +3713,9 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, AMDGPU::getAddrSizeMIMGOp(BaseOpcode, DimInfo, IsA16, hasG16()); if (IsNSA) { - if (hasPartialNSAEncoding() && ExpectedAddrSize > getNSAMaxSize()) { + if (hasPartialNSAEncoding() && + ExpectedAddrSize > + getNSAMaxSize(Desc.TSFlags & SIInstrFlags::VSAMPLE)) { int VAddrLastIdx = SrsrcIdx - 1; unsigned VAddrLastSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VAddrLastIdx) / 4; @@ -3639,7 +3745,7 @@ bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) { const unsigned Opc = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opc); - if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + if ((Desc.TSFlags & MIMGFlags) == 0) return true; if (!Desc.mayLoad() || !Desc.mayStore()) return true; // Not atomic @@ -3677,7 +3783,7 @@ bool AMDGPUAsmParser::validateMIMGMSAA(const MCInst &Inst) { const unsigned Opc = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opc); - if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + if ((Desc.TSFlags & MIMGFlags) == 0) return true; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); @@ -3854,7 +3960,7 @@ bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) { const unsigned Opc = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opc); - if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + if ((Desc.TSFlags & MIMGFlags) == 0) 
return true; int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16); @@ -4038,6 +4144,40 @@ SMLoc AMDGPUAsmParser::getFlatOffsetLoc(const OperandVector &Operands) const { return getLoc(); } +bool AMDGPUAsmParser::validateOffset(const MCInst &Inst, + const OperandVector &Operands) { + auto Opcode = Inst.getOpcode(); + auto OpNum = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::offset); + if (OpNum == -1) + return true; + + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + if ((TSFlags & SIInstrFlags::FLAT)) + return validateFlatOffset(Inst, Operands); + + if ((TSFlags & SIInstrFlags::SMRD)) + return validateSMEMOffset(Inst, Operands); + + const auto &Op = Inst.getOperand(OpNum); + if (isGFX12Plus() && + (TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF))) { + const unsigned OffsetSize = 24; + if (!isIntN(OffsetSize, Op.getImm())) { + Error(getFlatOffsetLoc(Operands), + Twine("expected a ") + Twine(OffsetSize) + "-bit signed offset"); + return false; + } + } else { + const unsigned OffsetSize = 16; + if (!isUIntN(OffsetSize, Op.getImm())) { + Error(getFlatOffsetLoc(Operands), + Twine("expected a ") + Twine(OffsetSize) + "-bit unsigned offset"); + return false; + } + } + return true; +} + bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst, const OperandVector &Operands) { uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; @@ -4055,11 +4195,12 @@ bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst, return false; } - // For FLAT segment the offset must be positive; + // For pre-GFX12 FLAT instructions the offset must be positive; // MSB is ignored and forced to zero. unsigned OffsetSize = AMDGPU::getNumFlatOffsetBits(getSTI()); bool AllowNegative = - TSFlags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch); + (TSFlags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch)) || + isGFX12Plus(); if (!isIntN(OffsetSize, Op.getImm()) || (!AllowNegative && Op.getImm() < 0)) { Error(getFlatOffsetLoc(Operands), Twine("expected a ") + @@ -4106,8 +4247,9 @@ bool AMDGPUAsmParser::validateSMEMOffset(const MCInst &Inst, return true; Error(getSMEMOffsetLoc(Operands), - (isVI() || IsBuffer) ? "expected a 20-bit unsigned offset" : - "expected a 21-bit signed offset"); + isGFX12Plus() ? "expected a 24-bit signed offset" + : (isVI() || IsBuffer) ? "expected a 20-bit unsigned offset" + : "expected a 21-bit signed offset"); return false; } @@ -4189,21 +4331,35 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst, const OperandVector &Operands) { const unsigned Opc = Inst.getOpcode(); int DppCtrlIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp_ctrl); - if (DppCtrlIdx < 0) - return true; - unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm(); + if (DppCtrlIdx >= 0) { + unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm(); - if (!AMDGPU::isLegal64BitDPPControl(DppCtrl)) { - // DPP64 is supported for row_newbcast only. 
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - if (Src0Idx >= 0 && - getMRI()->getSubReg(Inst.getOperand(Src0Idx).getReg(), AMDGPU::sub1)) { + if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) && + AMDGPU::isDPALU_DPP(MII.get(Opc))) { + // DP ALU DPP is supported for row_newbcast only on GFX9* SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands); - Error(S, "64 bit dpp only supports row_newbcast"); + Error(S, "DP ALU dpp only supports row_newbcast"); return false; } } + int Dpp8Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp8); + bool IsDPP = DppCtrlIdx >= 0 || Dpp8Idx >= 0; + + if (IsDPP && !hasDPPSrc1SGPR(getSTI())) { + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx >= 0) { + const MCOperand &Src1 = Inst.getOperand(Src1Idx); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + if (Src1.isImm() || + (Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI))) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[Src1Idx]); + Error(Op.getStartLoc(), "invalid operand for instruction"); + return false; + } + } + } + return true; } @@ -4241,7 +4397,19 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst, continue; if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) { - uint32_t Value = static_cast<uint32_t>(MO.getImm()); + uint64_t Value = static_cast<uint64_t>(MO.getImm()); + bool IsFP64 = AMDGPU::isSISrcFPOperand(Desc, OpIdx) && + AMDGPU::getOperandSize(Desc.operands()[OpIdx]) == 8; + bool IsValid32Op = AMDGPU::isValid32BitLiteral(Value, IsFP64); + + if (!IsValid32Op && !isInt<32>(Value) && !isUInt<32>(Value)) { + Error(getLitLoc(Operands), "invalid operand for instruction"); + return false; + } + + if (IsFP64 && IsValid32Op) + Value = Hi_32(Value); + if (NumLiterals == 0 || LiteralValue != Value) { LiteralValue = Value; ++NumLiterals; @@ -4359,7 +4527,7 @@ bool AMDGPUAsmParser::validateBLGP(const MCInst &Inst, SMLoc BLGPLoc = getBLGPLoc(Operands); if (!BLGPLoc.isValid()) return true; - bool IsNeg = StringRef(BLGPLoc.getPointer()).startswith("neg:"); + bool IsNeg = StringRef(BLGPLoc.getPointer()).starts_with("neg:"); auto FB = getFeatureBits(); bool UsesNeg = false; if (FB[AMDGPU::FeatureGFX940Insts]) { @@ -4405,6 +4573,29 @@ bool AMDGPUAsmParser::validateWaitCnt(const MCInst &Inst, return false; } +bool AMDGPUAsmParser::validateDS(const MCInst &Inst, + const OperandVector &Operands) { + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + if ((TSFlags & SIInstrFlags::DS) == 0) + return true; + if (TSFlags & SIInstrFlags::GWS) + return validateGWS(Inst, Operands); + // Only validate GDS for non-GWS instructions. + if (hasGDS()) + return true; + int GDSIdx = + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::gds); + if (GDSIdx < 0) + return true; + unsigned GDS = Inst.getOperand(GDSIdx).getImm(); + if (GDS) { + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyGDS, Operands); + Error(S, "gds modifier is not supported on this GPU"); + return false; + } + return true; +} + // gfx90a has an undocumented limitation: // DS_GWS opcodes must use even aligned registers. 
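// --- Illustrative sketch (standalone C++, not from the LLVM sources) ---
// validateVOPLiteral() above now screens raw 64-bit immediates: the single
// 32-bit literal slot can hold them only if they fit in 32 bits (signed or
// unsigned) or, for an f64 source operand, if the low 32 bits are zero so
// that only Hi_32(Value) needs to be emitted. The helpers below approximate
// that screening with plain integers; the exact definition of
// AMDGPU::isValid32BitLiteral is assumed here, not quoted.
#include <cstdint>
#include <cstdio>

bool fitsSigned32(uint64_t V) {
  const int64_t S = static_cast<int64_t>(V);
  return S >= INT32_MIN && S <= INT32_MAX;
}

bool fitsUnsigned32(uint64_t V) { return V <= UINT32_MAX; }

// Assumed meaning of the FP64 case: only the high word is significant.
bool valid32BitLiteral(uint64_t V, bool IsFP64) {
  return IsFP64 ? (V & 0xffffffffu) == 0 : fitsUnsigned32(V);
}

bool encodableVOPLiteral(uint64_t V, bool IsFP64) {
  return valid32BitLiteral(V, IsFP64) || fitsSigned32(V) || fitsUnsigned32(V);
}

int main() {
  std::printf("%d\n", encodableVOPLiteral(0x00000000ffffffffull, false)); // 1: fits as uint32
  std::printf("%d\n", encodableVOPLiteral(0xffffffff80000000ull, false)); // 1: fits as int32 (INT32_MIN)
  std::printf("%d\n", encodableVOPLiteral(0x3ff0000000000000ull, true));  // 1: f64 1.0, low word is zero
  std::printf("%d\n", encodableVOPLiteral(0x0000000100000001ull, false)); // 0: genuinely needs 64 bits
  return 0;
}
// --- end of sketch ---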
bool AMDGPUAsmParser::validateGWS(const MCInst &Inst, @@ -4443,6 +4634,9 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, unsigned CPol = Inst.getOperand(CPolPos).getImm(); + if (isGFX12Plus()) + return validateTHAndScopeBits(Inst, Operands, CPol); + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; if (TSFlags & SIInstrFlags::SMRD) { if (CPol && (isSI() || isCI())) { @@ -4457,11 +4651,17 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, } if (isGFX90A() && !isGFX940() && (CPol & CPol::SCC)) { - SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); - StringRef CStr(S.getPointer()); - S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scc")]); - Error(S, "scc is not supported on this GPU"); - return false; + const uint64_t AllowSCCModifier = SIInstrFlags::MUBUF | + SIInstrFlags::MTBUF | SIInstrFlags::MIMG | + SIInstrFlags::FLAT; + if (!(TSFlags & AllowSCCModifier)) { + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); + StringRef CStr(S.getPointer()); + S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scc")]); + Error(S, + "scc modifier is not supported for this instruction on this GPU"); + return false; + } } if (!(TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet))) @@ -4488,6 +4688,60 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, return true; } +bool AMDGPUAsmParser::validateTHAndScopeBits(const MCInst &Inst, + const OperandVector &Operands, + const unsigned CPol) { + const unsigned TH = CPol & AMDGPU::CPol::TH; + const unsigned Scope = CPol & AMDGPU::CPol::SCOPE; + + const unsigned Opcode = Inst.getOpcode(); + const MCInstrDesc &TID = MII.get(Opcode); + + auto PrintError = [&](StringRef Msg) { + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); + Error(S, Msg); + return false; + }; + + if ((TID.TSFlags & SIInstrFlags::IsAtomicRet) && + (TID.TSFlags & (SIInstrFlags::FLAT | SIInstrFlags::MUBUF)) && + (!(TH & AMDGPU::CPol::TH_ATOMIC_RETURN))) + return PrintError("instruction must use th:TH_ATOMIC_RETURN"); + + if (TH == 0) + return true; + + if ((TID.TSFlags & SIInstrFlags::SMRD) && + ((TH == AMDGPU::CPol::TH_NT_RT) || (TH == AMDGPU::CPol::TH_RT_NT) || + (TH == AMDGPU::CPol::TH_NT_HT))) + return PrintError("invalid th value for SMEM instruction"); + + if (TH == AMDGPU::CPol::TH_BYPASS) { + if ((Scope != AMDGPU::CPol::SCOPE_SYS && + CPol & AMDGPU::CPol::TH_REAL_BYPASS) || + (Scope == AMDGPU::CPol::SCOPE_SYS && + !(CPol & AMDGPU::CPol::TH_REAL_BYPASS))) + return PrintError("scope and th combination is not valid"); + } + + bool IsStore = TID.mayStore(); + bool IsAtomic = + TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet); + + if (IsAtomic) { + if (!(CPol & AMDGPU::CPol::TH_TYPE_ATOMIC)) + return PrintError("invalid th value for atomic instructions"); + } else if (IsStore) { + if (!(CPol & AMDGPU::CPol::TH_TYPE_STORE)) + return PrintError("invalid th value for store instructions"); + } else { + if (!(CPol & AMDGPU::CPol::TH_TYPE_LOAD)) + return PrintError("invalid th value for load instructions"); + } + + return true; +} + bool AMDGPUAsmParser::validateExeczVcczOperands(const OperandVector &Operands) { if (!isGFX11Plus()) return true; @@ -4582,10 +4836,7 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, if (!validateMovrels(Inst, Operands)) { return false; } - if (!validateFlatOffset(Inst, Operands)) { - return false; - } - if (!validateSMEMOffset(Inst, Operands)) { + if (!validateOffset(Inst, Operands)) { return false; } if (!validateMAIAccWrite(Inst, Operands)) 
{ @@ -4613,7 +4864,7 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, "invalid register class: vgpr tuples must be 64 bit aligned"); return false; } - if (!validateGWS(Inst, Operands)) { + if (!validateDS(Inst, Operands)) { return false; } @@ -4888,7 +5139,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (getSTI().getTargetTriple().getArch() != Triple::amdgcn) return TokError("directive only supported for amdgcn architecture"); - if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) + if (!isHsaAbi(getSTI())) return TokError("directive only supported for amdhsa OS"); StringRef KernelName; @@ -4905,6 +5156,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { uint64_t NextFreeVGPR = 0; uint64_t AccumOffset = 0; uint64_t SharedVGPRCount = 0; + uint64_t PreloadLength = 0; + uint64_t PreloadOffset = 0; SMRange SGPRRange; uint64_t NextFreeSGPR = 0; @@ -4973,6 +5226,28 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { Val, ValRange); if (Val) ImpliedUserSGPRCount += 4; + } else if (ID == ".amdhsa_user_sgpr_kernarg_preload_length") { + if (!hasKernargPreload()) + return Error(IDRange.Start, "directive requires gfx90a+", IDRange); + + if (Val > getMaxNumUserSGPRs()) + return OutOfRangeError(ValRange); + PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_LENGTH, Val, + ValRange); + if (Val) { + ImpliedUserSGPRCount += Val; + PreloadLength = Val; + } + } else if (ID == ".amdhsa_user_sgpr_kernarg_preload_offset") { + if (!hasKernargPreload()) + return Error(IDRange.Start, "directive requires gfx90a+", IDRange); + + if (Val >= 1024) + return OutOfRangeError(ValRange); + PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_OFFSET, Val, + ValRange); + if (Val) + PreloadOffset = Val; } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val, @@ -5104,15 +5379,21 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, Val, ValRange); } else if (ID == ".amdhsa_dx10_clamp") { + if (IVersion.Major >= 12) + return Error(IDRange.Start, "directive unsupported on gfx12+", IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, - COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, Val, ValRange); + COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP, Val, + ValRange); } else if (ID == ".amdhsa_ieee_mode") { - PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, - Val, ValRange); + if (IVersion.Major >= 12) + return Error(IDRange.Start, "directive unsupported on gfx12+", IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE, Val, + ValRange); } else if (ID == ".amdhsa_fp16_overflow") { if (IVersion.Major < 9) return Error(IDRange.Start, "directive requires gfx9+", IDRange); - PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val, + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL, Val, ValRange); } else if (ID == ".amdhsa_tg_split") { if (!isGFX90A()) @@ -5122,17 +5403,17 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { } else if (ID == ".amdhsa_workgroup_processor_mode") { if (IVersion.Major < 10) return Error(IDRange.Start, "directive requires gfx10+", IDRange); - PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_WGP_MODE, Val, + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, Val, ValRange); } else if (ID == ".amdhsa_memory_ordered") { if (IVersion.Major < 10) return Error(IDRange.Start, "directive 
requires gfx10+", IDRange); - PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_MEM_ORDERED, Val, + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, Val, ValRange); } else if (ID == ".amdhsa_forward_progress") { if (IVersion.Major < 10) return Error(IDRange.Start, "directive requires gfx10+", IDRange); - PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FWD_PROGRESS, Val, + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS, Val, ValRange); } else if (ID == ".amdhsa_shared_vgpr_count") { if (IVersion.Major < 10) @@ -5171,6 +5452,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO, Val, ValRange); + } else if (ID == ".amdhsa_round_robin_scheduling") { + if (IVersion.Major < 12) + return Error(IDRange.Start, "directive requires gfx12+", IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN, Val, + ValRange); } else { return Error(IDRange.Start, "unknown .amdhsa_kernel directive", IDRange); } @@ -5218,6 +5505,11 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, UserSGPRCount); + if (PreloadLength && KD.kernarg_size && + (PreloadLength * 4 + PreloadOffset * 4 > KD.kernarg_size)) + return TokError("Kernarg preload length + offset is larger than the " + "kernarg segment size"); + if (isGFX90A()) { if (!Seen.contains(".amdhsa_accum_offset")) return TokError(".amdhsa_accum_offset directive is required"); @@ -5319,6 +5611,18 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, } Lex(); + if (ID == "enable_dx10_clamp") { + if (G_00B848_DX10_CLAMP(Header.compute_pgm_resource_registers) && + isGFX12Plus()) + return TokError("enable_dx10_clamp=1 is not allowed on GFX12+"); + } + + if (ID == "enable_ieee_mode") { + if (G_00B848_IEEE_MODE(Header.compute_pgm_resource_registers) && + isGFX12Plus()) + return TokError("enable_ieee_mode=1 is not allowed on GFX12+"); + } + if (ID == "enable_wavefront_size32") { if (Header.code_properties & AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) { if (!isGFX10Plus()) @@ -5419,33 +5723,15 @@ bool AMDGPUAsmParser::ParseDirectiveISAVersion() { } bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { - const char *AssemblerDirectiveBegin; - const char *AssemblerDirectiveEnd; - std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) = - isHsaAbiVersion3AndAbove(&getSTI()) - ? 
std::pair(HSAMD::V3::AssemblerDirectiveBegin, - HSAMD::V3::AssemblerDirectiveEnd) - : std::pair(HSAMD::AssemblerDirectiveBegin, - HSAMD::AssemblerDirectiveEnd); - - if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) { - return Error(getLoc(), - (Twine(AssemblerDirectiveBegin) + Twine(" directive is " - "not available on non-amdhsa OSes")).str()); - } + assert(isHsaAbi(getSTI())); std::string HSAMetadataString; - if (ParseToEndDirective(AssemblerDirectiveBegin, AssemblerDirectiveEnd, - HSAMetadataString)) + if (ParseToEndDirective(HSAMD::V3::AssemblerDirectiveBegin, + HSAMD::V3::AssemblerDirectiveEnd, HSAMetadataString)) return true; - if (isHsaAbiVersion3AndAbove(&getSTI())) { - if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString)) - return Error(getLoc(), "invalid HSA metadata"); - } else { - if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString)) - return Error(getLoc(), "invalid HSA metadata"); - } + if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString)) + return Error(getLoc(), "invalid HSA metadata"); return false; } @@ -5588,7 +5874,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() { bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); - if (isHsaAbiVersion3AndAbove(&getSTI())) { + if (isHsaAbi(getSTI())) { if (IDVal == ".amdhsa_kernel") return ParseDirectiveAMDHSAKernel(); @@ -5611,8 +5897,12 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amd_amdgpu_isa") return ParseDirectiveISAVersion(); - if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin) - return ParseDirectiveHSAMetadata(); + if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin) { + return Error(getLoc(), (Twine(HSAMD::AssemblerDirectiveBegin) + + Twine(" directive is " + "not available on non-amdhsa OSes")) + .str()); + } } if (IDVal == ".amdgcn_target") @@ -5753,20 +6043,20 @@ StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) { setForcedDPP(false); setForcedSDWA(false); - if (Name.endswith("_e64_dpp")) { + if (Name.ends_with("_e64_dpp")) { setForcedDPP(true); setForcedEncodingSize(64); return Name.substr(0, Name.size() - 8); - } else if (Name.endswith("_e64")) { + } else if (Name.ends_with("_e64")) { setForcedEncodingSize(64); return Name.substr(0, Name.size() - 4); - } else if (Name.endswith("_e32")) { + } else if (Name.ends_with("_e32")) { setForcedEncodingSize(32); return Name.substr(0, Name.size() - 4); - } else if (Name.endswith("_dpp")) { + } else if (Name.ends_with("_dpp")) { setForcedDPP(true); return Name.substr(0, Name.size() - 4); - } else if (Name.endswith("_sdwa")) { + } else if (Name.ends_with("_sdwa")) { setForcedSDWA(true); return Name.substr(0, Name.size() - 5); } @@ -5789,7 +6079,7 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, Operands.push_back(AMDGPUOperand::CreateToken(this, Name, NameLoc)); - bool IsMIMG = Name.startswith("image_"); + bool IsMIMG = Name.starts_with("image_"); while (!trySkipToken(AsmToken::EndOfStatement)) { OperandMode Mode = OperandMode_Default; @@ -5929,7 +6219,7 @@ unsigned AMDGPUAsmParser::getCPolKind(StringRef Id, StringRef Mnemo, bool &Disabling) const { Disabling = Id.consume_front("no"); - if (isGFX940() && !Mnemo.startswith("s_")) { + if (isGFX940() && !Mnemo.starts_with("s_")) { return StringSwitch<unsigned>(Id) .Case("nt", AMDGPU::CPol::NT) .Case("sc0", AMDGPU::CPol::SC0) @@ -5946,6 +6236,47 @@ unsigned AMDGPUAsmParser::getCPolKind(StringRef Id, StringRef Mnemo, } ParseStatus AMDGPUAsmParser::parseCPol(OperandVector 
&Operands) { + if (isGFX12Plus()) { + SMLoc StringLoc = getLoc(); + + int64_t CPolVal = 0; + ParseStatus ResTH = ParseStatus::NoMatch; + ParseStatus ResScope = ParseStatus::NoMatch; + + for (;;) { + if (ResTH.isNoMatch()) { + int64_t TH; + ResTH = parseTH(Operands, TH); + if (ResTH.isFailure()) + return ResTH; + if (ResTH.isSuccess()) { + CPolVal |= TH; + continue; + } + } + + if (ResScope.isNoMatch()) { + int64_t Scope; + ResScope = parseScope(Operands, Scope); + if (ResScope.isFailure()) + return ResScope; + if (ResScope.isSuccess()) { + CPolVal |= Scope; + continue; + } + } + + break; + } + + if (ResTH.isNoMatch() && ResScope.isNoMatch()) + return ParseStatus::NoMatch; + + Operands.push_back(AMDGPUOperand::CreateImm(this, CPolVal, StringLoc, + AMDGPUOperand::ImmTyCPol)); + return ParseStatus::Success; + } + StringRef Mnemo = ((AMDGPUOperand &)*Operands[0]).getToken(); SMLoc OpLoc = getLoc(); unsigned Enabled = 0, Seen = 0; @@ -5981,6 +6312,95 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) { return ParseStatus::Success; } +ParseStatus AMDGPUAsmParser::parseScope(OperandVector &Operands, + int64_t &Scope) { + Scope = AMDGPU::CPol::SCOPE_CU; // default; + + StringRef Value; + SMLoc StringLoc; + ParseStatus Res; + + Res = parseStringWithPrefix("scope", Value, StringLoc); + if (!Res.isSuccess()) + return Res; + + Scope = StringSwitch<int64_t>(Value) + .Case("SCOPE_CU", AMDGPU::CPol::SCOPE_CU) + .Case("SCOPE_SE", AMDGPU::CPol::SCOPE_SE) + .Case("SCOPE_DEV", AMDGPU::CPol::SCOPE_DEV) + .Case("SCOPE_SYS", AMDGPU::CPol::SCOPE_SYS) + .Default(0xffffffff); + + if (Scope == 0xffffffff) + return Error(StringLoc, "invalid scope value"); + + return ParseStatus::Success; +} + +ParseStatus AMDGPUAsmParser::parseTH(OperandVector &Operands, int64_t &TH) { + TH = AMDGPU::CPol::TH_RT; // default + + StringRef Value; + SMLoc StringLoc; + ParseStatus Res = parseStringWithPrefix("th", Value, StringLoc); + if (!Res.isSuccess()) + return Res; + + if (Value == "TH_DEFAULT") + TH = AMDGPU::CPol::TH_RT; + else if (Value == "TH_STORE_LU" || Value == "TH_LOAD_RT_WB" || + Value == "TH_LOAD_NT_WB") { + return Error(StringLoc, "invalid th value"); + } else if (Value.starts_with("TH_ATOMIC_")) { + Value = Value.drop_front(10); + TH = AMDGPU::CPol::TH_TYPE_ATOMIC; + } else if (Value.starts_with("TH_LOAD_")) { + Value = Value.drop_front(8); + TH = AMDGPU::CPol::TH_TYPE_LOAD; + } else if (Value.starts_with("TH_STORE_")) { + Value = Value.drop_front(9); + TH = AMDGPU::CPol::TH_TYPE_STORE; + } else { + return Error(StringLoc, "invalid th value"); + } + + if (Value == "BYPASS") + TH |= AMDGPU::CPol::TH_REAL_BYPASS; + + if (TH != 0) { + if (TH & AMDGPU::CPol::TH_TYPE_ATOMIC) + TH |= StringSwitch<int64_t>(Value) + .Case("RETURN", AMDGPU::CPol::TH_ATOMIC_RETURN) + .Case("RT", AMDGPU::CPol::TH_RT) + .Case("RT_RETURN", AMDGPU::CPol::TH_ATOMIC_RETURN) + .Case("NT", AMDGPU::CPol::TH_ATOMIC_NT) + .Case("NT_RETURN", AMDGPU::CPol::TH_ATOMIC_NT | + AMDGPU::CPol::TH_ATOMIC_RETURN) + .Case("CASCADE_RT", AMDGPU::CPol::TH_ATOMIC_CASCADE) + .Case("CASCADE_NT", AMDGPU::CPol::TH_ATOMIC_CASCADE | + AMDGPU::CPol::TH_ATOMIC_NT) + .Default(0xffffffff); + else + TH |= StringSwitch<int64_t>(Value) + .Case("RT", AMDGPU::CPol::TH_RT) + .Case("NT", AMDGPU::CPol::TH_NT) + .Case("HT", AMDGPU::CPol::TH_HT) + .Case("LU", AMDGPU::CPol::TH_LU) + .Case("RT_WB", AMDGPU::CPol::TH_RT_WB) + .Case("NT_RT", AMDGPU::CPol::TH_NT_RT) + .Case("RT_NT", AMDGPU::CPol::TH_RT_NT) + .Case("NT_HT", AMDGPU::CPol::TH_NT_HT) + .Case("NT_WB", 
AMDGPU::CPol::TH_NT_WB) + .Case("BYPASS", AMDGPU::CPol::TH_BYPASS) + .Default(0xffffffff); + } + + if (TH == 0xffffffff) + return Error(StringLoc, "invalid th value"); + + return ParseStatus::Success; +} + static void addOptionalImmOperand( MCInst& Inst, const OperandVector& Operands, AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx, @@ -6382,7 +6802,7 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); bool Failed = true; - bool Sat = CntName.endswith("_sat"); + bool Sat = CntName.ends_with("_sat"); if (CntName == "vmcnt" || CntName == "vmcnt_sat") { Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeVmcnt, decodeVmcnt); @@ -6855,7 +7275,7 @@ ParseStatus AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) { if (!parseId(Str)) return ParseStatus::NoMatch; - if (!Str.startswith("attr")) + if (!Str.starts_with("attr")) return Error(S, "invalid interpolation attribute"); StringRef Chan = Str.take_back(2); @@ -6946,7 +7366,7 @@ bool AMDGPUAsmParser::trySkipId(const StringRef Pref, const StringRef Id) { if (isToken(AsmToken::Identifier)) { StringRef Tok = getTokenStr(); - if (Tok.startswith(Pref) && Tok.drop_front(Pref.size()) == Id) { + if (Tok.starts_with(Pref) && Tok.drop_front(Pref.size()) == Id) { lex(); return true; } @@ -7578,66 +7998,6 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, } //===----------------------------------------------------------------------===// -// SMEM -//===----------------------------------------------------------------------===// - -void AMDGPUAsmParser::cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands) { - OptionalImmIndexMap OptionalIdx; - bool IsAtomicReturn = false; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - if (!Op.isCPol()) - continue; - IsAtomicReturn = Op.getImm() & AMDGPU::CPol::GLC; - break; - } - - if (!IsAtomicReturn) { - int NewOpc = AMDGPU::getAtomicNoRetOp(Inst.getOpcode()); - if (NewOpc != -1) - Inst.setOpcode(NewOpc); - } - - IsAtomicReturn = MII.get(Inst.getOpcode()).TSFlags & - SIInstrFlags::IsAtomicRet; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - if (IsAtomicReturn && i == 1) - Op.addRegOperands(Inst, 1); - continue; - } - - // Handle the case where soffset is an immediate - if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { - Op.addImmOperands(Inst, 1); - continue; - } - - // Handle tokens like 'offen' which are sometimes hard-coded into the - // asm string. There are no MCInst operands for these. - if (Op.isToken()) { - continue; - } - assert(Op.isImm()); - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; - } - - if ((int)Inst.getNumOperands() <= - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::offset)) - addOptionalImmOperand(Inst, Operands, OptionalIdx, - AMDGPUOperand::ImmTySMEMOffsetMod); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0); -} - -//===----------------------------------------------------------------------===// // smrd //===----------------------------------------------------------------------===// @@ -7704,7 +8064,7 @@ void AMDGPUAsmParser::onBeginOfFile() { // TODO: Should try to check code object version from directive??? 
AMDGPU::getAmdhsaCodeObjectVersion()); - if (isHsaAbiVersion3AndAbove(&getSTI())) + if (isHsaAbi(getSTI())) getTargetStreamer().EmitDirectiveAMDGCNTarget(); } @@ -8155,7 +8515,7 @@ bool AMDGPUAsmParser::parseDimId(unsigned &Encoding) { Token += Suffix; StringRef DimId = Token; - if (DimId.startswith("SQ_RSRC_IMG_")) + if (DimId.starts_with("SQ_RSRC_IMG_")) DimId = DimId.drop_front(12); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByAsmSuffix(DimId); @@ -8838,3 +9198,9 @@ bool AMDGPUOperand::isWaitVDST() const { bool AMDGPUOperand::isWaitEXP() const { return isImmTy(ImmTyWaitEXP) && isUInt<3>(getImm()); } + +//===----------------------------------------------------------------------===// +// Split Barrier +//===----------------------------------------------------------------------===// + +bool AMDGPUOperand::isSplitBarrier() const { return isInlinableImm(MVT::i32); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td index ea1578e30ae8..43d35fa5291c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -12,6 +12,8 @@ def MUBUFOffset : ComplexPattern<iPTR, 3, "SelectMUBUFOffset">; def MUBUFScratchOffen : ComplexPattern<iPTR, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; def MUBUFScratchOffset : ComplexPattern<iPTR, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; +def BUFSOffset : ComplexPattern<iPTR, 1, "SelectBUFSOffset">; + def BUFAddrKind { int Offset = 0; int OffEn = 1; @@ -152,24 +154,32 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> : } class getMTBUFInsDA<list<RegisterClass> vdataList, - list<RegisterClass> vaddrList=[]> { + list<RegisterClass> vaddrList=[], bit hasGFX12Enc> { RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret; - dag NonVaddrInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, CPol:$cpol, i1imm:$swz); - dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs)); - dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs)); -} + dag SOffset = !if(hasGFX12Enc, (ins SReg_32:$soffset), + (ins SCSrc_b32:$soffset)); + dag NonVaddrInputs = !con((ins SReg_128:$srsrc), SOffset, + (ins offset:$offset, FORMAT:$format, CPol_0:$cpol, i1imm_0:$swz)); -class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> { + dag Inputs = !if(!empty(vaddrList), + NonVaddrInputs, + !con((ins vaddrClass:$vaddr), NonVaddrInputs)); + dag ret = !if(!empty(vdataList), + Inputs, + !con((ins vdata_op:$vdata), Inputs)); +} + +class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit hasGFX12Enc> { dag ret = - !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList>.ret, - !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret, - !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret, - !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64]>.ret, - !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64]>.ret, + !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList, [], hasGFX12Enc>.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32], hasGFX12Enc>.ret, + !if(!eq(addrKind, 
BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPR_32], hasGFX12Enc>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64], hasGFX12Enc>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64], hasGFX12Enc>.ret, (ins)))))); } @@ -204,12 +214,13 @@ class MTBUF_Load_Pseudo <string opName, int addrKind, RegisterClass vdataClass, int elems, + bit hasGFX12Enc = 0, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MTBUF_Pseudo<opName, (outs getLdStRegisterOperand<vdataClass>.ret:$vdata), - getMTBUFIns<addrKindCopy>.ret, + getMTBUFIns<addrKindCopy, [], hasGFX12Enc>.ret, getMTBUFAsmOps<addrKindCopy>.ret, pattern>, MTBUF_SetupAddr<addrKindCopy> { @@ -219,38 +230,45 @@ class MTBUF_Load_Pseudo <string opName, let elements = elems; } -multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, - int elems> { +multiclass MTBUF_Pseudo_Loads_Helper<string opName, RegisterClass vdataClass, + int elems, bit hasGFX12Enc> { - def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>, + def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, hasGFX12Enc>, MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems>, + def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems, hasGFX12Enc>, MTBUFAddr64Table<1, NAME>; - def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>; - def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>; - def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>; + def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems, hasGFX12Enc>; + def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems, hasGFX12Enc>; + def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems, hasGFX12Enc>; let DisableWQM = 1 in { - def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>; - def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>; - def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>; - def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>; + def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, hasGFX12Enc>; + def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems, hasGFX12Enc>; + def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems, hasGFX12Enc>; + def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems, hasGFX12Enc>; } } +multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, + int elems> { + defm NAME : MTBUF_Pseudo_Loads_Helper<opName, vdataClass, elems, 0>; + defm _VBUFFER : MTBUF_Pseudo_Loads_Helper<opName, vdataClass, elems, 1>; +} + class MTBUF_Store_Pseudo <string opName, int addrKind, RegisterClass vdataClass, int elems, + bit hasGFX12Enc = 0, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, RegisterClass vdataClassCopy = vdataClass> : MTBUF_Pseudo<opName, (outs), - getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret, + getMTBUFIns<addrKindCopy, [vdataClassCopy], hasGFX12Enc>.ret, getMTBUFAsmOps<addrKindCopy>.ret, pattern>, MTBUF_SetupAddr<addrKindCopy> { @@ -260,27 +278,32 @@ class MTBUF_Store_Pseudo <string opName, let elements = elems; } -multiclass 
MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, - int elems> { +multiclass MTBUF_Pseudo_Stores_Helper<string opName, RegisterClass vdataClass, + int elems, bit hasGFX12Enc> { - def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>, + def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, hasGFX12Enc>, MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems>, + def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems, hasGFX12Enc>, MTBUFAddr64Table<1, NAME>; - def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>; - def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>; - def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>; + def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems, hasGFX12Enc>; + def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems, hasGFX12Enc>; + def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems, hasGFX12Enc>; let DisableWQM = 1 in { - def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>; - def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>; - def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>; - def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>; + def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, hasGFX12Enc>; + def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems, hasGFX12Enc>; + def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems, hasGFX12Enc>; + def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems, hasGFX12Enc>; } } +multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, + int elems> { + defm NAME : MTBUF_Pseudo_Stores_Helper<opName, vdataClass, elems, 0>; + defm _VBUFFER : MTBUF_Pseudo_Stores_Helper<opName, vdataClass, elems, 1>; +} //===----------------------------------------------------------------------===// // MUBUF classes @@ -381,12 +404,14 @@ class getLdStVDataRegisterOperand<RegisterClass RC, bit isTFE> { } class getMUBUFInsDA<list<RegisterClass> vdataList, - list<RegisterClass> vaddrList, bit isTFE> { + list<RegisterClass> vaddrList, bit isTFE, bit hasGFX12Enc> { RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdataClass, isTFE>.ret; - dag NonVaddrInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol, i1imm_0:$swz); + dag SOffset = !if(hasGFX12Enc, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); + dag NonVaddrInputs = !con((ins SReg_128:$srsrc), SOffset, (ins offset:$offset, CPol_0:$cpol, i1imm_0:$swz)); + dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs)); dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs)); } @@ -410,13 +435,13 @@ class getMUBUFElements<ValueType vt> { ); } -class getMUBUFIns<int addrKind, list<RegisterClass> vdataList, bit isTFE> { +class getMUBUFIns<int addrKind, list<RegisterClass> vdataList, bit isTFE, bit hasGFX12Enc> { dag ret = - !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], 
isTFE>.ret, - !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE>.ret, - !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE>.ret, - !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64], isTFE>.ret, - !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64], isTFE>.ret, + !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasGFX12Enc>.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasGFX12Enc>.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasGFX12Enc>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64], isTFE, hasGFX12Enc>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64], isTFE, hasGFX12Enc>.ret, (ins)))))); } @@ -456,6 +481,7 @@ class MUBUF_Load_Pseudo <string opName, bit isLds = 0, bit isLdsOpc = 0, bit isTFE = 0, + bit hasGFX12Enc = 0, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, @@ -463,7 +489,7 @@ class MUBUF_Load_Pseudo <string opName, RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdata_rc, isTFE>.ret> : MUBUF_Pseudo<opName, !if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)), - !con(getMUBUFIns<addrKindCopy, [], isTFE>.ret, + !con(getMUBUFIns<addrKindCopy, [], isTFE, hasGFX12Enc>.ret, !if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))), getMUBUFAsmOps<addrKindCopy, !or(isLds, isLdsOpc), isLds, isTFE>.ret, pattern>, @@ -485,50 +511,61 @@ class MUBUF_Load_Pseudo <string opName, let VALU = isLds; } -class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat < +class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : GCNPat < (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset))), (load_vt (inst v4i32:$srsrc, i32:$soffset, i32:$offset)) >; class MUBUF_Addr64_Load_Pat <Instruction inst, ValueType load_vt = i32, - SDPatternOperator ld = null_frag> : Pat < + SDPatternOperator ld = null_frag> : GCNPat < (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset))), (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i32:$offset)) >; -multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { +multiclass MUBUF_Pseudo_Load_Pats_Common<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { def : MUBUF_Offset_Load_Pat<!cast<Instruction>(BaseInst#"_OFFSET"), load_vt, ld>; def : MUBUF_Addr64_Load_Pat<!cast<Instruction>(BaseInst#"_ADDR64"), load_vt, ld>; } +multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag>{ + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : MUBUF_Pseudo_Load_Pats_Common<BaseInst, load_vt, ld>; + } + defm : MUBUF_Pseudo_Load_Pats_Common<BaseInst # "_VBUFFER", load_vt, ld>; +} + multiclass MUBUF_Pseudo_Loads_Helper<string opName, ValueType load_vt, - bit TiedDest, bit isLds, bit isTFE> { + bit TiedDest, bit isLds, bit isTFE, bit hasGFX12Enc> { defvar legal_load_vt = !if(!eq(load_vt, v3f16), v4f16, load_vt); - def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds, 0, isTFE>, + def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>, MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>; - def _ADDR64 : 
MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, legal_load_vt, TiedDest, isLds, 0, isTFE>, + def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>, MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>; - def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds, 0, isTFE>; - def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds, 0, isTFE>; - def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds, 0, isTFE>; + def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>; + def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>; + def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>; let DisableWQM = 1 in { - def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds, 0, isTFE>; - def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds, 0, isTFE>; - def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds, 0, isTFE>; - def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds, 0, isTFE>; + def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>; + def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>; + def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>; + def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>; } } multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32, bit TiedDest = 0, bit isLds = 0> { - defm NAME : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0>; - if !not(isLds) then - defm _TFE : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 1>; + defm NAME : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 0>; + defm _VBUFFER : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 1>; + + if !not(isLds) then { + defm _TFE : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 1, 0>; + defm _TFE_VBUFFER : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 1, 1>; + } } multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32> { @@ -548,18 +585,24 @@ multiclass MUBUF_Pseudo_Loads_LDSOpc<string opName, def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds, isLdsOpc>; def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds, isLdsOpc>; def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds, isLdsOpc>; + + def _VBUFFER_OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds, isLdsOpc, 0, 1>; + def _VBUFFER_OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds, isLdsOpc, 0, 1>; + def _VBUFFER_IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds, isLdsOpc, 0, 1>; + def _VBUFFER_BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds, isLdsOpc, 0, 1>; } class MUBUF_Store_Pseudo <string opName, int addrKind, ValueType 
store_vt, bit isTFE = 0, + bit hasGFX12Enc = 0, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MUBUF_Pseudo<opName, (outs), - getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret], isTFE>.ret, + getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret], isTFE, hasGFX12Enc>.ret, getMUBUFAsmOps<addrKindCopy, 0, 0, isTFE>.ret, pattern>, MUBUF_SetupAddr<addrKindCopy> { @@ -572,36 +615,52 @@ class MUBUF_Store_Pseudo <string opName, let tfe = isTFE; } +multiclass MUBUF_Pseudo_Store_Pats_Common<string BaseInst, ValueType store_vt = i32, SDPatternOperator st = null_frag> { + + def : GCNPat < + (st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset)), + (!cast<MUBUF_Pseudo>(BaseInst # _OFFSET) store_vt:$vdata, v4i32:$srsrc, i32:$soffset, i32:$offset)>; + + def : GCNPat < + (st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset)), + (!cast<MUBUF_Pseudo>(BaseInst # _ADDR64) store_vt:$vdata, i64:$vaddr, v4i32:$srsrc, i32:$soffset, i32:$offset)>; +} + +multiclass MUBUF_Pseudo_Store_Pats<string BaseInst, ValueType store_vt = i32, SDPatternOperator st = null_frag> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : MUBUF_Pseudo_Store_Pats_Common<BaseInst, store_vt, st>; + } + defm : MUBUF_Pseudo_Store_Pats_Common<BaseInst # "_VBUFFER", store_vt, st>; +} + multiclass MUBUF_Pseudo_Stores_Helper<string opName, ValueType store_vt, - SDPatternOperator st, bit isTFE> { + bit isTFE, bit hasGFX12Enc> { defvar legal_store_vt = !if(!eq(store_vt, v3f16), v4f16, store_vt); - def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt, isTFE, - [(st legal_store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i32:$offset))]>, + def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt, isTFE, hasGFX12Enc>, MUBUFAddr64Table<0, NAME>; - def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, legal_store_vt, isTFE, - [(st legal_store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i32:$offset))]>, + def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, legal_store_vt, isTFE, hasGFX12Enc>, MUBUFAddr64Table<1, NAME>; - def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt, isTFE>; - def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, legal_store_vt, isTFE>; - def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, legal_store_vt, isTFE>; + def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt, isTFE, hasGFX12Enc>; + def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, legal_store_vt, isTFE, hasGFX12Enc>; + def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, legal_store_vt, isTFE, hasGFX12Enc>; let DisableWQM = 1 in { - def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt, isTFE>; - def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt, isTFE>; - def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, legal_store_vt, isTFE>; - def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, legal_store_vt, isTFE>; + def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt, isTFE, hasGFX12Enc>; + def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt, isTFE, hasGFX12Enc>; + def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, legal_store_vt, isTFE, hasGFX12Enc>; + def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, legal_store_vt, 
isTFE, hasGFX12Enc>; } } -multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32, - SDPatternOperator st = null_frag> { - defm NAME : MUBUF_Pseudo_Stores_Helper<opName, store_vt, st, 0>; - defm _TFE : MUBUF_Pseudo_Stores_Helper<opName, store_vt, null_frag, 1>; +multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32> { + defm NAME : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 0>; + defm _TFE : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 0>; + + defm _VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 1>; + defm _TFE_VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 1>; } class MUBUF_Pseudo_Store_Lds<string opName> @@ -623,15 +682,17 @@ class MUBUF_Pseudo_Store_Lds<string opName> let AsmMatchConverter = "cvtMubuf"; } -class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, +class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, bit hasGFX12Enc, list<RegisterClass> vaddrList=[]> { RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret; dag VData = !if(vdata_in, (ins vdata_op:$vdata_in), (ins vdata_op:$vdata)); dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddrClass:$vaddr))); - dag MainInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset); - dag CPol = !if(vdata_in, (ins CPol_GLC1:$cpol), (ins CPol_0:$cpol)); + dag SOffset = !if(hasGFX12Enc, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset)); + dag MainInputs = !con((ins SReg_128:$srsrc), SOffset, (ins offset:$offset)); + dag CPol = !if(vdata_in, (ins CPol_GLC_WithDefault:$cpol), + (ins CPol_NonGLC_WithDefault:$cpol)); dag ret = !con(Data, MainInputs, CPol); } @@ -639,19 +700,20 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, class getMUBUFAtomicIns<int addrKind, RegisterClass vdataClass, bit vdata_in, + bit hasGFX12Enc, // Workaround bug bz30254 RegisterClass vdataClassCopy=vdataClass> { dag ret = !if(!eq(addrKind, BUFAddrKind.Offset), - getMUBUFAtomicInsDA<vdataClassCopy, vdata_in>.ret, + getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasGFX12Enc>.ret, !if(!eq(addrKind, BUFAddrKind.OffEn), - getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VGPR_32]>.ret, + getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasGFX12Enc, [VGPR_32]>.ret, !if(!eq(addrKind, BUFAddrKind.IdxEn), - getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VGPR_32]>.ret, + getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasGFX12Enc, [VGPR_32]>.ret, !if(!eq(addrKind, BUFAddrKind.BothEn), - getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VReg_64]>.ret, + getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasGFX12Enc, [VReg_64]>.ret, !if(!eq(addrKind, BUFAddrKind.Addr64), - getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VReg_64]>.ret, + getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasGFX12Enc, [VReg_64]>.ret, (ins)))))); } @@ -679,13 +741,14 @@ class MUBUF_Atomic_Pseudo<string opName, class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind, RegisterClass vdataClass, + bit hasGFX12Enc = 0, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, RegisterClass vdataClassCopy = vdataClass> : MUBUF_Atomic_Pseudo<opName, addrKindCopy, (outs), - getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 0>.ret, + getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 0, hasGFX12Enc>.ret, getMUBUFAsmOps<addrKindCopy>.ret, pattern>, AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 0> { @@ -698,6 +761,7 @@ class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind, class 
MUBUF_AtomicRet_Pseudo<string opName, int addrKind, RegisterClass vdataClass, + bit hasGFX12Enc = 0, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, @@ -705,7 +769,7 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind, RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret> : MUBUF_Atomic_Pseudo<opName, addrKindCopy, (outs vdata_op:$vdata), - getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1>.ret, + getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1, hasGFX12Enc>.ret, getMUBUFAsmOps<addrKindCopy>.ret, pattern>, AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 1> { @@ -723,13 +787,21 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName, ValueType vdataType, bit isFP = isFloatType<vdataType>.ret> { let FPAtomic = isFP in { - def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>, + def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>, MUBUFAddr64Table <0, NAME>; - def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>, + def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0>, MUBUFAddr64Table <1, NAME>; - def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, 0>; + def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, 0>; + def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, 0>; + + def _VBUFFER_OFFSET : MUBUF_AtomicNoRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1>, + MUBUFAddr64Table <0, NAME # "_VBUFFER">; + def _VBUFFER_ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1>, + MUBUFAddr64Table <1, NAME # "_VBUFFER">; + def _VBUFFER_OFFEN : MUBUF_AtomicNoRet_Pseudo <opName #_vbuffer, BUFAddrKind.OffEn, vdataClass, 1>; + def _VBUFFER_IDXEN : MUBUF_AtomicNoRet_Pseudo <opName #_vbuffer, BUFAddrKind.IdxEn, vdataClass, 1>; + def _VBUFFER_BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName #_vbuffer, BUFAddrKind.BothEn, vdataClass, 1>; } } @@ -739,21 +811,37 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName, SDPatternOperator atomic, bit isFP = isFloatType<vdataType>.ret> { let FPAtomic = isFP in { - def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0, [(set vdataType:$vdata, (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), vdataType:$vdata_in))]>, MUBUFAddr64Table <0, NAME # "_RTN">; - def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0, [(set vdataType:$vdata, (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), vdataType:$vdata_in))]>, MUBUFAddr64Table <1, NAME # "_RTN">; - def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, 0>; + def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, 0>; + def _BOTHEN_RTN : 
MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, 0>; + + def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1, + [(set vdataType:$vdata, + (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), + vdataType:$vdata_in))]>, + MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">; + + def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1, + [(set vdataType:$vdata, + (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), + vdataType:$vdata_in))]>, + MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">; + + def _VBUFFER_OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.OffEn, vdataClass, 1>; + def _VBUFFER_IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.IdxEn, vdataClass, 1>; + def _VBUFFER_BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.BothEn, vdataClass, 1>; } } @@ -794,7 +882,7 @@ defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores < "buffer_store_format_xyzw", v4f32 >; -let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { +let OtherPredicates = [HasUnpackedD16VMem], D16Buf = 1 in { let TiedSourceNotRead = 1 in { defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads < "buffer_load_format_d16_x", i32 @@ -821,9 +909,9 @@ let TiedSourceNotRead = 1 in { defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Stores < "buffer_store_format_d16_xyzw", v4i32 >; -} // End HasUnpackedD16VMem. +} // End OtherPredicates = [HasUnpackedD16VMem], D16Buf = 1. -let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { +let OtherPredicates = [HasPackedD16VMem], D16Buf = 1 in { let TiedSourceNotRead = 1 in { defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads < "buffer_load_format_d16_x", f16 @@ -850,7 +938,7 @@ let TiedSourceNotRead = 1 in { defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores < "buffer_store_format_d16_xyzw", v4f16 >; -} // End HasPackedD16VMem. +} // End OtherPredicates = [HasPackedD16VMem], D16Buf = 1. 
defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds < "buffer_load_ubyte", i32 @@ -906,29 +994,61 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, extloadi16_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, zextloadi16_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, sextloadi16_global>; -defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", i32, load_global>; -defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", v2i32, load_global>; -defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", v3i32, load_global>; -defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>; + +foreach vt = Reg32Types.types in { +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", vt, load_global>; +} + +foreach vt = VReg_64.RegTypes in { +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", vt, load_global>; +} + +foreach vt = VReg_96.RegTypes in { +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", vt, load_global>; +} + +foreach vt = VReg_128.RegTypes in { +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", vt, load_global>; +} defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores < - "buffer_store_byte", i32, truncstorei8_global + "buffer_store_byte", i32 >; defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores < - "buffer_store_short", i32, truncstorei16_global + "buffer_store_short", i32 >; defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores < - "buffer_store_dword", i32, store_global + "buffer_store_dword", i32 >; defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx2", v2i32, store_global + "buffer_store_dwordx2", v2i32 >; defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx3", v3i32, store_global + "buffer_store_dwordx3", v3i32 >; defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx4", v4i32, store_global + "buffer_store_dwordx4", v4i32 >; + +defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_BYTE", i32, truncstorei8_global>; +defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_SHORT", i32, truncstorei16_global>; + +foreach vt = Reg32Types.types in { +defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORD", vt, store_global>; +} + +foreach vt = VReg_64.RegTypes in { +defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX2", vt, store_global>; +} + +foreach vt = VReg_96.RegTypes in { +defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX3", vt, store_global>; +} + +foreach vt = VReg_128.RegTypes in { +defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", vt, store_global>; +} + defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics < "buffer_atomic_swap", VGPR_32, i32 >; @@ -1008,10 +1128,11 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < "buffer_atomic_dec_x2", VReg_64, i64 >; -let SubtargetPredicate = HasGFX10_BEncoding in -defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics_RTN < - "buffer_atomic_csub", VGPR_32, i32, int_amdgcn_global_atomic_csub ->; +let OtherPredicates = [HasGFX10_BEncoding] in { + defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics < + "buffer_atomic_csub", VGPR_32, i32, int_amdgcn_global_atomic_csub + >; +} let SubtargetPredicate = isGFX8GFX9 in { def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">; @@ -1198,10 +1319,8 @@ def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> { let AsmOperands = "$cpol"; } -let SubtargetPredicate = isGFX10Plus in { - def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">; - def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">; -} // End SubtargetPredicate = isGFX10Plus +def BUFFER_GL0_INV : 
MUBUF_Invalidate<"buffer_gl0_inv">; +def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">; //===----------------------------------------------------------------------===// // MUBUF Patterns @@ -1211,33 +1330,33 @@ let SubtargetPredicate = isGFX10Plus in { // buffer_load/store_format patterns //===----------------------------------------------------------------------===// -multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, +multiclass MUBUF_LoadIntrinsicPat_Common<SDPatternOperator name, ValueType vt, string opcode, ValueType memoryVt = vt> { defvar st = !if(!eq(memoryVt, vt), name, mubuf_intrinsic_load<name, memoryVt>); def : GCNPat< - (vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; def : GCNPat< - (vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, 0, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; def : GCNPat< - (vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, timm)), (!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; def : GCNPat< - (vt (st v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, i32:$vindex, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, timm)), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), @@ -1246,6 +1365,14 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, >; } +multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode, ValueType memoryVt = vt>{ + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : MUBUF_LoadIntrinsicPat_Common<name, vt, opcode, memoryVt>; + } + defm : MUBUF_LoadIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>; +} + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, i32, "BUFFER_LOAD_FORMAT_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">; @@ -1260,16 +1387,16 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_tfe, v3i32, "BUFFER_LOAD_FORM defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_tfe, v4i32, "BUFFER_LOAD_FORMAT_XYZ_TFE">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_tfe, v5i32, "BUFFER_LOAD_FORMAT_XYZW_TFE">; -let SubtargetPredicate = HasUnpackedD16VMem in { - defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">; - defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">; - defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X_gfx80">; - defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">; - defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v3i32, "BUFFER_LOAD_FORMAT_D16_XYZ_gfx80">; - defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, 
v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">; +let OtherPredicates = [HasUnpackedD16VMem] in { + defm : MUBUF_LoadIntrinsicPat_Common<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">; + defm : MUBUF_LoadIntrinsicPat_Common<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">; + defm : MUBUF_LoadIntrinsicPat_Common<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X_gfx80">; + defm : MUBUF_LoadIntrinsicPat_Common<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">; + defm : MUBUF_LoadIntrinsicPat_Common<SIbuffer_load_format_d16, v3i32, "BUFFER_LOAD_FORMAT_D16_XYZ_gfx80">; + defm : MUBUF_LoadIntrinsicPat_Common<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">; } // End HasUnpackedD16VMem. -let SubtargetPredicate = HasPackedD16VMem in { +let OtherPredicates = [HasPackedD16VMem] in { defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X">; @@ -1298,33 +1425,33 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">; -multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, +multiclass MUBUF_StoreIntrinsicPat_Common<SDPatternOperator name, ValueType vt, string opcode, ValueType memoryVt = vt> { defvar st = !if(!eq(memoryVt, vt), name, mubuf_intrinsic_store<name, memoryVt>); def : GCNPat< - (st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + (st vt:$vdata, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; def : GCNPat< - (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; def : GCNPat< - (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, timm), (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; def : GCNPat< - (st vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, + (st vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, timm), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact) getVregSrcForVT<vt>.ret:$vdata, @@ -1334,6 +1461,14 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, >; } +multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode, ValueType memoryVt = vt> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : MUBUF_StoreIntrinsicPat_Common<name, vt, opcode, memoryVt>; + } + defm : MUBUF_StoreIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>; +} 
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">; @@ -1343,16 +1478,16 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3i32, "BUFFER_STORE_FORMA defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4i32, "BUFFER_STORE_FORMAT_XYZW">; -let SubtargetPredicate = HasUnpackedD16VMem in { - defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">; - defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X_gfx80">; - defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X_gfx80">; - defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">; - defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v3i32, "BUFFER_STORE_FORMAT_D16_XYZ_gfx80">; - defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">; +let OtherPredicates = [HasUnpackedD16VMem] in { + defm : MUBUF_StoreIntrinsicPat_Common<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">; + defm : MUBUF_StoreIntrinsicPat_Common<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X_gfx80">; + defm : MUBUF_StoreIntrinsicPat_Common<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X_gfx80">; + defm : MUBUF_StoreIntrinsicPat_Common<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">; + defm : MUBUF_StoreIntrinsicPat_Common<SIbuffer_store_format_d16, v3i32, "BUFFER_STORE_FORMAT_D16_XYZ_gfx80">; + defm : MUBUF_StoreIntrinsicPat_Common<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">; } // End HasUnpackedD16VMem. 
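// A minimal expansion sketch (illustrative, derived from the
// MUBUF_StoreIntrinsicPat wrapper and the *_Common bodies above): a single
//
//   defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
//
// now emits two sets of GCNPats: one selecting the legacy
// BUFFER_STORE_FORMAT_X_{OFFSET,OFFEN,IDXEN,BOTHEN}_exact pseudos, guarded by
// SubtargetPredicate = HasUnrestrictedSOffset, and a second copy (outside that
// guard) selecting the corresponding BUFFER_STORE_FORMAT_X_VBUFFER_*_exact
// pseudos, which the GFX12 VBUFFER encodings defined later in this patch pick up.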
-let SubtargetPredicate = HasPackedD16VMem in { +let OtherPredicates = [HasPackedD16VMem] in { defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X">; @@ -1383,7 +1518,7 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">; // buffer_atomic patterns //===----------------------------------------------------------------------===// -multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> { +multiclass BufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> { foreach RtnMode = ["ret", "noret"] in { defvar Op = !cast<SDPatternOperator>(OpPrefix @@ -1409,11 +1544,18 @@ multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isInt } // end foreach RtnMode } +multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : BufferAtomicPat_Common<OpPrefix, vt, Inst, isIntr>; + } + defm : BufferAtomicPat_Common<OpPrefix, vt, Inst # "_VBUFFER", isIntr>; +} + multiclass BufferAtomicIntrPat<string OpPrefix, ValueType vt, string Inst> { defm : BufferAtomicPat<OpPrefix, vt, Inst, /* isIntr */ 1>; } -multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> { +multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string Inst> { foreach RtnMode = ["ret", "noret"] in { defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global" @@ -1449,6 +1591,14 @@ multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> } // end foreach RtnMode } +multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : BufferAtomicCmpSwapPat_Common<vt, data_vt, Inst>; + } + defm : BufferAtomicCmpSwapPat_Common<vt, data_vt, Inst # "_VBUFFER">; +} + + foreach Ty = [i32, i64] in { defvar Suffix = !if(!eq(Ty, i64), "_X2", ""); @@ -1471,7 +1621,7 @@ defm : BufferAtomicPat<"atomic_load_udec_wrap_global", Ty, "BUFFER_ATOMIC_DEC" # defm : BufferAtomicCmpSwapPat<i32, v2i32, "BUFFER_ATOMIC_CMPSWAP">; defm : BufferAtomicCmpSwapPat<i64, v2i64, "BUFFER_ATOMIC_CMPSWAP_X2">; -multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, +multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst, list<string> RtnModes = ["ret", "noret"]> { foreach RtnMode = RtnModes in { @@ -1484,7 +1634,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { def : GCNPat< - (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset, + (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, 0)), (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, @@ -1492,7 +1642,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, >; def : GCNPat< - (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, + (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, timm)), (!cast<MUBUF_Pseudo>(Inst # "_IDXEN" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, @@ -1501,7 +1651,7 @@ 
multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, def : GCNPat< (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset, - i32:$soffset, timm:$offset, timm:$cachepolicy, 0)), + (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, 0)), (!cast<MUBUF_Pseudo>(Inst # "_OFFEN" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy) @@ -1509,7 +1659,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, def : GCNPat< (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset, - i32:$soffset, timm:$offset, timm:$cachepolicy, timm)), + (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, timm)), (!cast<MUBUF_Pseudo>(Inst # "_BOTHEN" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), @@ -1520,6 +1670,14 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, } // end foreach RtnMode } +multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, + list<string> RtnModes = ["ret", "noret"]> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : SIBufferAtomicPat_Common<OpPrefix, vt, Inst, RtnModes>; + } + defm : SIBufferAtomicPat_Common<OpPrefix, vt, Inst # "_VBUFFER", RtnModes>; +} + defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", i32, "BUFFER_ATOMIC_SWAP">; defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", f32, "BUFFER_ATOMIC_SWAP">; defm : SIBufferAtomicPat<"SIbuffer_atomic_add", i32, "BUFFER_ATOMIC_ADD">; @@ -1547,6 +1705,9 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i64, "BUFFER_ATOMIC_XOR_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i64, "BUFFER_ATOMIC_INC_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">; +let SubtargetPredicate = HasAtomicCSubNoRtnInsts in +defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>; + let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">; @@ -1562,11 +1723,11 @@ class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag < let HasNoUse = true; } -multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, +multiclass BufferAtomicPatterns_NO_RTN_Common<SDPatternOperator name, ValueType vt, string opcode> { def : GCNPat< (NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0, - 0, i32:$soffset, timm:$offset, + 0, (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, timm:$cachepolicy) @@ -1574,7 +1735,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, def : GCNPat< (NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - 0, i32:$soffset, timm:$offset, + 0, (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, timm), (!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, timm:$cachepolicy) @@ -1582,7 +1743,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, def : GCNPat< (NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0, - i32:$voffset, i32:$soffset, timm:$offset, + i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, 0), (!cast<MUBUF_Pseudo>(opcode # 
_OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, timm:$cachepolicy) @@ -1590,7 +1751,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, def : GCNPat< (NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - i32:$voffset, i32:$soffset, timm:$offset, + i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, timm), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) getVregSrcForVT<vt>.ret:$vdata_in, @@ -1599,87 +1760,111 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, >; } -let SubtargetPredicate = HasAtomicFaddNoRtnInsts in -defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>; +multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, + string opcode> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : BufferAtomicPatterns_NO_RTN_Common<name, vt, opcode>; + } + defm : BufferAtomicPatterns_NO_RTN_Common<name, vt, opcode # "_VBUFFER">; +} -let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in -defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>; +let OtherPredicates = [HasAtomicFaddNoRtnInsts] in + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>; -let SubtargetPredicate = HasAtomicFaddRtnInsts in -defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>; +let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in { + defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>; +} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] -let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in -defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; +let OtherPredicates = [HasAtomicFaddRtnInsts] in + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>; -let SubtargetPredicate = isGFX90APlus in { +let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { + defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; +} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] + +let OtherPredicates = [isGFX90APlus] in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">; } // End SubtargetPredicate = isGFX90APlus -foreach RtnMode = ["ret", "noret"] in { - -defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap - # !if(!eq(RtnMode, "ret"), "", "_noret")); -defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); -defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy), - (timm:$cachepolicy)); - -defvar OffsetResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_OFFSET" # InstSuffix) - (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy); -def : GCNPat< - (Op - i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset, - timm:$offset, timm:$cachepolicy, 0), - !if(!eq(RtnMode, "ret"), - (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS OffsetResDag, VReg_64)), sub0), - OffsetResDag) ->; - -defvar IdxenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_IDXEN" # InstSuffix) - (REG_SEQUENCE VReg_64, 
VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, - CachePolicy); -def : GCNPat< - (Op - i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, - 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm), - !if(!eq(RtnMode, "ret"), - (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS IdxenResDag, VReg_64)), sub0), - IdxenResDag) ->; - -defvar OffenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_OFFEN" # InstSuffix) - (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, - CachePolicy); -def : GCNPat< - (Op - i32:$data, i32:$cmp, v4i32:$rsrc, 0, - i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0), - !if(!eq(RtnMode, "ret"), - (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS OffenResDag, VReg_64)), sub0), - OffenResDag) ->; - -defvar BothenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_BOTHEN" # InstSuffix) - (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy); -def : GCNPat< - (Op - i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, - i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm), - !if(!eq(RtnMode, "ret"), - (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS BothenResDag, VReg_64)), sub0), - BothenResDag) ->; - -} // end foreach RtnMode +multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string Inst> { + foreach RtnMode = ["ret", "noret"] in { + defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap + # !if(!eq(RtnMode, "ret"), "", "_noret")); + defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy), + (timm:$cachepolicy)); + defvar SrcRC = getVregSrcForVT<vt>.ret; + defvar DataRC = getVregSrcForVT<data_vt>.ret; + defvar SubLo = !if(!eq(vt, i32), sub0, sub0_sub1); + defvar SubHi = !if(!eq(vt, i32), sub1, sub2_sub3); + + defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) + (REG_SEQUENCE DataRC, SrcRC:$data, SubLo, SrcRC:$cmp, SubHi), + SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy); + def : GCNPat< + (vt (Op + vt:$data, vt:$cmp, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset), + timm:$offset, timm:$cachepolicy, 0)), + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG OffsetResDag, SubLo), + OffsetResDag) + >; + + defvar IdxenResDag = (!cast<MUBUF_Pseudo>(Inst # "_IDXEN" # InstSuffix) + (REG_SEQUENCE DataRC, SrcRC:$data, SubLo, SrcRC:$cmp, SubHi), + VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, + CachePolicy); + def : GCNPat< + (vt (Op + vt:$data, vt:$cmp, v4i32:$rsrc, i32:$vindex, + 0, (BUFSOffset i32:$soffset), timm:$offset, + timm:$cachepolicy, timm)), + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG IdxenResDag, SubLo), + IdxenResDag) + >; + + defvar OffenResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFEN" # InstSuffix) + (REG_SEQUENCE DataRC, SrcRC:$data, SubLo, SrcRC:$cmp, SubHi), + VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, + CachePolicy); + def : GCNPat< + (vt (Op + vt:$data, vt:$cmp, v4i32:$rsrc, 0, + i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, + timm:$cachepolicy, 0)), + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG OffenResDag, SubLo), + OffenResDag) + >; + + defvar BothenResDag = (!cast<MUBUF_Pseudo>(Inst # "_BOTHEN" # InstSuffix) + (REG_SEQUENCE DataRC, SrcRC:$data, SubLo, SrcRC:$cmp, SubHi), + (REG_SEQUENCE 
VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), + SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy); + def : GCNPat< + (vt (Op + vt:$data, vt:$cmp, v4i32:$rsrc, i32:$vindex, + i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, + timm:$cachepolicy, timm)), + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG BothenResDag, SubLo), + BothenResDag) + >; + } // end foreach RtnMode +} + +multiclass SIBufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : SIBufferAtomicCmpSwapPat_Common<vt, data_vt, Inst>; + } + defm : SIBufferAtomicCmpSwapPat_Common<vt, data_vt, Inst # "_VBUFFER">; +} + +defm : SIBufferAtomicCmpSwapPat<i32, v2i32, "BUFFER_ATOMIC_CMPSWAP">; +defm : SIBufferAtomicCmpSwapPat<i64, v2i64, "BUFFER_ATOMIC_CMPSWAP_X2">; class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt, PatFrag constant_ld> : GCNPat < @@ -1713,105 +1898,125 @@ defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFF defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, atomic_load_64_global>; } // End SubtargetPredicate = isGFX6GFX7 -multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, +multiclass MUBUFLoad_PatternOffset_Common <string Instr, ValueType vt, PatFrag ld> { - def : GCNPat < (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset))), - (Instr_OFFSET $srsrc, $soffset, $offset) + (!cast<MUBUF_Pseudo>(Instr # "_OFFSET") $srsrc, $soffset, $offset) >; } +multiclass MUBUFLoad_PatternOffset <string Instr, ValueType vt, + PatFrag ld> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : MUBUFLoad_PatternOffset_Common<Instr, vt, ld>; + } + defm : MUBUFLoad_PatternOffset_Common<Instr # "_VBUFFER", vt, ld>; +} + let OtherPredicates = [Has16BitInsts] in { -defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_constant>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_constant>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_constant>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_global>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_global>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_global>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE", i16, sextloadi8_constant>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, extloadi8_constant>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, zextloadi8_constant>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE", i16, sextloadi8_global>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, extloadi8_global>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, zextloadi8_global>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_OFFSET, i16, load_global>; +defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_USHORT", i16, load_global>; } // End OtherPredicates = [Has16BitInsts] -multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen, - MUBUF_Pseudo InstrOffset, +multiclass MUBUFScratchLoadPat_Common <string Instr, ValueType vt, PatFrag ld> { def : GCNPat < (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, i32:$offset))), - (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0) + (!cast<MUBUF_Pseudo>(Instr # _OFFEN) $vaddr, $srsrc, $soffset, $offset, 0, 0) >; def : GCNPat < (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, i32:$offset))), - (InstrOffset $srsrc, $soffset, $offset, 0, 0) + 
(!cast<MUBUF_Pseudo>(Instr # _OFFSET) $srsrc, $soffset, $offset, 0, 0) >; } +multiclass MUBUFScratchLoadPat <string Instr, + ValueType vt, PatFrag ld> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : MUBUFScratchLoadPat_Common<Instr, vt, ld>; + } + defm : MUBUFScratchLoadPat_Common<Instr # "_VBUFFER", vt, ld>; +} + // XXX - Is it possible to have a complex pattern in a PatFrag? -multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen, - MUBUF_Pseudo InstrOffset, +multiclass MUBUFScratchLoadPat_D16_Common <string Instr, ValueType vt, PatFrag ld_frag> { def : GCNPat < (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, i32:$offset), vt:$in), - (InstrOffen $vaddr, $srsrc, $soffset, $offset, $in) + (!cast<MUBUF_Pseudo>(Instr # _OFFEN) $vaddr, $srsrc, $soffset, $offset, $in) >; def : GCNPat < (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, i32:$offset), vt:$in), - (InstrOffset $srsrc, $soffset, $offset, $in) + (!cast<MUBUF_Pseudo>(Instr # _OFFSET) $srsrc, $soffset, $offset, $in) >; } +multiclass MUBUFScratchLoadPat_D16 <string Instr, + ValueType vt, PatFrag ld_frag> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : MUBUFScratchLoadPat_D16_Common<Instr, vt, ld_frag>; + } + defm : MUBUFScratchLoadPat_D16_Common<Instr # "_VBUFFER", vt, ld_frag>; +} + let OtherPredicates = [DisableFlatScratch] in { -defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, extloadi8_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, zextloadi8_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET, i32, sextloadi16_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, zextloadi16_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i32, sextloadi8_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i32, extloadi8_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i32, zextloadi8_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i16, sextloadi8_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, extloadi8_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, zextloadi8_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SSHORT", i32, sextloadi16_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i32, extloadi16_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i32, zextloadi16_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i16, load_private>; foreach vt = Reg32Types.types in { -defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, vt, load_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORD", vt, load_private>; } -defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>; -defm : MUBUFScratchLoadPat 
<BUFFER_LOAD_DWORDX3_OFFEN, BUFFER_LOAD_DWORDX3_OFFSET, v3i32, load_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX2", v2i32, load_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX3", v3i32, load_private>; +defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX4", v4i32, load_private>; let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in { -defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2i16, load_d16_hi_private>; -defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2i16, az_extloadi8_d16_hi_private>; -defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2i16, sextloadi8_d16_hi_private>; -defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2f16, load_d16_hi_private>; -defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2f16, az_extloadi8_d16_hi_private>; -defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2f16, sextloadi8_d16_hi_private>; +defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SHORT_D16_HI", v2i16, load_d16_hi_private>; +defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_UBYTE_D16_HI", v2i16, az_extloadi8_d16_hi_private>; +defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SBYTE_D16_HI", v2i16, sextloadi8_d16_hi_private>; +defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SHORT_D16_HI", v2f16, load_d16_hi_private>; +defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_UBYTE_D16_HI", v2f16, az_extloadi8_d16_hi_private>; +defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SBYTE_D16_HI", v2f16, sextloadi8_d16_hi_private>; -defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2i16, load_d16_lo_private>; -defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2i16, az_extloadi8_d16_lo_private>; -defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2i16, sextloadi8_d16_lo_private>; -defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2f16, load_d16_lo_private>; -defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2f16, az_extloadi8_d16_lo_private>; -defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2f16, sextloadi8_d16_lo_private>; +defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SHORT_D16", v2i16, load_d16_lo_private>; +defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_UBYTE_D16", v2i16, az_extloadi8_d16_lo_private>; +defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SBYTE_D16", v2i16, sextloadi8_d16_lo_private>; +defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SHORT_D16", v2f16, load_d16_lo_private>; +defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_UBYTE_D16", v2f16, az_extloadi8_d16_lo_private>; +defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SBYTE_D16", v2f16, sextloadi8_d16_lo_private>; } } // End OtherPredicates = [DisableFlatScratch] multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag atomic_st> { - // Store follows atomic op convention so address is first def : GCNPat < - (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), vt:$val), + (atomic_st vt:$val, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset)), 
(Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset) >; def : GCNPat < - (atomic_st (MUBUFOffset v4i32:$rsrc, i32:$soffset, i32:$offset), vt:$val), + (atomic_st vt:$val, (MUBUFOffset v4i32:$rsrc, i32:$soffset, i32:$offset)), (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset)) >; } @@ -1825,56 +2030,72 @@ defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWOR } // End Predicates = isGFX6GFX7 -multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, - PatFrag st> { +multiclass MUBUFStore_PatternOffset_Common <string Instr, ValueType vt, + PatFrag st> { def : GCNPat < (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset)), - (Instr_OFFSET $vdata, $srsrc, $soffset, $offset) + (!cast<MUBUF_Pseudo>(Instr # "_OFFSET") $vdata, $srsrc, $soffset, $offset) >; } -defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>; -defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, store_global>; +multiclass MUBUFStore_PatternOffset <string Instr, ValueType vt, + PatFrag st> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : MUBUFStore_PatternOffset_Common<Instr, vt, st>; + } + defm : MUBUFStore_PatternOffset_Common<Instr # "_VBUFFER", vt, st>; +} + +defm : MUBUFStore_PatternOffset <"BUFFER_STORE_BYTE", i16, truncstorei8_global>; +defm : MUBUFStore_PatternOffset <"BUFFER_STORE_SHORT", i16, store_global>; -multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen, - MUBUF_Pseudo InstrOffset, +multiclass MUBUFScratchStorePat_Common <string Instr, ValueType vt, PatFrag st, RegisterClass rc = VGPR_32> { def : GCNPat < (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, i32:$offset)), - (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0) + (!cast<MUBUF_Pseudo>(Instr # _OFFEN) rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0) >; def : GCNPat < (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, i32:$offset)), - (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0) + (!cast<MUBUF_Pseudo>(Instr # _OFFSET) rc:$value, $srsrc, $soffset, $offset, 0, 0) >; } +multiclass MUBUFScratchStorePat <string Instr, + ValueType vt, PatFrag st, + RegisterClass rc = VGPR_32> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : MUBUFScratchStorePat_Common<Instr, vt, st, rc>; + } + defm : MUBUFScratchStorePat_Common<Instr # "_VBUFFER", vt, st, rc>; +} + let OtherPredicates = [DisableFlatScratch] in { -defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i32, truncstorei8_private>; -defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i32, truncstorei16_private>; -defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>; -defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>; +defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE", i32, truncstorei8_private>; +defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT", i32, truncstorei16_private>; +defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE", i16, truncstorei8_private>; +defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT", i16, store_private>; foreach vt = Reg32Types.types in { -defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, vt, store_private>; +defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORD", vt, store_private>; } -defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private, VReg_64>; -defm : 
MUBUFScratchStorePat <BUFFER_STORE_DWORDX3_OFFEN, BUFFER_STORE_DWORDX3_OFFSET, v3i32, store_private, VReg_96>; -defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private, VReg_128>; +defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORDX2", v2i32, store_private, VReg_64>; +defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORDX3", v3i32, store_private, VReg_96>; +defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORDX4", v4i32, store_private, VReg_128>; let OtherPredicates = [HasD16LoadStore, DisableFlatScratch] in { // Hiding the extract high pattern in the PatFrag seems to not // automatically increase the complexity. let AddedComplexity = 1 in { -defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_D16_HI_OFFEN, BUFFER_STORE_SHORT_D16_HI_OFFSET, i32, store_hi16_private>; -defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_D16_HI_OFFEN, BUFFER_STORE_BYTE_D16_HI_OFFSET, i32, truncstorei8_hi16_private>; +defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT_D16_HI", i32, store_hi16_private>; +defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE_D16_HI", i32, truncstorei8_hi16_private>; } } } // End OtherPredicates = [DisableFlatScratch] @@ -1887,12 +2108,12 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_D16_HI_OFFEN, BUFFER_STORE_BYTE_D // tbuffer_load/store_format patterns //===----------------------------------------------------------------------===// -multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, +multiclass MTBUF_LoadIntrinsicPat_Common<SDPatternOperator name, ValueType vt, string opcode, ValueType memoryVt = vt> { defvar st = !if(!eq(memoryVt, vt), name, mtbuf_intrinsic_load<name, memoryVt>); def : GCNPat< - (vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset), timm:$offset, timm:$format, timm:$auxiliary, 0)), (!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (as_i8timm $format), @@ -1900,7 +2121,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, >; def : GCNPat< - (vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset), timm:$offset, timm:$format, timm:$auxiliary, timm)), (!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (as_i8timm $format), @@ -1908,7 +2129,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, >; def : GCNPat< - (vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, 0, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, timm:$format, timm:$auxiliary, 0)), (!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (as_i8timm $format), @@ -1916,7 +2137,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, >; def : GCNPat< - (vt (st v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, + (vt (st v4i32:$rsrc, i32:$vindex, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, timm:$format, timm:$auxiliary, timm)), (!cast<MTBUF_Pseudo>(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), @@ -1926,6 +2147,14 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, >; } +multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode, ValueType memoryVt = vt> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : MTBUF_LoadIntrinsicPat_Common<name, vt, 
opcode, memoryVt>; + } + defm : MTBUF_LoadIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>; +} + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, i32, "TBUFFER_LOAD_FORMAT_X">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3i32, "TBUFFER_LOAD_FORMAT_XYZ">; @@ -1935,15 +2164,15 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3f32, "TBUFFER_LOAD_FORMAT_XYZ">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">; -let SubtargetPredicate = HasUnpackedD16VMem in { - defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">; - defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">; - defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">; - defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v3i32, "TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80">; - defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">; +let OtherPredicates = [HasUnpackedD16VMem] in { + defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">; + defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">; + defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">; + defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, v3i32, "TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80">; + defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">; } // End HasUnpackedD16VMem. -let SubtargetPredicate = HasPackedD16VMem in { +let OtherPredicates = [HasPackedD16VMem] in { defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">; @@ -1951,12 +2180,12 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">; } // End HasPackedD16VMem. 
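// Note: the unpacked-d16 "_gfx80" variants above are instantiated through the
// *_Common multiclasses directly rather than through the two-level wrappers,
// presumably because those opcodes exist only on pre-GFX12 targets and have no
// "_VBUFFER" pseudo counterparts to select.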
-multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, +multiclass MTBUF_StoreIntrinsicPat_Common<SDPatternOperator name, ValueType vt, string opcode, ValueType memoryVt = vt> { defvar st = !if(!eq(memoryVt, vt), name, mtbuf_intrinsic_store<name, memoryVt>); def : GCNPat< - (st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, + (st vt:$vdata, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset), timm:$offset, timm:$format, timm:$auxiliary, 0), (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (as_i8timm $format), @@ -1964,7 +2193,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, >; def : GCNPat< - (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, + (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset), timm:$offset, timm:$format, timm:$auxiliary, timm), (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (as_i8timm $format), @@ -1972,7 +2201,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, >; def : GCNPat< - (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, + (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, timm:$format, timm:$auxiliary, 0), (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (as_i8timm $format), @@ -1980,7 +2209,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, >; def : GCNPat< - (st vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, + (st vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset, timm:$format, timm:$auxiliary, timm), (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact) getVregSrcForVT<vt>.ret:$vdata, @@ -1990,6 +2219,14 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, >; } +multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode, ValueType memoryVt = vt> { + let SubtargetPredicate = HasUnrestrictedSOffset in { + defm : MTBUF_StoreIntrinsicPat_Common<name, vt, opcode, memoryVt>; + } + defm : MTBUF_StoreIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>; +} + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, i32, "TBUFFER_STORE_FORMAT_X">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2i32, "TBUFFER_STORE_FORMAT_XY">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3i32, "TBUFFER_STORE_FORMAT_XYZ">; @@ -1999,15 +2236,15 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY" defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3f32, "TBUFFER_STORE_FORMAT_XYZ">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">; -let SubtargetPredicate = HasUnpackedD16VMem in { - defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">; - defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X_gfx80">; - defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">; - defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v3i32, "TBUFFER_STORE_FORMAT_D16_XYZ_gfx80">; - defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">; +let OtherPredicates = [HasUnpackedD16VMem] in { + defm : 
MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">; + defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X_gfx80">; + defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">; + defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, v3i32, "TBUFFER_STORE_FORMAT_D16_XYZ_gfx80">; + defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">; } // End HasUnpackedD16VMem. -let SubtargetPredicate = HasPackedD16VMem in { +let OtherPredicates = [HasPackedD16VMem] in { defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">; @@ -2044,6 +2281,7 @@ class MUBUF_Real_gfx11<bits<8> op, MUBUF_Pseudo ps, let Inst{53} = ps.tfe; let Inst{54} = ps.offen; let Inst{55} = ps.idxen; + let SubtargetPredicate = isGFX11Only; } class Base_MUBUF_Real_Atomic_gfx11<bits<8> op, MUBUF_Pseudo ps, @@ -2067,15 +2305,98 @@ class MUBUF_Real_gfx10<bits<8> op, MUBUF_Pseudo ps> : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10> { let Inst{15} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value); let Inst{25} = op{7}; + let SubtargetPredicate = isGFX10Only; } class MUBUF_Real_gfx6_gfx7<bits<8> op, MUBUF_Pseudo ps> : Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> { let Inst{15} = ps.addr64; + let SubtargetPredicate = isGFX6GFX7; +} + +//===----------------------------------------------------------------------===// +// Base ENC_VBUFFER for GFX12. +//===----------------------------------------------------------------------===// + +class VBUFFER_Real <BUF_Pseudo ps, string real_name = ps.Mnemonic> : + InstSI <ps.OutOperandList, ps.InOperandList, real_name # ps.AsmOperands, []>, Enc96 { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + let VM_CNT = 1; + let EXP_CNT = 1; + + // copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let OtherPredicates = ps.OtherPredicates; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; + let IsAtomicRet = ps.IsAtomicRet; + let IsAtomicNoRet = ps.IsAtomicNoRet; + let VALU = ps.VALU; + let LGKM_CNT = ps.LGKM_CNT; + + bits<24> offset; + bits<8> vaddr; + bits<10> vdata; + + bits<7> srsrc; + bits<7> soffset; + bits<6> cpol; + + let Inst{95-72} = !if(ps.has_offset, offset, ?); + let Inst{71-64} = !if(ps.has_vaddr, vaddr, ?); + let Inst{39-32} = !if(ps.has_vdata, vdata{7-0}, ?); + + let Inst{47-41} = !if(ps.has_srsrc, srsrc, ?); + let Inst{49-48} = 0b00; + let Inst{6-0} = !if(ps.has_soffset, soffset, ?); + let Inst{22} = ps.tfe; + let Inst{62} = ps.offen; + let Inst{63} = ps.idxen; + + let Inst{54-53} = cpol{2-1}; // th{2-1} + let Inst{52} = !if(ps.IsAtomicRet, 1, cpol{0}); // th{0} + let Inst{51-50} = cpol{4-3}; // scope + + let Inst{31-26} = 0b110001; +} + +class VBUFFER_MUBUF_Real_gfx12<bits<8> op, MUBUF_Pseudo ps, + string real_name = ps.Mnemonic> : + VBUFFER_Real<ps, real_name>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX12> { + + let MUBUF = 1; + + // Set the last bit of format to 1 to avoid round-trip issues, as 
some tools + // print BUF_FMT_INVALID for format 0. + let Inst{55} = 0b1; + let Inst{21-14} = op; + let SubtargetPredicate = isGFX12Only; +} + +class VBUFFER_MTBUF_Real_gfx12<bits<4> op, MTBUF_Pseudo ps, + string real_name = ps.Mnemonic> : + VBUFFER_Real<ps, real_name>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX12> { + + let MTBUF = 1; + + bits<7> format; + + let Inst{17-14} = op; + let Inst{21-18} = 0b1000; + let Inst{61-55} = format; } //===----------------------------------------------------------------------===// -// MUBUF - GFX11. +// MUBUF - GFX11, GFX12. //===----------------------------------------------------------------------===// // Shortcut to default Mnemonic from MUBUF_Pseudo. Hides the cast to the @@ -2085,19 +2406,43 @@ class get_MUBUF_ps<string name> { } // gfx11 instruction that accept both old and new assembler name. -class Pre_gfx11_MUBUF_Name <string mnemonic, string real_name> : +class Mnem_gfx11_gfx12 <string mnemonic, string real_name> : MnemonicAlias<mnemonic, real_name>, Requires<[isGFX11Plus]>; +class Mnem_gfx11 <string mnemonic, string real_name> : + MnemonicAlias<mnemonic, real_name>, Requires<[isGFX11Only]>; + +class Mnem_gfx12 <string mnemonic, string real_name> : + MnemonicAlias<mnemonic, real_name>, Requires<[isGFX12Plus]>; + class MUBUF_Real_gfx11_impl<bits<8> op, string ps_name, string real_name> : MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(ps_name), real_name>; -let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in + +class VBUFFER_MUBUF_Real_gfx12_impl<bits<8> op, string ps_name, string real_name> : + VBUFFER_MUBUF_Real_gfx12<op, !cast<MUBUF_Pseudo>(ps_name), real_name>; + multiclass MUBUF_Real_AllAddr_gfx11_Renamed_Impl2<bits<8> op, string real_name> { - def _BOTHEN_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_BOTHEN", real_name>; - def _IDXEN_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_IDXEN", real_name>; - def _OFFEN_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_OFFEN", real_name>; - def _OFFSET_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_OFFSET", real_name>; + let DecoderNamespace = "GFX11" in { + def _BOTHEN_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_BOTHEN", real_name>; + def _IDXEN_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_IDXEN", real_name>; + def _OFFEN_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_OFFEN", real_name>; + def _OFFSET_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_OFFSET", real_name>; + } } +multiclass MUBUF_Real_AllAddr_gfx12_Renamed_Impl2<bits<8> op, string real_name> { + let DecoderNamespace = "GFX12" in { + def _BOTHEN_gfx12 : VBUFFER_MUBUF_Real_gfx12_impl<op, NAME # "_VBUFFER_BOTHEN", real_name>; + def _IDXEN_gfx12 : VBUFFER_MUBUF_Real_gfx12_impl<op, NAME # "_VBUFFER_IDXEN", real_name>; + def _OFFEN_gfx12 : VBUFFER_MUBUF_Real_gfx12_impl<op, NAME # "_VBUFFER_OFFEN", real_name>; + def _OFFSET_gfx12 : VBUFFER_MUBUF_Real_gfx12_impl<op, NAME # "_VBUFFER_OFFSET", real_name>; + } +} + +multiclass MUBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl2<bits<8> op, string real_name> : + MUBUF_Real_AllAddr_gfx11_Renamed_Impl2<op, real_name>, + MUBUF_Real_AllAddr_gfx12_Renamed_Impl2<op, real_name>; + multiclass MUBUF_Real_AllAddr_gfx11_Renamed_Impl<bits<8> op, string real_name, bit hasTFE = 1> { defm NAME : MUBUF_Real_AllAddr_gfx11_Renamed_Impl2<op, real_name>; @@ -2105,136 +2450,196 @@ multiclass MUBUF_Real_AllAddr_gfx11_Renamed_Impl<bits<8> op, string real_name, defm _TFE : MUBUF_Real_AllAddr_gfx11_Renamed_Impl2<op, real_name>; } -// Non-renamed, non-atomic gfx11 mubuf instructions. 
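// Worked example of the VBUFFER cache-policy encoding, derived only from the
// bit assignments in VBUFFER_Real above (the cpol value is illustrative): for
// a non-atomic instruction with cpol = 0b011010,
//   scope = cpol{4-3} = 0b11  -> Inst{51-50} = 0b11
//   th    = cpol{2-0} = 0b010 -> Inst{54-53} = cpol{2-1} = 0b01, and
//                                Inst{52}    = cpol{0}   = 0.
// For atomic pseudos with IsAtomicRet set, Inst{52} (th{0}) is forced to 1
// regardless of cpol{0}.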
+multiclass MUBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl<bits<8> op, string real_name, + bit hasTFE = 1> { + defm NAME : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl2<op, real_name>; + if hasTFE then + defm _TFE : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl2<op, real_name>; +} + +// Non-renamed, non-atomic gfx11/gfx12 mubuf instructions. multiclass MUBUF_Real_AllAddr_gfx11<bits<8> op, bit hasTFE = 1> : MUBUF_Real_AllAddr_gfx11_Renamed_Impl<op, get_MUBUF_ps<NAME>.Mnemonic, hasTFE>; -multiclass MUBUF_Real_AllAddr_gfx11_Renamed<bits<8> op, string real_name> : - MUBUF_Real_AllAddr_gfx11_Renamed_Impl<op, real_name> { - def : Pre_gfx11_MUBUF_Name<get_MUBUF_ps<NAME>.Mnemonic, real_name>; +multiclass MUBUF_Real_AllAddr_gfx11_gfx12<bits<8> op, bit hasTFE = 1> : + MUBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl<op, get_MUBUF_ps<NAME>.Mnemonic, hasTFE>; + +multiclass MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<bits<8> op, string real_name> : + MUBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl<op, real_name> { + def : Mnem_gfx11_gfx12<get_MUBUF_ps<NAME>.Mnemonic, real_name>; } class MUBUF_Real_Atomic_gfx11_impl<bits<8> op, string ps_name, string real_name> : Base_MUBUF_Real_Atomic_gfx11<op, !cast<MUBUF_Pseudo>(ps_name), real_name>; -let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in + +class MUBUF_Real_Atomic_gfx12_impl<bits<8> op, string ps_name, + string real_name> : + VBUFFER_MUBUF_Real_gfx12<op, !cast<MUBUF_Pseudo>(ps_name), real_name>; + multiclass MUBUF_Real_Atomic_gfx11_Renamed_impl<bits<8> op, bit is_return, string real_name> { - defvar Rtn = !if(!eq(is_return, 1), "_RTN", ""); - def _BOTHEN#Rtn#_gfx11 : - MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_BOTHEN" # Rtn, real_name>, - AtomicNoRet<NAME # "_BOTHEN_gfx11", is_return>; - def _IDXEN#Rtn#_gfx11 : - MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_IDXEN" # Rtn, real_name>, - AtomicNoRet<NAME # "_IDXEN_gfx11", is_return>; - def _OFFEN#Rtn#_gfx11 : - MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_OFFEN" # Rtn, real_name>, - AtomicNoRet<NAME # "_OFFEN_gfx11", is_return>; - def _OFFSET#Rtn#_gfx11 : - MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_OFFSET" # Rtn, real_name>, - AtomicNoRet<NAME # "_OFFSET_gfx11", is_return>; -} - -// Non-renamed gfx11 mubuf atomic. 
-multiclass MUBUF_Real_Atomic_gfx11<bits<8> op> : - MUBUF_Real_Atomic_gfx11_Renamed_impl<op, 0, get_MUBUF_ps<NAME>.Mnemonic>, - MUBUF_Real_Atomic_gfx11_Renamed_impl<op, 1, get_MUBUF_ps<NAME>.Mnemonic>; + let DecoderNamespace = "GFX11" in { + defvar Rtn = !if(!eq(is_return, 1), "_RTN", ""); + def _BOTHEN#Rtn#_gfx11 : + MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_BOTHEN" # Rtn, real_name>, + AtomicNoRet<NAME # "_BOTHEN_gfx11", is_return>; + def _IDXEN#Rtn#_gfx11 : + MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_IDXEN" # Rtn, real_name>, + AtomicNoRet<NAME # "_IDXEN_gfx11", is_return>; + def _OFFEN#Rtn#_gfx11 : + MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_OFFEN" # Rtn, real_name>, + AtomicNoRet<NAME # "_OFFEN_gfx11", is_return>; + def _OFFSET#Rtn#_gfx11 : + MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_OFFSET" # Rtn, real_name>, + AtomicNoRet<NAME # "_OFFSET_gfx11", is_return>; + } +} + +multiclass MUBUF_Real_Atomic_gfx12_Renamed_impl<bits<8> op, bit is_return, + string real_name> { + let DecoderNamespace = "GFX12" in { + defvar Rtn = !if(!eq(is_return, 1), "_RTN", ""); + def _BOTHEN#Rtn#_gfx12 : + MUBUF_Real_Atomic_gfx12_impl<op, NAME # "_VBUFFER_BOTHEN" # Rtn, real_name>, + AtomicNoRet<NAME # "_BOTHEN_gfx12", is_return>; + def _IDXEN#Rtn#_gfx12 : + MUBUF_Real_Atomic_gfx12_impl<op, NAME # "_VBUFFER_IDXEN" # Rtn, real_name>, + AtomicNoRet<NAME # "_IDXEN_gfx12", is_return>; + def _OFFEN#Rtn#_gfx12 : + MUBUF_Real_Atomic_gfx12_impl<op, NAME # "_VBUFFER_OFFEN" # Rtn, real_name>, + AtomicNoRet<NAME # "_OFFEN_gfx12", is_return>; + def _OFFSET#Rtn#_gfx12 : + MUBUF_Real_Atomic_gfx12_impl<op, NAME # "_VBUFFER_OFFSET" # Rtn, real_name>, + AtomicNoRet<NAME # "_OFFSET_gfx12", is_return>; + } +} + +multiclass MUBUF_Real_Atomic_gfx11_gfx12_Renamed_impl<bits<8> op, bit is_return, + string real_name> : + MUBUF_Real_Atomic_gfx11_Renamed_impl<op, is_return, real_name>, + MUBUF_Real_Atomic_gfx12_Renamed_impl<op, is_return, real_name>; + +// Non-renamed gfx11/gfx12 mubuf atomic. 
+multiclass MUBUF_Real_Atomic_gfx11_gfx12<bits<8> op> : + MUBUF_Real_Atomic_gfx11_gfx12_Renamed_impl<op, 0, get_MUBUF_ps<NAME>.Mnemonic>, + MUBUF_Real_Atomic_gfx11_gfx12_Renamed_impl<op, 1, get_MUBUF_ps<NAME>.Mnemonic>; + +multiclass MUBUF_Real_Atomic_gfx12<bits<8> op> : + MUBUF_Real_Atomic_gfx12_Renamed_impl<op, 0, get_MUBUF_ps<NAME>.Mnemonic>, + MUBUF_Real_Atomic_gfx12_Renamed_impl<op, 1, get_MUBUF_ps<NAME>.Mnemonic>; multiclass MUBUF_Real_Atomic_gfx11_Renamed<bits<8> op, string real_name> : MUBUF_Real_Atomic_gfx11_Renamed_impl<op, 0, real_name>, - MUBUF_Real_Atomic_gfx11_Renamed_impl<op, 1, real_name> { - def : Pre_gfx11_MUBUF_Name<get_MUBUF_ps<NAME>.Mnemonic, real_name>; + MUBUF_Real_Atomic_gfx11_Renamed_impl<op, 1, real_name> { + def : Mnem_gfx11_gfx12<get_MUBUF_ps<NAME>.Mnemonic, real_name>; +} + +multiclass MUBUF_Real_Atomic_gfx11_gfx12_Renamed<bits<8> op, string real_name> : + MUBUF_Real_Atomic_gfx11_gfx12_Renamed_impl<op, 0, real_name>, + MUBUF_Real_Atomic_gfx11_gfx12_Renamed_impl<op, 1, real_name> { + def : Mnem_gfx11_gfx12<get_MUBUF_ps<NAME>.Mnemonic, real_name>; } -let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { +multiclass MUBUF_Real_Atomic_gfx11_gfx12_Renamed_gfx12_Renamed<bits<8> op, string gfx12_name, string gfx11_name> : + MUBUF_Real_Atomic_gfx11_Renamed_impl<op, 0, gfx11_name>, + MUBUF_Real_Atomic_gfx11_Renamed_impl<op, 1, gfx11_name>, + MUBUF_Real_Atomic_gfx12_Renamed_impl<op, 0, gfx12_name>, + MUBUF_Real_Atomic_gfx12_Renamed_impl<op, 1, gfx12_name> { + def : Mnem_gfx11<get_MUBUF_ps<NAME>.Mnemonic, gfx11_name>; + def : Mnem_gfx12<get_MUBUF_ps<NAME>.Mnemonic, gfx12_name>; + def : Mnem_gfx12<gfx11_name, gfx12_name>; +} + +let DecoderNamespace = "GFX11" in { def BUFFER_GL0_INV_gfx11 : MUBUF_Real_gfx11<0x02B, BUFFER_GL0_INV>; def BUFFER_GL1_INV_gfx11 : MUBUF_Real_gfx11<0x02C, BUFFER_GL1_INV>; } -defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_gfx11_Renamed<0x014, "buffer_load_b32">; -defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_gfx11_Renamed<0x015, "buffer_load_b64">; -defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_gfx11_Renamed<0x016, "buffer_load_b96">; -defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_gfx11_Renamed<0x017, "buffer_load_b128">; -defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x020, "buffer_load_d16_b16">; -defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x008, "buffer_load_d16_format_x">; -defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx11_Renamed<0x009, "buffer_load_d16_format_xy">; -defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx11_Renamed<0x00a, "buffer_load_d16_format_xyz">; -defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx11_Renamed<0x00b, "buffer_load_d16_format_xyzw">; -defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x023, "buffer_load_d16_hi_b16">; -defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x026, "buffer_load_d16_hi_format_x">; -defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x022, "buffer_load_d16_hi_i8">; -defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x021, "buffer_load_d16_hi_u8">; -defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01f, "buffer_load_d16_i8">; -defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01e, "buffer_load_d16_u8">; -defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x000>; -defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_gfx11<0x001>; -defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx11<0x002>; -defm BUFFER_LOAD_FORMAT_XYZW : 
MUBUF_Real_AllAddr_gfx11<0x003>; -defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x011, "buffer_load_i8">; -defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x013, "buffer_load_i16">; -defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x010, "buffer_load_u8">; -defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x012, "buffer_load_u16">; +defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x014, "buffer_load_b32">; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x015, "buffer_load_b64">; +defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x016, "buffer_load_b96">; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x017, "buffer_load_b128">; +defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x020, "buffer_load_d16_b16">; +defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x008, "buffer_load_d16_format_x">; +defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x009, "buffer_load_d16_format_xy">; +defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00a, "buffer_load_d16_format_xyz">; +defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00b, "buffer_load_d16_format_xyzw">; +defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x023, "buffer_load_d16_hi_b16">; +defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x026, "buffer_load_d16_hi_format_x">; +defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x022, "buffer_load_d16_hi_i8">; +defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x021, "buffer_load_d16_hi_u8">; +defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x01f, "buffer_load_d16_i8">; +defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x01e, "buffer_load_d16_u8">; +defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_gfx11_gfx12<0x000>; +defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_gfx11_gfx12<0x001>; +defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx11_gfx12<0x002>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx11_gfx12<0x003>; +defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x011, "buffer_load_i8">; +defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x013, "buffer_load_i16">; +defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x010, "buffer_load_u8">; +defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x012, "buffer_load_u16">; defm BUFFER_LOAD_LDS_B32 : MUBUF_Real_AllAddr_gfx11<0x031, 0>; defm BUFFER_LOAD_LDS_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x032, 0>; defm BUFFER_LOAD_LDS_I8 : MUBUF_Real_AllAddr_gfx11<0x02e, 0>; defm BUFFER_LOAD_LDS_I16 : MUBUF_Real_AllAddr_gfx11<0x030, 0>; defm BUFFER_LOAD_LDS_U8 : MUBUF_Real_AllAddr_gfx11<0x02d, 0>; defm BUFFER_LOAD_LDS_U16 : MUBUF_Real_AllAddr_gfx11<0x02f, 0>; -defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x018, "buffer_store_b8">; -defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x019, "buffer_store_b16">; -defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_gfx11_Renamed<0x01A, "buffer_store_b32">; -defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01B, "buffer_store_b64">; -defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01C, "buffer_store_b96">; -defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01D, "buffer_store_b128">; -defm BUFFER_STORE_FORMAT_D16_X : 
MUBUF_Real_AllAddr_gfx11_Renamed<0x00C, "buffer_store_d16_format_x">; -defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx11_Renamed<0x00D, "buffer_store_d16_format_xy">; -defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx11_Renamed<0x00E, "buffer_store_d16_format_xyz">; -defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx11_Renamed<0x00F, "buffer_store_d16_format_xyzw">; -defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x024, "buffer_store_d16_hi_b8">; -defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x025, "buffer_store_d16_hi_b16">; -defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x027, "buffer_store_d16_hi_format_x">; -defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x004>; -defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_gfx11<0x005>; -defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx11<0x006>; -defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx11<0x007>; -defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_gfx11<0x056>; -defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_gfx11_Renamed<0x035, "buffer_atomic_add_u32">; -defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x043, "buffer_atomic_add_u64">; -defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomic_gfx11_Renamed<0x03C, "buffer_atomic_and_b32">; -defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x049, "buffer_atomic_and_b64">; -defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_gfx11_Renamed<0x034, "buffer_atomic_cmpswap_b32">; -defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x042, "buffer_atomic_cmpswap_b64">; +defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x018, "buffer_store_b8">; +defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x019, "buffer_store_b16">; +defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x01A, "buffer_store_b32">; +defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x01B, "buffer_store_b64">; +defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x01C, "buffer_store_b96">; +defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x01D, "buffer_store_b128">; +defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00C, "buffer_store_d16_format_x">; +defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00D, "buffer_store_d16_format_xy">; +defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00E, "buffer_store_d16_format_xyz">; +defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00F, "buffer_store_d16_format_xyzw">; +defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x024, "buffer_store_d16_hi_b8">; +defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x025, "buffer_store_d16_hi_b16">; +defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x027, "buffer_store_d16_hi_format_x">; +defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_gfx11_gfx12<0x004>; +defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_gfx11_gfx12<0x005>; +defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx11_gfx12<0x006>; +defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx11_gfx12<0x007>; +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_gfx11_gfx12<0x056>; +defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x035, "buffer_atomic_add_u32">; +defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x043, "buffer_atomic_add_u64">; +defm BUFFER_ATOMIC_AND : 
MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03C, "buffer_atomic_and_b32">; +defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x049, "buffer_atomic_and_b64">; +defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x034, "buffer_atomic_cmpswap_b32">; +defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x042, "buffer_atomic_cmpswap_b64">; defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_gfx11_Renamed<0x050, "buffer_atomic_cmpswap_f32">; -defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomic_gfx11_Renamed_impl<0x037, 1, "buffer_atomic_csub_u32">; -def : Pre_gfx11_MUBUF_Name<"buffer_atomic_csub", "buffer_atomic_csub_u32">; -defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_gfx11_Renamed<0x040, "buffer_atomic_dec_u32">; -defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x04D, "buffer_atomic_dec_u64">; -defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_gfx11_Renamed<0x03F, "buffer_atomic_inc_u32">; -defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x04C, "buffer_atomic_inc_u64">; -defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomic_gfx11_Renamed<0x052, "buffer_atomic_max_f32">; -defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomic_gfx11_Renamed<0x03A, "buffer_atomic_max_i32">; -defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x047, "buffer_atomic_max_i64">; -defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomic_gfx11_Renamed<0x03B, "buffer_atomic_max_u32">; -defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x048, "buffer_atomic_max_u64">; -defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomic_gfx11_Renamed<0x051, "buffer_atomic_min_f32">; -defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomic_gfx11_Renamed<0x038, "buffer_atomic_min_i32">; -defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x045, "buffer_atomic_min_i64">; -defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomic_gfx11_Renamed<0x039, "buffer_atomic_min_u32">; -defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x046, "buffer_atomic_min_u64">; -defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomic_gfx11_Renamed<0x03D, "buffer_atomic_or_b32">; -defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x04A, "buffer_atomic_or_b64">; -defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomic_gfx11_Renamed<0x036, "buffer_atomic_sub_u32">; -defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x044, "buffer_atomic_sub_u64">; -defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_gfx11_Renamed<0x033, "buffer_atomic_swap_b32">; -defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x041, "buffer_atomic_swap_b64">; -defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_gfx11_Renamed<0x03E, "buffer_atomic_xor_b32">; -defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x04B, "buffer_atomic_xor_b64">; +defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomic_gfx11_gfx12_Renamed_gfx12_Renamed<0x037, "buffer_atomic_sub_clamp_u32", "buffer_atomic_csub_u32">; +def : Mnem_gfx11_gfx12<"buffer_atomic_csub", "buffer_atomic_csub_u32">; +defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x040, "buffer_atomic_dec_u32">; +defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x04D, "buffer_atomic_dec_u64">; +defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03F, "buffer_atomic_inc_u32">; +defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x04C, "buffer_atomic_inc_u64">; +defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomic_gfx11_gfx12_Renamed_gfx12_Renamed<0x052, "buffer_atomic_max_num_f32", "buffer_atomic_max_f32">; +defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03A, "buffer_atomic_max_i32">; 
+defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x047, "buffer_atomic_max_i64">; +defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03B, "buffer_atomic_max_u32">; +defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x048, "buffer_atomic_max_u64">; +defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomic_gfx11_gfx12_Renamed_gfx12_Renamed<0x051, "buffer_atomic_min_num_f32", "buffer_atomic_min_f32">; +defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x038, "buffer_atomic_min_i32">; +defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x045, "buffer_atomic_min_i64">; +defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x039, "buffer_atomic_min_u32">; +defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x046, "buffer_atomic_min_u64">; +defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03D, "buffer_atomic_or_b32">; +defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x04A, "buffer_atomic_or_b64">; +defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x036, "buffer_atomic_sub_u32">; +defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x044, "buffer_atomic_sub_u64">; +defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x033, "buffer_atomic_swap_b32">; +defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x041, "buffer_atomic_swap_b64">; +defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03E, "buffer_atomic_xor_b32">; +defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x04B, "buffer_atomic_xor_b64">; //===----------------------------------------------------------------------===// // MUBUF - GFX10. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { +let DecoderNamespace = "GFX10" in { multiclass MUBUF_Real_AllAddr_Helper_gfx10<bits<8> op> { def _BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; @@ -2291,7 +2696,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, AtomicNoRet<NAME # "_OFFSET_gfx10", 0>; } -} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" +} // End DecoderNamespace = "GFX10" defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>; defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx10<0x01b>; @@ -2477,7 +2882,7 @@ defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>; defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>; defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>; -defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_RTN_gfx10<0x034>; +defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_gfx10<0x034>; defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>; defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>; @@ -2524,47 +2929,59 @@ class Base_MTBUF_Real_gfx6_gfx7_gfx10<bits<3> op, MTBUF_Pseudo ps, int ef> : // MTBUF - GFX11. 
//===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in -multiclass MTBUF_Real_AllAddr_gfx11_Renamed_Impl<bits<4> op, string real_name> { - def _BOTHEN_gfx11 : - Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN"), real_name>; - def _IDXEN_gfx11 : - Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN"), real_name>; - def _OFFEN_gfx11 : - Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN"), real_name>; - def _OFFSET_gfx11 : - Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET"), real_name>; +multiclass MTBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl<bits<4> op, string real_name> { + let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { + def _BOTHEN_gfx11 : + Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN"), real_name>; + def _IDXEN_gfx11 : + Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN"), real_name>; + def _OFFEN_gfx11 : + Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN"), real_name>; + def _OFFSET_gfx11 : + Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET"), real_name>; + } + + let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in { + def _BOTHEN_gfx12 : + VBUFFER_MTBUF_Real_gfx12<op, !cast<MTBUF_Pseudo>(NAME#"_VBUFFER_BOTHEN"), real_name>; + def _IDXEN_gfx12 : + VBUFFER_MTBUF_Real_gfx12<op, !cast<MTBUF_Pseudo>(NAME#"_VBUFFER_IDXEN"), real_name>; + def _OFFEN_gfx12 : + VBUFFER_MTBUF_Real_gfx12<op, !cast<MTBUF_Pseudo>(NAME#"_VBUFFER_OFFEN"), real_name>; + def _OFFSET_gfx12 : + VBUFFER_MTBUF_Real_gfx12<op, !cast<MTBUF_Pseudo>(NAME#"_VBUFFER_OFFSET"), real_name>; + } } -multiclass MTBUF_Real_AllAddr_gfx11_Impl<bits<4> op, MTBUF_Pseudo ps> - : MTBUF_Real_AllAddr_gfx11_Renamed_Impl<op, ps.Mnemonic>; -multiclass MTBUF_Real_AllAddr_gfx11<bits<4> op> - : MTBUF_Real_AllAddr_gfx11_Impl<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; +multiclass MTBUF_Real_AllAddr_gfx11_gfx12_Impl<bits<4> op, MTBUF_Pseudo ps> + : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl<op, ps.Mnemonic>; +multiclass MTBUF_Real_AllAddr_gfx11_gfx12<bits<4> op> + : MTBUF_Real_AllAddr_gfx11_gfx12_Impl<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; class Pre_gfx11_MTBUF_Name <MTBUF_Pseudo ps, string real_name> : MnemonicAlias<ps.Mnemonic, real_name>, Requires<[isGFX11Plus]>; -multiclass MTBUF_Real_AllAddr_gfx11_Renamed<bits<4> op, string real_name> - : MTBUF_Real_AllAddr_gfx11_Renamed_Impl<op, real_name> { +multiclass MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<bits<4> op, string real_name> + : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl<op, real_name> { def : Pre_gfx11_MTBUF_Name<!cast<MTBUF_Pseudo>(NAME#"_BOTHEN"), real_name>; } -defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx11_Renamed<0x008, "tbuffer_load_d16_format_x">; -defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx11_Renamed<0x009, "tbuffer_load_d16_format_xy">; -defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx11_Renamed<0x00a, "tbuffer_load_d16_format_xyz">; -defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx11_Renamed<0x00b, "tbuffer_load_d16_format_xyzw">; -defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_gfx11<0x000>; -defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_gfx11<0x001>; -defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx11<0x002>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx11<0x003>; -defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx11_Renamed<0x00c, "tbuffer_store_d16_format_x">; -defm TBUFFER_STORE_FORMAT_D16_XY : 
MTBUF_Real_AllAddr_gfx11_Renamed<0x00d, "tbuffer_store_d16_format_xy">; -defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx11_Renamed<0x00e, "tbuffer_store_d16_format_xyz">; -defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx11_Renamed<0x00f, "tbuffer_store_d16_format_xyzw">; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_gfx11<0x004>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_gfx11<0x005>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx11<0x006>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx11<0x007>; +defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x008, "tbuffer_load_d16_format_x">; +defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x009, "tbuffer_load_d16_format_xy">; +defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00a, "tbuffer_load_d16_format_xyz">; +defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00b, "tbuffer_load_d16_format_xyzw">; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_gfx11_gfx12<0x000>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_gfx11_gfx12<0x001>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx11_gfx12<0x002>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx11_gfx12<0x003>; +defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00c, "tbuffer_store_d16_format_x">; +defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00d, "tbuffer_store_d16_format_xy">; +defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00e, "tbuffer_store_d16_format_xyz">; +defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00f, "tbuffer_store_d16_format_xyzw">; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_gfx11_gfx12<0x004>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_gfx11_gfx12<0x005>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx11_gfx12<0x006>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx11_gfx12<0x007>; //===----------------------------------------------------------------------===// // MTBUF - GFX10. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td index 85a3f763cd5a..3a895923fa4b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -12,6 +12,7 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt let LGKM_CNT = 1; let DS = 1; + let GWS = 0; let Size = 8; let UseNamedOperandTable = 1; @@ -61,6 +62,7 @@ class DS_Real <DS_Pseudo ps, string opName = ps.Mnemonic> : let UseNamedOperandTable = 1; // copy relevant pseudo op flags + let GWS = ps.GWS; let SubtargetPredicate = ps.SubtargetPredicate; let OtherPredicates = ps.OtherPredicates; let SchedRW = ps.SchedRW; @@ -376,6 +378,7 @@ multiclass DS_1A_mc <string opName> { class DS_GWS <string opName, dag ins, string asmOps> : DS_Pseudo<opName, (outs), ins, asmOps> { + let GWS = 1; let has_vdst = 0; let has_addr = 0; @@ -708,18 +711,34 @@ def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; //===----------------------------------------------------------------------===// -// Instruction definitions for GFX11 and newer. +// Instruction definitions for GFX11. 
//===----------------------------------------------------------------------===// -let SubtargetPredicate = isGFX11Plus in { +let SubtargetPredicate = isGFX11Only in { def DS_ADD_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_add_gs_reg_rtn", VReg_64, VGPR_32>; def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>; + +} // let SubtargetPredicate = isGFX11Only + +let SubtargetPredicate = isGFX11Plus in { + def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">; } // let SubtargetPredicate = isGFX11Plus //===----------------------------------------------------------------------===// +// Instruction definitions for GFX12 and newer. +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isGFX12Plus in { + +defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">; +defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_clamp_rtn_u32", VGPR_32, "ds_sub_clamp_u32">; + +} // let SubtargetPredicate = isGFX12Plus + +//===----------------------------------------------------------------------===// // DS Patterns //===----------------------------------------------------------------------===// @@ -803,23 +822,6 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> { } } -// Irritatingly, atomic_store reverses the order of operands from a -// normal store. -class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 0)) ->; - -multiclass DSAtomicWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> { - let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>; - } - - let OtherPredicates = [NotLDSRequiresM0Init] in { - def : DSAtomicWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>; - } -} - defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">; defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">; defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">; @@ -829,12 +831,12 @@ foreach vt = Reg32Types.types in { defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">; } -defm : DSAtomicWritePat_mc <DS_WRITE_B8, i16, "atomic_store_8_local">; -defm : DSAtomicWritePat_mc <DS_WRITE_B8, i32, "atomic_store_8_local">; -defm : DSAtomicWritePat_mc <DS_WRITE_B16, i16, "atomic_store_16_local">; -defm : DSAtomicWritePat_mc <DS_WRITE_B16, i32, "atomic_store_16_local">; -defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_32_local">; -defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_64_local">; +defm : DSWritePat_mc <DS_WRITE_B8, i16, "atomic_store_8_local">; +defm : DSWritePat_mc <DS_WRITE_B8, i32, "atomic_store_8_local">; +defm : DSWritePat_mc <DS_WRITE_B16, i16, "atomic_store_16_local">; +defm : DSWritePat_mc <DS_WRITE_B16, i32, "atomic_store_16_local">; +defm : DSWritePat_mc <DS_WRITE_B32, i32, "atomic_store_32_local">; +defm : DSWritePat_mc <DS_WRITE_B64, i64, "atomic_store_64_local">; let OtherPredicates = [HasD16LoadStore] in { def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_hi16_local>; @@ -969,8 +971,10 @@ multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { !cast<PatFrag>(frag#"_local_"#vt.Size)>; } - def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), - /* complexity */ 0, /* gds */ 1>; + let OtherPredicates = [HasGDS] in { + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + 
/* complexity */ 0, /* gds */ 1>; + } } multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst, @@ -989,12 +993,14 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst, !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>; } - def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_region_m0_"#vt.Size), - /* complexity */ 0, /* gds */ 1>; - def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), - /* complexity */ 1, /* gds */ 1>; + let OtherPredicates = [HasGDS] in { + def : DSAtomicRetPat<inst, vt, + !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; + def : DSAtomicRetPat<noRetInst, vt, + !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + /* complexity */ 1, /* gds */ 1>; + } } @@ -1024,10 +1030,12 @@ multiclass DSAtomicCmpXChgSwapped_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueT /* complexity */ 1>; } - def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), - /* complexity */ 0, /* gds */ 1>; - def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), - /* complexity */ 1, /* gds */ 1>; + let OtherPredicates = [HasGDS] in { + def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; + def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + /* complexity */ 1, /* gds */ 1>; + } } } // End SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 @@ -1047,10 +1055,12 @@ multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>; - def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), - /* complexity */ 0, /* gds */ 1>; - def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), - /* complexity */ 1, /* gds */ 1>; + let OtherPredicates = [HasGDS] in { + def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; + def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + /* complexity */ 1, /* gds */ 1>; + } } } // End SubtargetPredicate = isGFX11Plus @@ -1175,11 +1185,12 @@ def : GCNPat < //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Base ENC_DS for GFX6, GFX7, GFX10, GFX11. +// Base ENC_DS for GFX6, GFX7, GFX10, GFX11, GFX12. //===----------------------------------------------------------------------===// -class Base_DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op, DS_Pseudo ps, int ef, string opName = ps.Mnemonic> : - DS_Real<ps, opName>, SIMCInstr <ps.Mnemonic, ef> { +class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef, + string opName = ps.Mnemonic> + : DS_Real<ps, opName>, SIMCInstr <ps.Mnemonic, ef> { let Inst{7-0} = !if(ps.has_offset0, offset0, 0); let Inst{15-8} = !if(ps.has_offset1, offset1, 0); @@ -1193,74 +1204,117 @@ class Base_DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op, DS_Pseudo ps, int ef, strin } //===----------------------------------------------------------------------===// +// GFX12. 
+//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in { + multiclass DS_Real_gfx12<bits<8> op> { + defvar ps = !cast<DS_Pseudo>(NAME); + def _gfx12 : + Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, ps, SIEncodingFamily.GFX12, + ps.Mnemonic>; + } + + multiclass DS_Real_Renamed_gfx12<bits<8> op, DS_Pseudo backing_pseudo, + string real_name> { + def _gfx12 : + Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, backing_pseudo, + SIEncodingFamily.GFX12, + real_name>, + MnemonicAlias<backing_pseudo.Mnemonic, real_name>, + Requires<[isGFX12Plus]>; + } +} // End AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" + +defm DS_MIN_NUM_F32 : DS_Real_Renamed_gfx12<0x012, DS_MIN_F32, "ds_min_num_f32">; +defm DS_MAX_NUM_F32 : DS_Real_Renamed_gfx12<0x013, DS_MAX_F32, "ds_max_num_f32">; +defm DS_MIN_NUM_RTN_F32 : DS_Real_Renamed_gfx12<0x032, DS_MIN_RTN_F32, "ds_min_num_rtn_f32">; +defm DS_MAX_NUM_RTN_F32 : DS_Real_Renamed_gfx12<0x033, DS_MAX_RTN_F32, "ds_max_num_rtn_f32">; +defm DS_MIN_NUM_F64 : DS_Real_Renamed_gfx12<0x052, DS_MIN_F64, "ds_min_num_f64">; +defm DS_MAX_NUM_F64 : DS_Real_Renamed_gfx12<0x053, DS_MAX_F64, "ds_max_num_f64">; +defm DS_MIN_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x072, DS_MIN_RTN_F64, "ds_min_num_rtn_f64">; +defm DS_MAX_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x073, DS_MAX_RTN_F64, "ds_max_num_rtn_f64">; +defm DS_SUB_CLAMP_U32 : DS_Real_gfx12<0x099>; +defm DS_SUB_CLAMP_RTN_U32 : DS_Real_gfx12<0x0a9>; + +//===----------------------------------------------------------------------===// // GFX11. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" in { +let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { multiclass DS_Real_gfx11<bits<8> op> { - def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME), + def _gfx11 : + Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, !cast<DS_Pseudo>(NAME), SIEncodingFamily.GFX11>; } multiclass DS_Real_Renamed_gfx11<bits<8> op, DS_Pseudo backing_pseudo, string real_name> { - def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, backing_pseudo, SIEncodingFamily.GFX11, real_name>, - MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>; + def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, backing_pseudo, SIEncodingFamily.GFX11, real_name>, + MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Only]>; } -} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" - -defm DS_STORE_B32 : DS_Real_Renamed_gfx11<0x00d, DS_WRITE_B32, "ds_store_b32">; -defm DS_STORE_2ADDR_B32 : DS_Real_Renamed_gfx11<0x00e, DS_WRITE2_B32, "ds_store_2addr_b32">; -defm DS_STORE_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11<0x00f, DS_WRITE2ST64_B32, "ds_store_2addr_stride64_b32">; -defm DS_STORE_B8 : DS_Real_Renamed_gfx11<0x01e, DS_WRITE_B8, "ds_store_b8">; -defm DS_STORE_B16 : DS_Real_Renamed_gfx11<0x01f, DS_WRITE_B16, "ds_store_b16">; -defm DS_STOREXCHG_RTN_B32 : DS_Real_Renamed_gfx11<0x02d, DS_WRXCHG_RTN_B32, "ds_storexchg_rtn_b32">; -defm DS_STOREXCHG_2ADDR_RTN_B32 : DS_Real_Renamed_gfx11<0x02e, DS_WRXCHG2_RTN_B32, "ds_storexchg_2addr_rtn_b32">; -defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32 : DS_Real_Renamed_gfx11<0x02f, DS_WRXCHG2ST64_RTN_B32, "ds_storexchg_2addr_stride64_rtn_b32">; -defm DS_LOAD_B32 : DS_Real_Renamed_gfx11<0x036, DS_READ_B32, "ds_load_b32">; -defm DS_LOAD_2ADDR_B32 : DS_Real_Renamed_gfx11<0x037, 
DS_READ2_B32, "ds_load_2addr_b32">; -defm DS_LOAD_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11<0x038, DS_READ2ST64_B32, "ds_load_2addr_stride64_b32">; -defm DS_LOAD_I8 : DS_Real_Renamed_gfx11<0x039, DS_READ_I8, "ds_load_i8">; -defm DS_LOAD_U8 : DS_Real_Renamed_gfx11<0x03a, DS_READ_U8, "ds_load_u8">; -defm DS_LOAD_I16 : DS_Real_Renamed_gfx11<0x03b, DS_READ_I16, "ds_load_i16">; -defm DS_LOAD_U16 : DS_Real_Renamed_gfx11<0x03c, DS_READ_U16, "ds_load_u16">; -defm DS_STORE_B64 : DS_Real_Renamed_gfx11<0x04d, DS_WRITE_B64, "ds_store_b64">; -defm DS_STORE_2ADDR_B64 : DS_Real_Renamed_gfx11<0x04e, DS_WRITE2_B64, "ds_store_2addr_b64">; -defm DS_STORE_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11<0x04f, DS_WRITE2ST64_B64, "ds_store_2addr_stride64_b64">; -defm DS_STOREXCHG_RTN_B64 : DS_Real_Renamed_gfx11<0x06d, DS_WRXCHG_RTN_B64, "ds_storexchg_rtn_b64">; -defm DS_STOREXCHG_2ADDR_RTN_B64 : DS_Real_Renamed_gfx11<0x06e, DS_WRXCHG2_RTN_B64, "ds_storexchg_2addr_rtn_b64">; -defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64 : DS_Real_Renamed_gfx11<0x06f, DS_WRXCHG2ST64_RTN_B64, "ds_storexchg_2addr_stride64_rtn_b64">; -defm DS_LOAD_B64 : DS_Real_Renamed_gfx11<0x076, DS_READ_B64, "ds_load_b64">; -defm DS_LOAD_2ADDR_B64 : DS_Real_Renamed_gfx11<0x077, DS_READ2_B64, "ds_load_2addr_b64">; -defm DS_LOAD_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11<0x078, DS_READ2ST64_B64, "ds_load_2addr_stride64_b64">; -defm DS_STORE_B8_D16_HI : DS_Real_Renamed_gfx11<0x0a0, DS_WRITE_B8_D16_HI, "ds_store_b8_d16_hi">; -defm DS_STORE_B16_D16_HI : DS_Real_Renamed_gfx11<0x0a1, DS_WRITE_B16_D16_HI, "ds_store_b16_d16_hi">; -defm DS_LOAD_U8_D16 : DS_Real_Renamed_gfx11<0x0a2, DS_READ_U8_D16, "ds_load_u8_d16">; -defm DS_LOAD_U8_D16_HI : DS_Real_Renamed_gfx11<0x0a3, DS_READ_U8_D16_HI, "ds_load_u8_d16_hi">; -defm DS_LOAD_I8_D16 : DS_Real_Renamed_gfx11<0x0a4, DS_READ_I8_D16, "ds_load_i8_d16">; -defm DS_LOAD_I8_D16_HI : DS_Real_Renamed_gfx11<0x0a5, DS_READ_I8_D16_HI, "ds_load_i8_d16_hi">; -defm DS_LOAD_U16_D16 : DS_Real_Renamed_gfx11<0x0a6, DS_READ_U16_D16, "ds_load_u16_d16">; -defm DS_LOAD_U16_D16_HI : DS_Real_Renamed_gfx11<0x0a7, DS_READ_U16_D16_HI, "ds_load_u16_d16_hi">; -defm DS_STORE_ADDTID_B32 : DS_Real_Renamed_gfx11<0x0b0, DS_WRITE_ADDTID_B32, "ds_store_addtid_b32">; -defm DS_LOAD_ADDTID_B32 : DS_Real_Renamed_gfx11<0x0b1, DS_READ_ADDTID_B32, "ds_load_addtid_b32">; -defm DS_STORE_B96 : DS_Real_Renamed_gfx11<0x0de, DS_WRITE_B96, "ds_store_b96">; -defm DS_STORE_B128 : DS_Real_Renamed_gfx11<0x0df, DS_WRITE_B128, "ds_store_b128">; -defm DS_LOAD_B96 : DS_Real_Renamed_gfx11<0x0fe, DS_READ_B96, "ds_load_b96">; -defm DS_LOAD_B128 : DS_Real_Renamed_gfx11<0x0ff, DS_READ_B128, "ds_load_b128">; +} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" + +multiclass DS_Real_gfx11_gfx12<bits<8> op> + : DS_Real_gfx11<op>, DS_Real_gfx12<op>; + +multiclass DS_Real_Renamed_gfx11_gfx12<bits<8> op, DS_Pseudo backing_pseudo, + string real_name> + : DS_Real_Renamed_gfx11<op, backing_pseudo, real_name>, + DS_Real_Renamed_gfx12<op, backing_pseudo, real_name>; + +defm DS_STORE_B32 : DS_Real_Renamed_gfx11_gfx12<0x00d, DS_WRITE_B32, "ds_store_b32">; +defm DS_STORE_2ADDR_B32 : DS_Real_Renamed_gfx11_gfx12<0x00e, DS_WRITE2_B32, "ds_store_2addr_b32">; +defm DS_STORE_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11_gfx12<0x00f, DS_WRITE2ST64_B32, "ds_store_2addr_stride64_b32">; +defm DS_STORE_B8 : DS_Real_Renamed_gfx11_gfx12<0x01e, DS_WRITE_B8, "ds_store_b8">; +defm DS_STORE_B16 : DS_Real_Renamed_gfx11_gfx12<0x01f, DS_WRITE_B16, "ds_store_b16">; +defm DS_STOREXCHG_RTN_B32 : 
DS_Real_Renamed_gfx11_gfx12<0x02d, DS_WRXCHG_RTN_B32, "ds_storexchg_rtn_b32">; +defm DS_STOREXCHG_2ADDR_RTN_B32 : DS_Real_Renamed_gfx11_gfx12<0x02e, DS_WRXCHG2_RTN_B32, "ds_storexchg_2addr_rtn_b32">; +defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32 : DS_Real_Renamed_gfx11_gfx12<0x02f, DS_WRXCHG2ST64_RTN_B32, "ds_storexchg_2addr_stride64_rtn_b32">; +defm DS_LOAD_B32 : DS_Real_Renamed_gfx11_gfx12<0x036, DS_READ_B32, "ds_load_b32">; +defm DS_LOAD_2ADDR_B32 : DS_Real_Renamed_gfx11_gfx12<0x037, DS_READ2_B32, "ds_load_2addr_b32">; +defm DS_LOAD_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11_gfx12<0x038, DS_READ2ST64_B32, "ds_load_2addr_stride64_b32">; +defm DS_LOAD_I8 : DS_Real_Renamed_gfx11_gfx12<0x039, DS_READ_I8, "ds_load_i8">; +defm DS_LOAD_U8 : DS_Real_Renamed_gfx11_gfx12<0x03a, DS_READ_U8, "ds_load_u8">; +defm DS_LOAD_I16 : DS_Real_Renamed_gfx11_gfx12<0x03b, DS_READ_I16, "ds_load_i16">; +defm DS_LOAD_U16 : DS_Real_Renamed_gfx11_gfx12<0x03c, DS_READ_U16, "ds_load_u16">; +defm DS_STORE_B64 : DS_Real_Renamed_gfx11_gfx12<0x04d, DS_WRITE_B64, "ds_store_b64">; +defm DS_STORE_2ADDR_B64 : DS_Real_Renamed_gfx11_gfx12<0x04e, DS_WRITE2_B64, "ds_store_2addr_b64">; +defm DS_STORE_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11_gfx12<0x04f, DS_WRITE2ST64_B64, "ds_store_2addr_stride64_b64">; +defm DS_STOREXCHG_RTN_B64 : DS_Real_Renamed_gfx11_gfx12<0x06d, DS_WRXCHG_RTN_B64, "ds_storexchg_rtn_b64">; +defm DS_STOREXCHG_2ADDR_RTN_B64 : DS_Real_Renamed_gfx11_gfx12<0x06e, DS_WRXCHG2_RTN_B64, "ds_storexchg_2addr_rtn_b64">; +defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64 : DS_Real_Renamed_gfx11_gfx12<0x06f, DS_WRXCHG2ST64_RTN_B64, "ds_storexchg_2addr_stride64_rtn_b64">; +defm DS_LOAD_B64 : DS_Real_Renamed_gfx11_gfx12<0x076, DS_READ_B64, "ds_load_b64">; +defm DS_LOAD_2ADDR_B64 : DS_Real_Renamed_gfx11_gfx12<0x077, DS_READ2_B64, "ds_load_2addr_b64">; +defm DS_LOAD_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11_gfx12<0x078, DS_READ2ST64_B64, "ds_load_2addr_stride64_b64">; +defm DS_STORE_B8_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a0, DS_WRITE_B8_D16_HI, "ds_store_b8_d16_hi">; +defm DS_STORE_B16_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a1, DS_WRITE_B16_D16_HI, "ds_store_b16_d16_hi">; +defm DS_LOAD_U8_D16 : DS_Real_Renamed_gfx11_gfx12<0x0a2, DS_READ_U8_D16, "ds_load_u8_d16">; +defm DS_LOAD_U8_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a3, DS_READ_U8_D16_HI, "ds_load_u8_d16_hi">; +defm DS_LOAD_I8_D16 : DS_Real_Renamed_gfx11_gfx12<0x0a4, DS_READ_I8_D16, "ds_load_i8_d16">; +defm DS_LOAD_I8_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a5, DS_READ_I8_D16_HI, "ds_load_i8_d16_hi">; +defm DS_LOAD_U16_D16 : DS_Real_Renamed_gfx11_gfx12<0x0a6, DS_READ_U16_D16, "ds_load_u16_d16">; +defm DS_LOAD_U16_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a7, DS_READ_U16_D16_HI, "ds_load_u16_d16_hi">; +defm DS_STORE_ADDTID_B32 : DS_Real_Renamed_gfx11_gfx12<0x0b0, DS_WRITE_ADDTID_B32, "ds_store_addtid_b32">; +defm DS_LOAD_ADDTID_B32 : DS_Real_Renamed_gfx11_gfx12<0x0b1, DS_READ_ADDTID_B32, "ds_load_addtid_b32">; +defm DS_STORE_B96 : DS_Real_Renamed_gfx11_gfx12<0x0de, DS_WRITE_B96, "ds_store_b96">; +defm DS_STORE_B128 : DS_Real_Renamed_gfx11_gfx12<0x0df, DS_WRITE_B128, "ds_store_b128">; +defm DS_LOAD_B96 : DS_Real_Renamed_gfx11_gfx12<0x0fe, DS_READ_B96, "ds_load_b96">; +defm DS_LOAD_B128 : DS_Real_Renamed_gfx11_gfx12<0x0ff, DS_READ_B128, "ds_load_b128">; // DS_CMPST_* are renamed to DS_CMPSTORE_* in GFX11, but also the data operands (src and cmp) are swapped // comparing to pre-GFX11. 
// Note: the mnemonic alias is not generated to avoid a potential ambiguity due to the semantics change. -defm DS_CMPSTORE_B32 : DS_Real_gfx11<0x010>; +defm DS_CMPSTORE_B32 : DS_Real_gfx11_gfx12<0x010>; defm DS_CMPSTORE_F32 : DS_Real_gfx11<0x011>; -defm DS_CMPSTORE_RTN_B32 : DS_Real_gfx11<0x030>; +defm DS_CMPSTORE_RTN_B32 : DS_Real_gfx11_gfx12<0x030>; defm DS_CMPSTORE_RTN_F32 : DS_Real_gfx11<0x031>; -defm DS_CMPSTORE_B64 : DS_Real_gfx11<0x050>; +defm DS_CMPSTORE_B64 : DS_Real_gfx11_gfx12<0x050>; defm DS_CMPSTORE_F64 : DS_Real_gfx11<0x051>; -defm DS_CMPSTORE_RTN_B64 : DS_Real_gfx11<0x070>; +defm DS_CMPSTORE_RTN_B64 : DS_Real_gfx11_gfx12<0x070>; defm DS_CMPSTORE_RTN_F64 : DS_Real_gfx11<0x071>; -defm DS_ADD_RTN_F32 : DS_Real_gfx11<0x079>; +defm DS_ADD_RTN_F32 : DS_Real_gfx11_gfx12<0x079>; defm DS_ADD_GS_REG_RTN : DS_Real_gfx11<0x07a>; defm DS_SUB_GS_REG_RTN : DS_Real_gfx11<0x07b>; defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx11<0x0ad>; @@ -1271,8 +1325,8 @@ defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx11<0x0ad>; let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass DS_Real_gfx10<bits<8> op> { - def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME), - SIEncodingFamily.GFX10>; + def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, + !cast<DS_Pseudo>(NAME), SIEncodingFamily.GFX10>; } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" @@ -1289,28 +1343,34 @@ defm DS_WRITE_ADDTID_B32 : DS_Real_gfx10<0x0b0>; defm DS_READ_ADDTID_B32 : DS_Real_gfx10<0x0b1>; //===----------------------------------------------------------------------===// -// GFX10, GFX11. +// GFX10, GFX11, GFX12. //===----------------------------------------------------------------------===// +multiclass DS_Real_gfx10_gfx11_gfx12<bits<8> op> : + DS_Real_gfx10<op>, DS_Real_gfx11<op>, DS_Real_gfx12<op>; + multiclass DS_Real_gfx10_gfx11<bits<8> op> : DS_Real_gfx10<op>, DS_Real_gfx11<op>; -defm DS_ADD_F32 : DS_Real_gfx10_gfx11<0x015>; +defm DS_ADD_F32 : DS_Real_gfx10_gfx11_gfx12<0x015>; defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>; -defm DS_PERMUTE_B32 : DS_Real_gfx10_gfx11<0x0b2>; -defm DS_BPERMUTE_B32 : DS_Real_gfx10_gfx11<0x0b3>; +defm DS_PERMUTE_B32 : DS_Real_gfx10_gfx11_gfx12<0x0b2>; +defm DS_BPERMUTE_B32 : DS_Real_gfx10_gfx11_gfx12<0x0b3>; //===----------------------------------------------------------------------===// -// GFX7, GFX10, GFX11. +// GFX7, GFX10, GFX11, GFX12. //===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { multiclass DS_Real_gfx7<bits<8> op> { - def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME), - SIEncodingFamily.SI>; + def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, + !cast<DS_Pseudo>(NAME), SIEncodingFamily.SI>; } } // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" +multiclass DS_Real_gfx7_gfx10_gfx11_gfx12<bits<8> op> : + DS_Real_gfx7<op>, DS_Real_gfx10_gfx11_gfx12<op>; + multiclass DS_Real_gfx7_gfx10_gfx11<bits<8> op> : DS_Real_gfx7<op>, DS_Real_gfx10_gfx11<op>; @@ -1320,7 +1380,7 @@ multiclass DS_Real_gfx7_gfx10<bits<8> op> : // FIXME-GFX7: Add tests when upstreaming this part. 
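To make the multiclass layering in this hunk easier to follow, here is a short illustrative C++ sketch (not taken from the commit) of the per-generation real-instruction suffixes that the composed DS_Real_* multiclasses defined above end up producing for a single opcode; the mapping is read directly off the multiclass inheritance shown in the diff.

#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // Suffix sets implied by the compositions above, e.g.
  // DS_Real_gfx7_gfx10_gfx11_gfx12 = DS_Real_gfx7 + DS_Real_gfx10_gfx11_gfx12.
  const std::map<std::string, std::vector<std::string>> RealSuffixes = {
      {"DS_Real_gfx10_gfx11_gfx12", {"_gfx10", "_gfx11", "_gfx12"}},
      {"DS_Real_gfx7_gfx10_gfx11_gfx12", {"_gfx7", "_gfx10", "_gfx11", "_gfx12"}},
  };
  for (const auto &[Multiclass, Suffixes] : RealSuffixes) {
    std::cout << Multiclass << " ->";
    for (const std::string &S : Suffixes)
      std::cout << ' ' << S;
    std::cout << '\n';
  }
}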
defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10_gfx11<0x018>; defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10_gfx11<0x034>; -defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10_gfx11<0x07e>; +defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10_gfx11_gfx12<0x07e>; defm DS_WRITE_B96 : DS_Real_gfx7_gfx10<0x0de>; defm DS_WRITE_B128 : DS_Real_gfx7_gfx10<0x0df>; defm DS_READ_B96 : DS_Real_gfx7_gfx10<0x0fe>; @@ -1332,30 +1392,33 @@ defm DS_READ_B128 : DS_Real_gfx7_gfx10<0x0ff>; let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { multiclass DS_Real_gfx6_gfx7<bits<8> op> { - def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME), - SIEncodingFamily.SI>; + def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, + !cast<DS_Pseudo>(NAME), SIEncodingFamily.SI>; } } // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" +multiclass DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op> : + DS_Real_gfx6_gfx7<op>, DS_Real_gfx10_gfx11_gfx12<op>; + multiclass DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op> : DS_Real_gfx6_gfx7<op>, DS_Real_gfx10_gfx11<op>; multiclass DS_Real_gfx6_gfx7_gfx10<bits<8> op> : DS_Real_gfx6_gfx7<op>, DS_Real_gfx10<op>; -defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x000>; -defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x001>; -defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x002>; -defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x003>; -defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x004>; -defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x005>; -defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x006>; -defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x007>; -defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x008>; -defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x009>; -defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00a>; -defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00b>; -defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00c>; +defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x000>; +defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x001>; +defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x002>; +defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x003>; +defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x004>; +defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x005>; +defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x006>; +defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x007>; +defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x008>; +defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x009>; +defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x00a>; +defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x00b>; +defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x00c>; defm DS_WRITE_B32 : DS_Real_gfx6_gfx7_gfx10<0x00d>; defm DS_WRITE2_B32 : DS_Real_gfx6_gfx7_gfx10<0x00e>; @@ -1365,7 +1428,7 @@ defm DS_CMPST_F32 : DS_Real_gfx6_gfx7_gfx10<0x011>; defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x012>; defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x013>; -defm DS_NOP : DS_Real_gfx6_gfx7_gfx10_gfx11<0x014>; +defm DS_NOP : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x014>; defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x019>; defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01a>; defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01b>; @@ -1375,19 +1438,19 @@ defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01d>; defm DS_WRITE_B8 : DS_Real_gfx6_gfx7_gfx10<0x01e>; defm DS_WRITE_B16 : DS_Real_gfx6_gfx7_gfx10<0x01f>; -defm DS_ADD_RTN_U32 : 
DS_Real_gfx6_gfx7_gfx10_gfx11<0x020>; -defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x021>; -defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x022>; -defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x023>; -defm DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x024>; -defm DS_MIN_RTN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x025>; -defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x026>; -defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x027>; -defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x028>; -defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x029>; -defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02a>; -defm DS_XOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02b>; -defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02c>; +defm DS_ADD_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x020>; +defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x021>; +defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x022>; +defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x023>; +defm DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x024>; +defm DS_MIN_RTN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x025>; +defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x026>; +defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x027>; +defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x028>; +defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x029>; +defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02a>; +defm DS_XOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02b>; +defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02c>; defm DS_WRXCHG_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02d>; defm DS_WRXCHG2_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02e>; @@ -1397,7 +1460,7 @@ defm DS_CMPST_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x031>; defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x032>; defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x033>; -defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x035>; +defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x035>; defm DS_READ_B32 : DS_Real_gfx6_gfx7_gfx10<0x036>; defm DS_READ2_B32 : DS_Real_gfx6_gfx7_gfx10<0x037>; @@ -1407,22 +1470,22 @@ defm DS_READ_U8 : DS_Real_gfx6_gfx7_gfx10<0x03a>; defm DS_READ_I16 : DS_Real_gfx6_gfx7_gfx10<0x03b>; defm DS_READ_U16 : DS_Real_gfx6_gfx7_gfx10<0x03c>; -defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03d>; -defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03e>; +defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03d>; +defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03e>; defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03f>; -defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x040>; -defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x041>; -defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x042>; -defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x043>; -defm DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x044>; -defm DS_MIN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x045>; -defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x046>; -defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x047>; -defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x048>; -defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x049>; -defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04a>; -defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04b>; -defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04c>; +defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x040>; +defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x041>; +defm DS_RSUB_U64 : 
DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x042>; +defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x043>; +defm DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x044>; +defm DS_MIN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x045>; +defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x046>; +defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x047>; +defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x048>; +defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x049>; +defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x04a>; +defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x04b>; +defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x04c>; defm DS_WRITE_B64 : DS_Real_gfx6_gfx7_gfx10<0x04d>; defm DS_WRITE2_B64 : DS_Real_gfx6_gfx7_gfx10<0x04e>; @@ -1432,19 +1495,19 @@ defm DS_CMPST_F64 : DS_Real_gfx6_gfx7_gfx10<0x051>; defm DS_MIN_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x052>; defm DS_MAX_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x053>; -defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x060>; -defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x061>; -defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x062>; -defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x063>; -defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x064>; -defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x065>; -defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x066>; -defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x067>; -defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x068>; -defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x069>; -defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06a>; -defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06b>; -defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06c>; +defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x060>; +defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x061>; +defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x062>; +defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x063>; +defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x064>; +defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x065>; +defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x066>; +defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x067>; +defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x068>; +defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x069>; +defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x06a>; +defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x06b>; +defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x06c>; defm DS_WRXCHG_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06d>; defm DS_WRXCHG2_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06e>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 1b05acd5c90a..ed2e7e4f189e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -91,9 +91,11 @@ static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, uint64_t Addr, const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); int64_t Offset; - if (DAsm->isVI()) { // VI supports 20-bit unsigned offsets. + if (DAsm->isGFX12Plus()) { // GFX12 supports 24-bit signed offsets. 
+ Offset = SignExtend64<24>(Imm); + } else if (DAsm->isVI()) { // VI supports 20-bit unsigned offsets. Offset = Imm & 0xFFFFF; - } else { // GFX9+ supports 21-bit signed offsets. + } else { // GFX9+ supports 21-bit signed offsets. Offset = SignExtend64<21>(Imm); } return addOperand(Inst, MCOperand::createImm(Offset)); @@ -105,6 +107,13 @@ static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr, return addOperand(Inst, DAsm->decodeBoolReg(Val)); } +static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val, + uint64_t Addr, + const MCDisassembler *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); + return addOperand(Inst, DAsm->decodeSplitBarrier(Val)); +} + #define DECODE_OPERAND(StaticDecoderName, DecoderName) \ static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm, \ uint64_t /*Addr*/, \ @@ -200,10 +209,12 @@ DECODE_OPERAND_REG_8(VReg_512) DECODE_OPERAND_REG_8(VReg_1024) DECODE_OPERAND_REG_7(SReg_32, OPW32) +DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32) DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32) DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32) DECODE_OPERAND_REG_7(SReg_64, OPW64) DECODE_OPERAND_REG_7(SReg_64_XEXEC, OPW64) +DECODE_OPERAND_REG_7(SReg_96, OPW96) DECODE_OPERAND_REG_7(SReg_128, OPW128) DECODE_OPERAND_REG_7(SReg_256, OPW256) DECODE_OPERAND_REG_7(SReg_512, OPW512) @@ -238,6 +249,7 @@ DECODE_SRC_OPERAND_REG_AV10(AV_128, OPW128) DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_64, OPW64, 64) DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_32, OPW32, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_32, OPW32, 16) DECODE_OPERAND_SRC_REG_OR_IMM_9(SRegOrLds_32, OPW32, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32_Lo128, OPW16, 16) DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 16) @@ -259,6 +271,62 @@ DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_1024, OPW1024, 32) DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32_Lo128, OPW16, 16) DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW16, 16) DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW32, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(SReg_32, OPW32, 32) + +static DecodeStatus DecodeVGPR_16RegisterClass(MCInst &Inst, unsigned Imm, + uint64_t /*Addr*/, + const MCDisassembler *Decoder) { + assert(isUInt<10>(Imm) && "10-bit encoding expected"); + assert((Imm & (1 << 8)) == 0 && "Imm{8} should not be used"); + + bool IsHi = Imm & (1 << 9); + unsigned RegIdx = Imm & 0xff; + auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); + return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi)); +} + +static DecodeStatus +DecodeVGPR_16_Lo128RegisterClass(MCInst &Inst, unsigned Imm, uint64_t /*Addr*/, + const MCDisassembler *Decoder) { + assert(isUInt<8>(Imm) && "8-bit encoding expected"); + + bool IsHi = Imm & (1 << 7); + unsigned RegIdx = Imm & 0x7f; + auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); + return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi)); +} + +static DecodeStatus decodeOperand_VSrcT16_Lo128(MCInst &Inst, unsigned Imm, + uint64_t /*Addr*/, + const MCDisassembler *Decoder) { + assert(isUInt<9>(Imm) && "9-bit encoding expected"); + + const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); + bool IsVGPR = Imm & (1 << 8); + if (IsVGPR) { + bool IsHi = Imm & (1 << 7); + unsigned RegIdx = Imm & 0x7f; + return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi)); + } + return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(AMDGPUDisassembler::OPW16, + Imm & 0xFF, false, 16)); +} + +static DecodeStatus decodeOperand_VSrcT16(MCInst &Inst, unsigned Imm, + uint64_t 
/*Addr*/, + const MCDisassembler *Decoder) { + assert(isUInt<10>(Imm) && "10-bit encoding expected"); + + const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); + bool IsVGPR = Imm & (1 << 8); + if (IsVGPR) { + bool IsHi = Imm & (1 << 9); + unsigned RegIdx = Imm & 0xff; + return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi)); + } + return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(AMDGPUDisassembler::OPW16, + Imm & 0xFF, false, 16)); +} static DecodeStatus decodeOperand_KImmFP(MCInst &Inst, unsigned Imm, uint64_t Addr, @@ -321,6 +389,15 @@ static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm, return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256)); } +static DecodeStatus decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm, + uint64_t Addr, + const MCDisassembler *Decoder) { + assert(Imm < (1 << 9) && "9-bit encoding"); + auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); + return addOperand( + Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm, false, 64, true)); +} + static DecodeStatus DecodeAVLdSt_32RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, const MCDisassembler *Decoder) { @@ -371,18 +448,19 @@ DECODE_SDWA(VopcDst) template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) { assert(Bytes.size() >= sizeof(T)); - const auto Res = support::endian::read<T, support::endianness::little>(Bytes.data()); + const auto Res = + support::endian::read<T, llvm::endianness::little>(Bytes.data()); Bytes = Bytes.slice(sizeof(T)); return Res; } static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) { assert(Bytes.size() >= 12); - uint64_t Lo = support::endian::read<uint64_t, support::endianness::little>( - Bytes.data()); + uint64_t Lo = + support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data()); Bytes = Bytes.slice(8); - uint64_t Hi = support::endian::read<uint32_t, support::endianness::little>( - Bytes.data()); + uint64_t Hi = + support::endian::read<uint32_t, llvm::endianness::little>(Bytes.data()); Bytes = Bytes.slice(4); return DecoderUInt128(Lo, Hi); } @@ -418,25 +496,48 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // encodings if (isGFX11Plus() && Bytes.size() >= 12 ) { DecoderUInt128 DecW = eat12Bytes(Bytes); - Res = tryDecodeInst(DecoderTableDPP8GFX1196, MI, DecW, Address, CS); + Res = + tryDecodeInst(DecoderTableDPP8GFX1196, DecoderTableDPP8GFX11_FAKE1696, + MI, DecW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; MI = MCInst(); // clear - Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW, Address, CS); - if (Res) { - if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) + Res = + tryDecodeInst(DecoderTableDPP8GFX1296, DecoderTableDPP8GFX12_FAKE1696, + MI, DecW, Address, CS); + if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + break; + MI = MCInst(); // clear + + const auto convertVOPDPP = [&]() { + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) { convertVOP3PDPPInst(MI); - else if (AMDGPU::isVOPC64DPP(MI.getOpcode())) + } else if (AMDGPU::isVOPC64DPP(MI.getOpcode())) { convertVOPCDPPInst(MI); // Special VOP3 case - else { + } else { assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3); convertVOP3DPPInst(MI); // Regular VOP3 case } + }; + Res = tryDecodeInst(DecoderTableDPPGFX1196, DecoderTableDPPGFX11_FAKE1696, + MI, DecW, Address, CS); + if (Res) { + convertVOPDPP(); + break; + } + Res = tryDecodeInst(DecoderTableDPPGFX1296, DecoderTableDPPGFX12_FAKE1696, + MI, DecW, 
Address, CS); + if (Res) { + convertVOPDPP(); break; } Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS); if (Res) break; + + Res = tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS); + if (Res) + break; } // Reinitialize Bytes Bytes = Bytes_.slice(0, MaxInstBytesNum); @@ -461,7 +562,14 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, break; MI = MCInst(); // clear - Res = tryDecodeInst(DecoderTableDPP8GFX1164, MI, QW, Address, CS); + Res = tryDecodeInst(DecoderTableDPP8GFX1164, + DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS); + if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + break; + MI = MCInst(); // clear + + Res = tryDecodeInst(DecoderTableDPP8GFX1264, + DecoderTableDPP8GFX12_FAKE1664, MI, QW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; MI = MCInst(); // clear @@ -469,7 +577,16 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS); if (Res) break; - Res = tryDecodeInst(DecoderTableDPPGFX1164, MI, QW, Address, CS); + Res = tryDecodeInst(DecoderTableDPPGFX1164, DecoderTableDPPGFX11_FAKE1664, + MI, QW, Address, CS); + if (Res) { + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) + convertVOPCDPPInst(MI); + break; + } + + Res = tryDecodeInst(DecoderTableDPPGFX1264, DecoderTableDPPGFX12_FAKE1664, + MI, QW, Address, CS); if (Res) { if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) convertVOPCDPPInst(MI); @@ -530,9 +647,15 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS); if (Res) break; - Res = tryDecodeInst(DecoderTableGFX1132, MI, DW, Address, CS); + Res = tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW, + Address, CS); if (Res) break; + Res = tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW, + Address, CS); + if (Res) + break; + if (Bytes.size() < 4) break; const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW; @@ -560,7 +683,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS); if (Res) break; - Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address, CS); + Res = tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW, + Address, CS); + if (Res) + break; + + Res = tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW, + Address, CS); if (Res) break; @@ -640,6 +769,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = convertMIMGInst(MI); } + if (Res && (MCII->get(MI.getOpcode()).TSFlags & + (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE))) + Res = convertMIMGInst(MI); + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP)) Res = convertEXPInst(MI); @@ -679,7 +812,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const { - if (STI.hasFeature(AMDGPU::FeatureGFX11)) { + if (STI.hasFeature(AMDGPU::FeatureGFX11Insts)) { // The MCInst still has these fields even though they are no longer encoded // in the GFX11 instruction. 
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm); @@ -690,9 +823,13 @@ DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const { DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const { if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx12 || MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx12 || MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx11 || - MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11) { + MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx12 || + MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx12) { // The MCInst has this field that is not directly encoded in the // instruction. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel); @@ -840,6 +977,7 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { // VADDR size. Consequently, decoded instructions always show address as if it // has 1 dword, which could be not really so. DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { + auto TSFlags = MCII->get(MI.getOpcode()).TSFlags; int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst); @@ -848,8 +986,9 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { AMDGPU::OpName::vdata); int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); - int RsrcIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); + int RsrcOpName = TSFlags & SIInstrFlags::MIMG ? AMDGPU::OpName::srsrc + : AMDGPU::OpName::rsrc; + int RsrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), RsrcOpName); int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dmask); @@ -870,7 +1009,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { } bool IsAtomic = (VDstIdx != -1); - bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4; + bool IsGather4 = TSFlags & SIInstrFlags::Gather4; + bool IsVSample = TSFlags & SIInstrFlags::VSAMPLE; bool IsNSA = false; bool IsPartialNSA = false; unsigned AddrSize = Info->VAddrDwords; @@ -887,10 +1027,13 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { AddrSize = AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, AMDGPU::hasG16(STI)); + // VSAMPLE insts that do not use vaddr3 behave the same as NSA forms. + // VIMAGE insts other than BVH never use vaddr4. IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA || - Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA; + Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA || + Info->MIMGEncoding == AMDGPU::MIMGEncGfx12; if (!IsNSA) { - if (AddrSize > 12) + if (!IsVSample && AddrSize > 12) AddrSize = 16; } else { if (AddrSize > Info->VAddrDwords) { @@ -1098,6 +1241,8 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID, case AMDGPU::TTMP_64RegClassID: shift = 1; break; + case AMDGPU::SGPR_96RegClassID: + case AMDGPU::TTMP_96RegClassID: case AMDGPU::SGPR_128RegClassID: case AMDGPU::TTMP_128RegClassID: // ToDo: unclear if s[100:104] is available on VI. 
Can we use VCC as SGPR in @@ -1132,6 +1277,13 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID, return createRegOperand(SRegClassID, Val >> shift); } +MCOperand AMDGPUDisassembler::createVGPR16Operand(unsigned RegIdx, + bool IsHi) const { + unsigned RCID = + IsHi ? AMDGPU::VGPR_HI16RegClassID : AMDGPU::VGPR_LO16RegClassID; + return createRegOperand(RCID, RegIdx); +} + // Decode Literals for insts which always have a literal in the encoding MCOperand AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const { @@ -1147,7 +1299,7 @@ AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const { return MCOperand::createImm(Literal); } -MCOperand AMDGPUDisassembler::decodeLiteralConstant() const { +MCOperand AMDGPUDisassembler::decodeLiteralConstant(bool ExtendFP64) const { // For now all literal constants are supposed to be unsigned integer // ToDo: deal with signed/unsigned 64-bit integer constants // ToDo: deal with float/double constants @@ -1157,9 +1309,11 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant() const { Twine(Bytes.size())); } HasLiteral = true; - Literal = eatBytes<uint32_t>(Bytes); + Literal = Literal64 = eatBytes<uint32_t>(Bytes); + if (ExtendFP64) + Literal64 <<= 32; } - return MCOperand::createImm(Literal); + return MCOperand::createImm(ExtendFP64 ? Literal64 : Literal); } MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) { @@ -1376,7 +1530,7 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const { MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val, bool MandatoryLiteral, - unsigned ImmWidth) const { + unsigned ImmWidth, bool IsFP) const { using namespace AMDGPU::EncValues; assert(Val < 1024); // enum10 @@ -1388,6 +1542,20 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val, return createRegOperand(IsAGPR ? getAgprClassId(Width) : getVgprClassId(Width), Val - VGPR_MIN); } + return decodeNonVGPRSrcOp(Width, Val & 0xFF, MandatoryLiteral, ImmWidth, + IsFP); +} + +MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width, + unsigned Val, + bool MandatoryLiteral, + unsigned ImmWidth, + bool IsFP) const { + // Cases when Val{8} is 1 (vgpr, agpr or true 16 vgpr) should have been + // decoded earlier. + assert(Val < (1 << 8) && "9-bit Src encoding when Val{8} is 0"); + using namespace AMDGPU::EncValues; + if (Val <= SGPR_MAX) { // "SGPR_MIN <= Val" is always true and causes compilation warning. 
static_assert(SGPR_MIN == 0); @@ -1410,7 +1578,7 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val, // Keep a sentinel value for deferred setting return MCOperand::createImm(LITERAL_CONST); else - return decodeLiteralConstant(); + return decodeLiteralConstant(IsFP && ImmWidth == 64); } switch (Width) { @@ -1590,6 +1758,10 @@ MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const { : decodeSrcOp(OPW32, Val); } +MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const { + return decodeSrcOp(OPW32, Val); +} + bool AMDGPUDisassembler::isVI() const { return STI.hasFeature(AMDGPU::FeatureVolcanicIslands); } @@ -1616,11 +1788,18 @@ bool AMDGPUDisassembler::isGFX11Plus() const { return AMDGPU::isGFX11Plus(STI); } +bool AMDGPUDisassembler::isGFX12Plus() const { + return AMDGPU::isGFX12Plus(STI); +} bool AMDGPUDisassembler::hasArchitectedFlatScratch() const { return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch); } +bool AMDGPUDisassembler::hasKernargPreload() const { + return AMDGPU::hasKernargPreload(STI); +} + //===----------------------------------------------------------------------===// // AMDGPU specific symbol handling //===----------------------------------------------------------------------===// @@ -1704,12 +1883,16 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1( if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIV) return MCDisassembler::Fail; - PRINT_DIRECTIVE(".amdhsa_dx10_clamp", COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP); + if (!isGFX12Plus()) + PRINT_DIRECTIVE(".amdhsa_dx10_clamp", + COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP); if (FourByteBuffer & COMPUTE_PGM_RSRC1_DEBUG_MODE) return MCDisassembler::Fail; - PRINT_DIRECTIVE(".amdhsa_ieee_mode", COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE); + if (!isGFX12Plus()) + PRINT_DIRECTIVE(".amdhsa_ieee_mode", + COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE); if (FourByteBuffer & COMPUTE_PGM_RSRC1_BULKY) return MCDisassembler::Fail; @@ -1717,17 +1900,29 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1( if (FourByteBuffer & COMPUTE_PGM_RSRC1_CDBG_USER) return MCDisassembler::Fail; - PRINT_DIRECTIVE(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_FP16_OVFL); + if (isGFX9Plus()) + PRINT_DIRECTIVE(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL); - if (FourByteBuffer & COMPUTE_PGM_RSRC1_RESERVED0) + if (!isGFX9Plus()) + if (FourByteBuffer & COMPUTE_PGM_RSRC1_GFX6_GFX8_RESERVED0) + return MCDisassembler::Fail; + if (FourByteBuffer & COMPUTE_PGM_RSRC1_RESERVED1) return MCDisassembler::Fail; + if (!isGFX10Plus()) + if (FourByteBuffer & COMPUTE_PGM_RSRC1_GFX6_GFX9_RESERVED2) + return MCDisassembler::Fail; if (isGFX10Plus()) { PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode", - COMPUTE_PGM_RSRC1_WGP_MODE); - PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_MEM_ORDERED); - PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_FWD_PROGRESS); + COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE); + PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED); + PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS); } + + if (isGFX12Plus()) + PRINT_DIRECTIVE(".amdhsa_round_robin_scheduling", + COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN); + return MCDisassembler::Success; } @@ -1807,16 +2002,29 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3( PRINT_PSEUDO_DIRECTIVE_COMMENT( "SHARED_VGPR_COUNT", COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT); } - 
PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE", - COMPUTE_PGM_RSRC3_GFX10_PLUS_INST_PREF_SIZE); - PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START", - COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_START); - PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END", - COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_END); - if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED0) + + if (isGFX11Plus()) { + PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE", + COMPUTE_PGM_RSRC3_GFX11_PLUS_INST_PREF_SIZE); + PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START", + COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_START); + PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END", + COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_END); + } else { + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED0) + return MCDisassembler::Fail; + } + + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED1) return MCDisassembler::Fail; - PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP", - COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_START); + + if (isGFX11Plus()) { + PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP", + COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_START); + } else { + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED2) + return MCDisassembler::Fail; + } } else if (FourByteBuffer) { return MCDisassembler::Fail; } @@ -1945,10 +2153,24 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective( return MCDisassembler::Success; - case amdhsa::RESERVED2_OFFSET: - // 6 bytes from here are reserved, must be 0. - ReservedBytes = DE.getBytes(Cursor, 6); - for (int I = 0; I < 6; ++I) { + case amdhsa::KERNARG_PRELOAD_OFFSET: + using namespace amdhsa; + TwoByteBuffer = DE.getU16(Cursor); + if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_LENGTH) { + PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_length", + KERNARG_PRELOAD_SPEC_LENGTH); + } + + if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_OFFSET) { + PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_offset", + KERNARG_PRELOAD_SPEC_OFFSET); + } + return MCDisassembler::Success; + + case amdhsa::RESERVED3_OFFSET: + // 4 bytes from here are reserved, must be 0. + ReservedBytes = DE.getBytes(Cursor, 4); + for (int I = 0; I < 4; ++I) { if (ReservedBytes[I] != 0) return MCDisassembler::Fail; } @@ -1975,7 +2197,7 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptor( if (isGFX10Plus()) { uint16_t KernelCodeProperties = support::endian::read16(&Bytes[amdhsa::KERNEL_CODE_PROPERTIES_OFFSET], - support::endianness::little); + llvm::endianness::little); EnableWavefrontSize32 = AMDHSA_BITS_GET(KernelCodeProperties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); @@ -2018,7 +2240,7 @@ AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, // Code Object V3 kernel descriptors. StringRef Name = Symbol.Name; - if (Symbol.Type == ELF::STT_OBJECT && Name.endswith(StringRef(".kd"))) { + if (Symbol.Type == ELF::STT_OBJECT && Name.ends_with(StringRef(".kd"))) { Size = 64; // Size = 64 regardless of success or failure. 
return decodeKernelDescriptor(Name.drop_back(3), Bytes, Address); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 444312473a5f..233581949d71 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -97,6 +97,7 @@ private: const unsigned TargetMaxInstBytes; mutable ArrayRef<uint8_t> Bytes; mutable uint32_t Literal; + mutable uint64_t Literal64; mutable bool HasLiteral; mutable std::optional<bool> EnableWavefrontSize32; @@ -114,6 +115,7 @@ public: MCOperand createRegOperand(unsigned int RegId) const; MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const; MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const; + MCOperand createVGPR16Operand(unsigned RegIdx, bool IsHi) const; MCOperand errOperand(unsigned V, const Twine& ErrMsg) const; @@ -144,6 +146,17 @@ public: return MCDisassembler::Fail; } + template <typename InsnType> + DecodeStatus tryDecodeInst(const uint8_t *Table1, const uint8_t *Table2, + MCInst &MI, InsnType Inst, uint64_t Address, + raw_ostream &Comments) const { + for (const uint8_t *T : {Table1, Table2}) { + if (DecodeStatus Res = tryDecodeInst(T, MI, Inst, Address, Comments)) + return Res; + } + return MCDisassembler::Fail; + } + std::optional<DecodeStatus> onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, raw_ostream &CStream) const override; @@ -217,11 +230,15 @@ public: static MCOperand decodeFPImmed(unsigned ImmWidth, unsigned Imm); MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const; - MCOperand decodeLiteralConstant() const; + MCOperand decodeLiteralConstant(bool ExtendFP64) const; MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val, - bool MandatoryLiteral = false, - unsigned ImmWidth = 0) const; + bool MandatoryLiteral = false, unsigned ImmWidth = 0, + bool IsFP = false) const; + + MCOperand decodeNonVGPRSrcOp(const OpWidthTy Width, unsigned Val, + bool MandatoryLiteral = false, + unsigned ImmWidth = 0, bool IsFP = false) const; MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const; MCOperand decodeSpecialReg32(unsigned Val) const; @@ -234,6 +251,7 @@ public: MCOperand decodeSDWAVopcDst(unsigned Val) const; MCOperand decodeBoolReg(unsigned Val) const; + MCOperand decodeSplitBarrier(unsigned Val) const; int getTTmpIdx(unsigned Val) const; @@ -247,8 +265,10 @@ public: bool isGFX10Plus() const; bool isGFX11() const; bool isGFX11Plus() const; + bool isGFX12Plus() const; bool hasArchitectedFlatScratch() const; + bool hasKernargPreload() const; bool isMacDPP(MCInst &MI) const; }; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td index 14ba01f0d67c..ff1d661ef6fe 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td @@ -41,8 +41,8 @@ class EXP_Real_ComprVM<bit done, string pseudo, int subtarget> } // Real instruction with optional asm operand "row_en". 
-class EXP_Real_Row<bit row, bit done, string pseudo, int subtarget> - : EXPCommon<row, done, "exp$tgt $src0, $src1, $src2, $src3" +class EXP_Real_Row<bit row, bit done, string pseudo, int subtarget, string name = "exp"> + : EXPCommon<row, done, name#"$tgt $src0, $src1, $src2, $src3" #!if(done, " done", "")#!if(row, " row_en", "")>, SIMCInstr<pseudo, subtarget> { let AsmMatchConverter = "cvtExp"; @@ -105,12 +105,12 @@ def EXP_gfx10 : EXP_Real_gfx10<0, "EXP">; def EXP_DONE_gfx10 : EXP_Real_gfx10<1, "EXP_DONE">; //===----------------------------------------------------------------------===// -// GFX11+ +// GFX11 //===----------------------------------------------------------------------===// class EXP_Real_gfx11<bit _row, bit _done, string pseudo> : EXP_Real_Row<_row, _done, pseudo, SIEncodingFamily.GFX11>, EXPe_Row { - let AssemblerPredicate = isGFX11Plus; + let AssemblerPredicate = isGFX11Only; let DecoderNamespace = "GFX11"; let row = _row; let done = _done; @@ -122,6 +122,24 @@ def EXP_ROW_gfx11 : EXP_Real_gfx11<1, 0, "EXP_ROW">; def EXP_ROW_DONE_gfx11 : EXP_Real_gfx11<1, 1, "EXP_ROW_DONE">; //===----------------------------------------------------------------------===// +// GFX12+ +//===----------------------------------------------------------------------===// + +class VEXPORT_Real_gfx12<bit _row, bit _done, string pseudo> + : EXP_Real_Row<_row, _done, pseudo, SIEncodingFamily.GFX12, "export">, + EXPe_Row, MnemonicAlias<"exp", "export">, Requires<[isGFX12Plus]> { + let AssemblerPredicate = isGFX12Plus; + let DecoderNamespace = "GFX12"; + let row = _row; + let done = _done; +} + +def EXPORT_gfx12 : VEXPORT_Real_gfx12<0, 0, "EXP">; +def EXPORT_DONE_gfx12 : VEXPORT_Real_gfx12<0, 1, "EXP_DONE">; +def EXPORT_ROW_gfx12 : VEXPORT_Real_gfx12<1, 0, "EXP_ROW">; +def EXPORT_ROW_DONE_gfx12 : VEXPORT_Real_gfx12<1, 1, "EXP_ROW_DONE">; + +//===----------------------------------------------------------------------===// // EXP Patterns //===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td index 5c86d80e7dd2..0dd2b3f5c2c9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -144,6 +144,47 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, ?); } +class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : + InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands, []>, + Enc96 { + + let FLAT = 1; + + // copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let OtherPredicates = ps.OtherPredicates; + let TSFlags = ps.TSFlags; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let SchedRW = ps.SchedRW; + let mayLoad = ps.mayLoad; + let mayStore = ps.mayStore; + let IsAtomicRet = ps.IsAtomicRet; + let IsAtomicNoRet = ps.IsAtomicNoRet; + let VM_CNT = ps.VM_CNT; + let LGKM_CNT = ps.LGKM_CNT; + let VALU = ps.VALU; + + bits<7> saddr; + bits<8> vdst; + bits<6> cpol; + bits<8> vdata; // vsrc + bits<8> vaddr; + bits<24> offset; + + let Inst{6-0} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7f), 0); + let Inst{21-14} = op; + let Inst{31-26} = 0x3b; + let Inst{39-32} = !if(ps.has_vdst, vdst, ?); + let Inst{49} = ps.sve; + let Inst{54-53} = cpol{2-1}; // th{2-1} + let 
Inst{52} = !if(ps.IsAtomicRet, 1, cpol{0}); // th{0} + let Inst{51-50} = cpol{4-3}; // scope + let Inst{62-55} = !if(ps.has_data, vdata{7-0}, ?); + let Inst{71-64} = !if(ps.has_vaddr, vaddr, ?); + let Inst{95-72} = offset; +} + class GlobalSaddrTable <bit is_saddr, string Name = ""> { bit IsSaddr = is_saddr; string SaddrOp = Name; @@ -758,6 +799,10 @@ let SubtargetPredicate = HasFlatAtomicFaddF32Inst in { defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>; } // End SubtargetPredicate = HasFlatAtomicFaddF32Inst +let SubtargetPredicate = isGFX12Plus in { + defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPR_32, i32>; +} // End SubtargetPredicate = isGFX12Plus + defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>; defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; @@ -870,9 +915,10 @@ defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2", defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2", VReg_64, i64>; -let SubtargetPredicate = HasGFX10_BEncoding in -defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo_RTN <"global_atomic_csub", - VGPR_32, i32>; +let SubtargetPredicate = HasGFX10_BEncoding in { + defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo <"global_atomic_csub", + VGPR_32, i32>; +} defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte">; defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sbyte">; @@ -996,12 +1042,6 @@ class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset) >; -class GlobalAtomicStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, - ValueType vt> : GCNPat < - (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$data), - (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset) ->; - class GlobalAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), data_vt:$data)), @@ -1024,13 +1064,6 @@ class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt (inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset) >; -class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - // atomic store follows atomic binop convention so the address comes - // first. 
- (node (FlatOffset i64:$vaddr, i32:$offset), vt:$data), - (inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset) ->; - class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < // atomic store follows atomic binop convention so the address comes @@ -1039,19 +1072,43 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, (inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset) >; -multiclass FlatAtomicPat <string inst, string node, ValueType vt, - ValueType data_vt = vt> { - defvar rtnNode = !cast<PatFrags>(node#"_"#vt.Size); - defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size); - - def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; +multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt, + ValueType data_vt = vt, bit isIntr = 0> { + defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_"#vt.Size)); let AddedComplexity = 1 in def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; } +multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt, + ValueType data_vt = vt, bit isIntr = 0> { + defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_"#vt.Size)); + + def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), + (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; +} + +multiclass FlatAtomicPat <string inst, string node, ValueType vt, + ValueType data_vt = vt, bit isIntr = 0> : + FlatAtomicRtnPat<inst, node, vt, data_vt, isIntr>, + FlatAtomicNoRtnPat<inst, node, vt, data_vt, isIntr>; + +multiclass FlatAtomicIntrNoRtnPat <string inst, string node, ValueType vt, + ValueType data_vt = vt> { + defm : FlatAtomicNoRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>; +} + +multiclass FlatAtomicIntrRtnPat <string inst, string node, ValueType vt, + ValueType data_vt = vt> { + defm : FlatAtomicRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>; +} + +multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt, + ValueType data_vt = vt> : + FlatAtomicRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>, + FlatAtomicNoRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>; + class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < (vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)), @@ -1174,12 +1231,12 @@ def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, vt>; def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>; } -def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>; -def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>; -def : FlatStoreAtomicPat <FLAT_STORE_BYTE, atomic_store_8_flat, i32>; -def : FlatStoreAtomicPat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>; -def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>; -def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; +def : FlatStorePat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>; +def : FlatStorePat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>; +def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i32>; +def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>; +def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>; +def 
: FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; foreach as = [ "flat", "global" ] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>; @@ -1269,24 +1326,13 @@ multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node, } } -// Deal with swapped operands for atomic_store vs. regular store -multiclass GlobalFLATAtomicStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { - def : FlatStoreSignedAtomicPat <inst, node, vt> { - let AddedComplexity = 10; - } - - def : GlobalAtomicStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { - let AddedComplexity = 11; - } -} - multiclass GlobalFLATAtomicPatsNoRtnBase<string inst, string node, ValueType vt, ValueType data_vt = vt> { let AddedComplexity = 11 in - def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), !cast<PatFrags>(node), vt, data_vt>; + def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), !cast<SDPatternOperator>(node), vt, data_vt>; let AddedComplexity = 13 in - def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<PatFrags>(node), vt, data_vt>; + def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node), vt, data_vt>; } multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt, @@ -1444,12 +1490,12 @@ defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2i16> defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>; } -defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>; -defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>; -defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>; -defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>; -defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>; -defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>; +defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>; +defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>; +defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>; +defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>; +defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>; +defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>; @@ -1466,6 +1512,9 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_glo defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>; defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; +let OtherPredicates = [HasAtomicCSubNoRtnInsts] in +defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; + defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", "atomic_load_uinc_wrap_global", i64>; @@ -1483,10 +1532,14 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i let OtherPredicates = [isGFX10Plus] in { defm : GlobalFLATAtomicPats 
<"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>; defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>; defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>; +} + +let OtherPredicates = [isGFX10GFX11] in { +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>; + defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>; defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>; } @@ -1502,6 +1555,13 @@ defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", "int_amdgcn_flat_atomic_f defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", "int_amdgcn_flat_atomic_fmax", f64>; } +let OtherPredicates = [isGFX12Only] in { + defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>; + defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>; + defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>; + defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>; +} + let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>; @@ -1998,7 +2058,7 @@ multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op> { multiclass FLAT_Real_ST_gfx10<bits<7> op> { def _ST_gfx10 : FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_ST")> { - let Inst{54-48} = !cast<int>(EXEC_HI.HWEncoding); + let Inst{54-48} = EXEC_HI.Index; let OtherPredicates = [HasFlatScratchSTMode]; } } @@ -2126,7 +2186,7 @@ defm GLOBAL_ATOMIC_SWAP : FLAT_Real_GlblAtomics_gfx10<0x030>; defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Real_GlblAtomics_gfx10<0x031>; defm GLOBAL_ATOMIC_ADD : FLAT_Real_GlblAtomics_gfx10<0x032>; defm GLOBAL_ATOMIC_SUB : FLAT_Real_GlblAtomics_gfx10<0x033>; -defm GLOBAL_ATOMIC_CSUB : FLAT_Real_GlblAtomics_RTN_gfx10<0x034>; +defm GLOBAL_ATOMIC_CSUB : FLAT_Real_GlblAtomics_gfx10<0x034>; defm GLOBAL_ATOMIC_SMIN : FLAT_Real_GlblAtomics_gfx10<0x035>; defm GLOBAL_ATOMIC_UMIN : FLAT_Real_GlblAtomics_gfx10<0x036>; defm GLOBAL_ATOMIC_SMAX : FLAT_Real_GlblAtomics_gfx10<0x037>; @@ -2201,7 +2261,7 @@ defm SCRATCH_LOAD_LDS_DWORD : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00c>; class FLAT_Real_gfx11 <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : FLAT_Real <op, ps, opName>, SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX11> { - let AssemblerPredicate = isGFX11Plus; + let AssemblerPredicate = isGFX11Only; let DecoderNamespace = "GFX11"; let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlcValue); @@ -2213,19 +2273,19 @@ class FLAT_Real_gfx11 <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> multiclass FLAT_Aliases_gfx11<string ps, string opName, int renamed> { if renamed then - def _renamed_gfx11 : MnemonicAlias<!cast<FLAT_Pseudo>(ps).Mnemonic, opName>, Requires<[isGFX11Plus]>; + def _renamed_gfx11 : MnemonicAlias<!cast<FLAT_Pseudo>(ps).Mnemonic, opName>, 
Requires<[isGFX11Only]>; } multiclass FLAT_Real_Base_gfx11<bits<7> op, string ps, string opName, int renamed = false> : FLAT_Aliases_gfx11<ps, opName, renamed> { def _gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps), opName> { - let Inst{54-48} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + let Inst{54-48} = SGPR_NULL_gfx11plus.Index; } } multiclass FLAT_Real_RTN_gfx11<bits<7> op, string ps, string opName> { def _RTN_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_RTN"), opName> { - let Inst{54-48} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + let Inst{54-48} = SGPR_NULL_gfx11plus.Index; } } @@ -2239,7 +2299,7 @@ multiclass FLAT_Real_SADDR_RTN_gfx11<bits<7> op, string ps, string opName> { multiclass FLAT_Real_ST_gfx11<bits<7> op, string ps, string opName> { def _ST_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_ST"), opName> { - let Inst{54-48} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + let Inst{54-48} = SGPR_NULL_gfx11plus.Index; let OtherPredicates = [HasFlatScratchSTMode]; } } @@ -2357,7 +2417,7 @@ defm GLOBAL_ATOMIC_SWAP_B32 : FLAT_Real_GlblAtomics_gfx11<0x033, "GLOBAL_ATO defm GLOBAL_ATOMIC_CMPSWAP_B32 : FLAT_Real_GlblAtomics_gfx11<0x034, "GLOBAL_ATOMIC_CMPSWAP", "global_atomic_cmpswap_b32", true>; defm GLOBAL_ATOMIC_ADD_U32 : FLAT_Real_GlblAtomics_gfx11<0x035, "GLOBAL_ATOMIC_ADD", "global_atomic_add_u32", true>; defm GLOBAL_ATOMIC_SUB_U32 : FLAT_Real_GlblAtomics_gfx11<0x036, "GLOBAL_ATOMIC_SUB", "global_atomic_sub_u32", true>; -defm GLOBAL_ATOMIC_CSUB_U32 : FLAT_Real_GlblAtomics_RTN_gfx11<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_csub_u32", true>; +defm GLOBAL_ATOMIC_CSUB_U32 : FLAT_Real_GlblAtomics_gfx11<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_csub_u32", true>; defm GLOBAL_ATOMIC_MIN_I32 : FLAT_Real_GlblAtomics_gfx11<0x038, "GLOBAL_ATOMIC_SMIN", "global_atomic_min_i32", true>; defm GLOBAL_ATOMIC_MIN_U32 : FLAT_Real_GlblAtomics_gfx11<0x039, "GLOBAL_ATOMIC_UMIN", "global_atomic_min_u32", true>; defm GLOBAL_ATOMIC_MAX_I32 : FLAT_Real_GlblAtomics_gfx11<0x03a, "GLOBAL_ATOMIC_SMAX", "global_atomic_max_i32", true>; @@ -2408,3 +2468,213 @@ defm SCRATCH_LOAD_D16_HI_I8 : FLAT_Real_ScratchAllAddr_gfx11<0x22, "SCRATCH_ defm SCRATCH_LOAD_D16_HI_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x23, "SCRATCH_LOAD_SHORT_D16_HI", "scratch_load_d16_hi_b16">; defm SCRATCH_STORE_D16_HI_B8 : FLAT_Real_ScratchAllAddr_gfx11<0x24, "SCRATCH_STORE_BYTE_D16_HI", "scratch_store_d16_hi_b8">; defm SCRATCH_STORE_D16_HI_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x25, "SCRATCH_STORE_SHORT_D16_HI", "scratch_store_d16_hi_b16">; + +//===----------------------------------------------------------------------===// +// GFX12 +//===----------------------------------------------------------------------===// + +class VFLAT_Real_gfx12 <bits<8> op, FLAT_Pseudo ps, + string opName = ps.Mnemonic> : + VFLAT_Real <op, ps, opName>, + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX12> { + let AssemblerPredicate = isGFX12Plus; + let DecoderNamespace = "GFX12"; + + let Inst{25-24} = !if(ps.is_flat_scratch, 0b01, + !if(ps.is_flat_global, 0b10, 0b00)); +} + +multiclass VFLAT_Aliases_gfx12<string ps, string opName, int renamed, string alias> { + if renamed then + def _renamed_gfx12 : MnemonicAlias<!cast<FLAT_Pseudo>(ps).Mnemonic, opName>, Requires<[isGFX12Plus]>; + if !not(!empty(alias)) then + def _alias_gfx12 : MnemonicAlias<alias, opName>, Requires<[isGFX12Plus]>; +} + +multiclass VFLAT_Real_Base_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> : + VFLAT_Aliases_gfx12<ps, opName, renamed, 
alias> { + def _gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps), opName> { + let Inst{6-0} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + } +} + +multiclass VFLAT_Real_RTN_gfx12<bits<8> op, string ps, string opName> { + def _RTN_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_RTN"), opName> { + let Inst{6-0} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + } +} + +multiclass VFLAT_Real_SADDR_gfx12<bits<8> op, string ps, string opName> { + def _SADDR_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_SADDR"), opName>; +} + +multiclass VFLAT_Real_SADDR_RTN_gfx12<bits<8> op, string ps, string opName> { + def _SADDR_RTN_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_SADDR_RTN"), opName>; +} + +multiclass VFLAT_Real_ST_gfx12<bits<8> op, string ps, string opName> { + def _ST_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_ST"), opName> { + let Inst{6-0} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + let OtherPredicates = [HasFlatScratchSTMode]; + } +} + +multiclass VFLAT_Real_SVS_gfx12<bits<8> op, string ps, string opName> { + def _SVS_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_SVS"), opName> { + let OtherPredicates = [HasFlatScratchSVSMode]; + } +} + +multiclass VFLAT_Real_Atomics_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> : + VFLAT_Real_Base_gfx12<op, ps, opName, renamed, alias>, + VFLAT_Real_RTN_gfx12<op, ps, opName>; + +multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> : + VFLAT_Real_Base_gfx12<op, ps, opName, renamed, alias>, + VFLAT_Real_SADDR_gfx12<op, ps, opName>; + +multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> : + VGLOBAL_Real_AllAddr_gfx12<op, ps, opName, renamed, alias>, + VFLAT_Real_RTN_gfx12<op, ps, opName>, + VFLAT_Real_SADDR_RTN_gfx12<op, ps, opName>; + +multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op, string ps, string opName, int renamed = false> : + VFLAT_Real_Base_gfx12<op, ps, opName, renamed>, + VFLAT_Real_SADDR_gfx12<op, ps, opName>, + VFLAT_Real_ST_gfx12<op, ps, opName>, + VFLAT_Real_SVS_gfx12<op, ps, opName>; + +// ENC_VFLAT. 
+defm FLAT_LOAD_U8 : VFLAT_Real_Base_gfx12<0x010, "FLAT_LOAD_UBYTE", "flat_load_u8", true>; +defm FLAT_LOAD_I8 : VFLAT_Real_Base_gfx12<0x011, "FLAT_LOAD_SBYTE", "flat_load_i8", true>; +defm FLAT_LOAD_U16 : VFLAT_Real_Base_gfx12<0x012, "FLAT_LOAD_USHORT", "flat_load_u16", true>; +defm FLAT_LOAD_I16 : VFLAT_Real_Base_gfx12<0x013, "FLAT_LOAD_SSHORT", "flat_load_i16", true>; +defm FLAT_LOAD_B32 : VFLAT_Real_Base_gfx12<0x014, "FLAT_LOAD_DWORD", "flat_load_b32", true>; +defm FLAT_LOAD_B64 : VFLAT_Real_Base_gfx12<0x015, "FLAT_LOAD_DWORDX2", "flat_load_b64", true>; +defm FLAT_LOAD_B96 : VFLAT_Real_Base_gfx12<0x016, "FLAT_LOAD_DWORDX3", "flat_load_b96", true>; +defm FLAT_LOAD_B128 : VFLAT_Real_Base_gfx12<0x017, "FLAT_LOAD_DWORDX4", "flat_load_b128", true>; +defm FLAT_STORE_B8 : VFLAT_Real_Base_gfx12<0x018, "FLAT_STORE_BYTE", "flat_store_b8", true>; +defm FLAT_STORE_B16 : VFLAT_Real_Base_gfx12<0x019, "FLAT_STORE_SHORT", "flat_store_b16", true>; +defm FLAT_STORE_B32 : VFLAT_Real_Base_gfx12<0x01a, "FLAT_STORE_DWORD", "flat_store_b32", true>; +defm FLAT_STORE_B64 : VFLAT_Real_Base_gfx12<0x01b, "FLAT_STORE_DWORDX2", "flat_store_b64", true>; +defm FLAT_STORE_B96 : VFLAT_Real_Base_gfx12<0x01c, "FLAT_STORE_DWORDX3", "flat_store_b96", true>; +defm FLAT_STORE_B128 : VFLAT_Real_Base_gfx12<0x01d, "FLAT_STORE_DWORDX4", "flat_store_b128", true>; +defm FLAT_LOAD_D16_U8 : VFLAT_Real_Base_gfx12<0x01e, "FLAT_LOAD_UBYTE_D16", "flat_load_d16_u8">; +defm FLAT_LOAD_D16_I8 : VFLAT_Real_Base_gfx12<0x01f, "FLAT_LOAD_SBYTE_D16", "flat_load_d16_i8">; +defm FLAT_LOAD_D16_B16 : VFLAT_Real_Base_gfx12<0x020, "FLAT_LOAD_SHORT_D16", "flat_load_d16_b16">; +defm FLAT_LOAD_D16_HI_U8 : VFLAT_Real_Base_gfx12<0x021, "FLAT_LOAD_UBYTE_D16_HI", "flat_load_d16_hi_u8">; +defm FLAT_LOAD_D16_HI_I8 : VFLAT_Real_Base_gfx12<0x022, "FLAT_LOAD_SBYTE_D16_HI", "flat_load_d16_hi_i8">; +defm FLAT_LOAD_D16_HI_B16 : VFLAT_Real_Base_gfx12<0x023, "FLAT_LOAD_SHORT_D16_HI", "flat_load_d16_hi_b16">; +defm FLAT_STORE_D16_HI_B8 : VFLAT_Real_Base_gfx12<0x024, "FLAT_STORE_BYTE_D16_HI", "flat_store_d16_hi_b8">; +defm FLAT_STORE_D16_HI_B16 : VFLAT_Real_Base_gfx12<0x025, "FLAT_STORE_SHORT_D16_HI", "flat_store_d16_hi_b16">; +defm FLAT_ATOMIC_SWAP_B32 : VFLAT_Real_Atomics_gfx12<0x033, "FLAT_ATOMIC_SWAP", "flat_atomic_swap_b32", true>; +defm FLAT_ATOMIC_CMPSWAP_B32 : VFLAT_Real_Atomics_gfx12<0x034, "FLAT_ATOMIC_CMPSWAP", "flat_atomic_cmpswap_b32", true>; +defm FLAT_ATOMIC_ADD_U32 : VFLAT_Real_Atomics_gfx12<0x035, "FLAT_ATOMIC_ADD", "flat_atomic_add_u32", true>; +defm FLAT_ATOMIC_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x036, "FLAT_ATOMIC_SUB", "flat_atomic_sub_u32", true>; +defm FLAT_ATOMIC_SUB_CLAMP_U32 : VFLAT_Real_Atomics_gfx12<0x037, "FLAT_ATOMIC_CSUB_U32", "flat_atomic_sub_clamp_u32", true>; +defm FLAT_ATOMIC_MIN_I32 : VFLAT_Real_Atomics_gfx12<0x038, "FLAT_ATOMIC_SMIN", "flat_atomic_min_i32", true>; +defm FLAT_ATOMIC_MIN_U32 : VFLAT_Real_Atomics_gfx12<0x039, "FLAT_ATOMIC_UMIN", "flat_atomic_min_u32", true>; +defm FLAT_ATOMIC_MAX_I32 : VFLAT_Real_Atomics_gfx12<0x03a, "FLAT_ATOMIC_SMAX", "flat_atomic_max_i32", true>; +defm FLAT_ATOMIC_MAX_U32 : VFLAT_Real_Atomics_gfx12<0x03b, "FLAT_ATOMIC_UMAX", "flat_atomic_max_u32", true>; +defm FLAT_ATOMIC_AND_B32 : VFLAT_Real_Atomics_gfx12<0x03c, "FLAT_ATOMIC_AND", "flat_atomic_and_b32", true>; +defm FLAT_ATOMIC_OR_B32 : VFLAT_Real_Atomics_gfx12<0x03d, "FLAT_ATOMIC_OR", "flat_atomic_or_b32", true>; +defm FLAT_ATOMIC_XOR_B32 : VFLAT_Real_Atomics_gfx12<0x03e, "FLAT_ATOMIC_XOR", "flat_atomic_xor_b32", true>; +defm FLAT_ATOMIC_INC_U32 : 
VFLAT_Real_Atomics_gfx12<0x03f, "FLAT_ATOMIC_INC", "flat_atomic_inc_u32", true>; +defm FLAT_ATOMIC_DEC_U32 : VFLAT_Real_Atomics_gfx12<0x040, "FLAT_ATOMIC_DEC", "flat_atomic_dec_u32", true>; +defm FLAT_ATOMIC_SWAP_B64 : VFLAT_Real_Atomics_gfx12<0x041, "FLAT_ATOMIC_SWAP_X2", "flat_atomic_swap_b64", true>; +defm FLAT_ATOMIC_CMPSWAP_B64 : VFLAT_Real_Atomics_gfx12<0x042, "FLAT_ATOMIC_CMPSWAP_X2", "flat_atomic_cmpswap_b64", true>; +defm FLAT_ATOMIC_ADD_U64 : VFLAT_Real_Atomics_gfx12<0x043, "FLAT_ATOMIC_ADD_X2", "flat_atomic_add_u64", true>; +defm FLAT_ATOMIC_SUB_U64 : VFLAT_Real_Atomics_gfx12<0x044, "FLAT_ATOMIC_SUB_X2", "flat_atomic_sub_u64", true>; +defm FLAT_ATOMIC_MIN_I64 : VFLAT_Real_Atomics_gfx12<0x045, "FLAT_ATOMIC_SMIN_X2", "flat_atomic_min_i64", true>; +defm FLAT_ATOMIC_MIN_U64 : VFLAT_Real_Atomics_gfx12<0x046, "FLAT_ATOMIC_UMIN_X2", "flat_atomic_min_u64", true>; +defm FLAT_ATOMIC_MAX_I64 : VFLAT_Real_Atomics_gfx12<0x047, "FLAT_ATOMIC_SMAX_X2", "flat_atomic_max_i64", true>; +defm FLAT_ATOMIC_MAX_U64 : VFLAT_Real_Atomics_gfx12<0x048, "FLAT_ATOMIC_UMAX_X2", "flat_atomic_max_u64", true>; +defm FLAT_ATOMIC_AND_B64 : VFLAT_Real_Atomics_gfx12<0x049, "FLAT_ATOMIC_AND_X2", "flat_atomic_and_b64", true>; +defm FLAT_ATOMIC_OR_B64 : VFLAT_Real_Atomics_gfx12<0x04a, "FLAT_ATOMIC_OR_X2", "flat_atomic_or_b64", true>; +defm FLAT_ATOMIC_XOR_B64 : VFLAT_Real_Atomics_gfx12<0x04b, "FLAT_ATOMIC_XOR_X2", "flat_atomic_xor_b64", true>; +defm FLAT_ATOMIC_INC_U64 : VFLAT_Real_Atomics_gfx12<0x04c, "FLAT_ATOMIC_INC_X2", "flat_atomic_inc_u64", true>; +defm FLAT_ATOMIC_DEC_U64 : VFLAT_Real_Atomics_gfx12<0x04d, "FLAT_ATOMIC_DEC_X2", "flat_atomic_dec_u64", true>; +defm FLAT_ATOMIC_MIN_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x051, "FLAT_ATOMIC_FMIN", "flat_atomic_min_num_f32", true, "flat_atomic_min_f32">; +defm FLAT_ATOMIC_MAX_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x052, "FLAT_ATOMIC_FMAX", "flat_atomic_max_num_f32", true, "flat_atomic_max_f32">; +defm FLAT_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056, "FLAT_ATOMIC_ADD_F32", "flat_atomic_add_f32">; + +// ENC_VGLOBAL. 
+defm GLOBAL_LOAD_U8 : VGLOBAL_Real_AllAddr_gfx12<0x010, "GLOBAL_LOAD_UBYTE", "global_load_u8", true>; +defm GLOBAL_LOAD_I8 : VGLOBAL_Real_AllAddr_gfx12<0x011, "GLOBAL_LOAD_SBYTE", "global_load_i8", true>; +defm GLOBAL_LOAD_U16 : VGLOBAL_Real_AllAddr_gfx12<0x012, "GLOBAL_LOAD_USHORT", "global_load_u16", true>; +defm GLOBAL_LOAD_I16 : VGLOBAL_Real_AllAddr_gfx12<0x013, "GLOBAL_LOAD_SSHORT", "global_load_i16", true>; +defm GLOBAL_LOAD_B32 : VGLOBAL_Real_AllAddr_gfx12<0x014, "GLOBAL_LOAD_DWORD", "global_load_b32", true>; +defm GLOBAL_LOAD_B64 : VGLOBAL_Real_AllAddr_gfx12<0x015, "GLOBAL_LOAD_DWORDX2", "global_load_b64", true>; +defm GLOBAL_LOAD_B96 : VGLOBAL_Real_AllAddr_gfx12<0x016, "GLOBAL_LOAD_DWORDX3", "global_load_b96", true>; +defm GLOBAL_LOAD_B128 : VGLOBAL_Real_AllAddr_gfx12<0x017, "GLOBAL_LOAD_DWORDX4", "global_load_b128", true>; +defm GLOBAL_STORE_B8 : VGLOBAL_Real_AllAddr_gfx12<0x018, "GLOBAL_STORE_BYTE", "global_store_b8", true>; +defm GLOBAL_STORE_B16 : VGLOBAL_Real_AllAddr_gfx12<0x019, "GLOBAL_STORE_SHORT", "global_store_b16", true>; +defm GLOBAL_STORE_B32 : VGLOBAL_Real_AllAddr_gfx12<0x01a, "GLOBAL_STORE_DWORD", "global_store_b32", true>; +defm GLOBAL_STORE_B64 : VGLOBAL_Real_AllAddr_gfx12<0x01b, "GLOBAL_STORE_DWORDX2", "global_store_b64", true>; +defm GLOBAL_STORE_B96 : VGLOBAL_Real_AllAddr_gfx12<0x01c, "GLOBAL_STORE_DWORDX3", "global_store_b96", true>; +defm GLOBAL_STORE_B128 : VGLOBAL_Real_AllAddr_gfx12<0x01d, "GLOBAL_STORE_DWORDX4", "global_store_b128", true>; +defm GLOBAL_LOAD_D16_U8 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "GLOBAL_LOAD_UBYTE_D16", "global_load_d16_u8">; +defm GLOBAL_LOAD_D16_I8 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "GLOBAL_LOAD_SBYTE_D16", "global_load_d16_i8">; +defm GLOBAL_LOAD_D16_B16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "GLOBAL_LOAD_SHORT_D16", "global_load_d16_b16">; +defm GLOBAL_LOAD_D16_HI_U8 : VGLOBAL_Real_AllAddr_gfx12<0x021, "GLOBAL_LOAD_UBYTE_D16_HI", "global_load_d16_hi_u8">; +defm GLOBAL_LOAD_D16_HI_I8 : VGLOBAL_Real_AllAddr_gfx12<0x022, "GLOBAL_LOAD_SBYTE_D16_HI", "global_load_d16_hi_i8">; +defm GLOBAL_LOAD_D16_HI_B16 : VGLOBAL_Real_AllAddr_gfx12<0x023, "GLOBAL_LOAD_SHORT_D16_HI", "global_load_d16_hi_b16">; +defm GLOBAL_STORE_D16_HI_B8 : VGLOBAL_Real_AllAddr_gfx12<0x024, "GLOBAL_STORE_BYTE_D16_HI", "global_store_d16_hi_b8">; +defm GLOBAL_STORE_D16_HI_B16 : VGLOBAL_Real_AllAddr_gfx12<0x025, "GLOBAL_STORE_SHORT_D16_HI", "global_store_d16_hi_b16">; +defm GLOBAL_LOAD_ADDTID_B32 : VGLOBAL_Real_AllAddr_gfx12<0x028, "GLOBAL_LOAD_DWORD_ADDTID", "global_load_addtid_b32">; +defm GLOBAL_STORE_ADDTID_B32 : VGLOBAL_Real_AllAddr_gfx12<0x029, "GLOBAL_STORE_DWORD_ADDTID", "global_store_addtid_b32">; + +defm GLOBAL_ATOMIC_SWAP_B32 : VGLOBAL_Real_Atomics_gfx12<0x033, "GLOBAL_ATOMIC_SWAP", "global_atomic_swap_b32", true>; +defm GLOBAL_ATOMIC_CMPSWAP_B32 : VGLOBAL_Real_Atomics_gfx12<0x034, "GLOBAL_ATOMIC_CMPSWAP", "global_atomic_cmpswap_b32", true>; +defm GLOBAL_ATOMIC_ADD_U32 : VGLOBAL_Real_Atomics_gfx12<0x035, "GLOBAL_ATOMIC_ADD", "global_atomic_add_u32", true>; +defm GLOBAL_ATOMIC_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x036, "GLOBAL_ATOMIC_SUB", "global_atomic_sub_u32", true>; +defm GLOBAL_ATOMIC_SUB_CLAMP_U32 : VGLOBAL_Real_Atomics_gfx12<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_sub_clamp_u32", true, "global_atomic_csub_u32">; +defm GLOBAL_ATOMIC_MIN_I32 : VGLOBAL_Real_Atomics_gfx12<0x038, "GLOBAL_ATOMIC_SMIN", "global_atomic_min_i32", true>; +defm GLOBAL_ATOMIC_MIN_U32 : VGLOBAL_Real_Atomics_gfx12<0x039, "GLOBAL_ATOMIC_UMIN", "global_atomic_min_u32", true>; +defm 
GLOBAL_ATOMIC_MAX_I32 : VGLOBAL_Real_Atomics_gfx12<0x03a, "GLOBAL_ATOMIC_SMAX", "global_atomic_max_i32", true>; +defm GLOBAL_ATOMIC_MAX_U32 : VGLOBAL_Real_Atomics_gfx12<0x03b, "GLOBAL_ATOMIC_UMAX", "global_atomic_max_u32", true>; +defm GLOBAL_ATOMIC_AND_B32 : VGLOBAL_Real_Atomics_gfx12<0x03c, "GLOBAL_ATOMIC_AND", "global_atomic_and_b32", true>; +defm GLOBAL_ATOMIC_OR_B32 : VGLOBAL_Real_Atomics_gfx12<0x03d, "GLOBAL_ATOMIC_OR", "global_atomic_or_b32", true>; +defm GLOBAL_ATOMIC_XOR_B32 : VGLOBAL_Real_Atomics_gfx12<0x03e, "GLOBAL_ATOMIC_XOR", "global_atomic_xor_b32", true>; +defm GLOBAL_ATOMIC_INC_U32 : VGLOBAL_Real_Atomics_gfx12<0x03f, "GLOBAL_ATOMIC_INC", "global_atomic_inc_u32", true>; +defm GLOBAL_ATOMIC_DEC_U32 : VGLOBAL_Real_Atomics_gfx12<0x040, "GLOBAL_ATOMIC_DEC", "global_atomic_dec_u32", true>; +defm GLOBAL_ATOMIC_SWAP_B64 : VGLOBAL_Real_Atomics_gfx12<0x041, "GLOBAL_ATOMIC_SWAP_X2", "global_atomic_swap_b64", true>; +defm GLOBAL_ATOMIC_CMPSWAP_B64 : VGLOBAL_Real_Atomics_gfx12<0x042, "GLOBAL_ATOMIC_CMPSWAP_X2", "global_atomic_cmpswap_b64", true>; +defm GLOBAL_ATOMIC_ADD_U64 : VGLOBAL_Real_Atomics_gfx12<0x043, "GLOBAL_ATOMIC_ADD_X2", "global_atomic_add_u64", true>; +defm GLOBAL_ATOMIC_SUB_U64 : VGLOBAL_Real_Atomics_gfx12<0x044, "GLOBAL_ATOMIC_SUB_X2", "global_atomic_sub_u64", true>; +defm GLOBAL_ATOMIC_MIN_I64 : VGLOBAL_Real_Atomics_gfx12<0x045, "GLOBAL_ATOMIC_SMIN_X2", "global_atomic_min_i64", true>; +defm GLOBAL_ATOMIC_MIN_U64 : VGLOBAL_Real_Atomics_gfx12<0x046, "GLOBAL_ATOMIC_UMIN_X2", "global_atomic_min_u64", true>; +defm GLOBAL_ATOMIC_MAX_I64 : VGLOBAL_Real_Atomics_gfx12<0x047, "GLOBAL_ATOMIC_SMAX_X2", "global_atomic_max_i64", true>; +defm GLOBAL_ATOMIC_MAX_U64 : VGLOBAL_Real_Atomics_gfx12<0x048, "GLOBAL_ATOMIC_UMAX_X2", "global_atomic_max_u64", true>; +defm GLOBAL_ATOMIC_AND_B64 : VGLOBAL_Real_Atomics_gfx12<0x049, "GLOBAL_ATOMIC_AND_X2", "global_atomic_and_b64", true>; +defm GLOBAL_ATOMIC_OR_B64 : VGLOBAL_Real_Atomics_gfx12<0x04a, "GLOBAL_ATOMIC_OR_X2", "global_atomic_or_b64", true>; +defm GLOBAL_ATOMIC_XOR_B64 : VGLOBAL_Real_Atomics_gfx12<0x04b, "GLOBAL_ATOMIC_XOR_X2", "global_atomic_xor_b64", true>; +defm GLOBAL_ATOMIC_INC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04c, "GLOBAL_ATOMIC_INC_X2", "global_atomic_inc_u64", true>; +defm GLOBAL_ATOMIC_DEC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_ATOMIC_DEC_X2", "global_atomic_dec_u64", true>; +defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">; +defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">; +defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">; + +// ENC_VSCRATCH. 
+defm SCRATCH_LOAD_U8 : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>; +defm SCRATCH_LOAD_I8 : VSCRATCH_Real_AllAddr_gfx12<0x11, "SCRATCH_LOAD_SBYTE", "scratch_load_i8", true>; +defm SCRATCH_LOAD_U16 : VSCRATCH_Real_AllAddr_gfx12<0x12, "SCRATCH_LOAD_USHORT", "scratch_load_u16", true>; +defm SCRATCH_LOAD_I16 : VSCRATCH_Real_AllAddr_gfx12<0x13, "SCRATCH_LOAD_SSHORT", "scratch_load_i16", true>; +defm SCRATCH_LOAD_B32 : VSCRATCH_Real_AllAddr_gfx12<0x14, "SCRATCH_LOAD_DWORD", "scratch_load_b32", true>; +defm SCRATCH_LOAD_B64 : VSCRATCH_Real_AllAddr_gfx12<0x15, "SCRATCH_LOAD_DWORDX2", "scratch_load_b64", true>; +defm SCRATCH_LOAD_B96 : VSCRATCH_Real_AllAddr_gfx12<0x16, "SCRATCH_LOAD_DWORDX3", "scratch_load_b96", true>; +defm SCRATCH_LOAD_B128 : VSCRATCH_Real_AllAddr_gfx12<0x17, "SCRATCH_LOAD_DWORDX4", "scratch_load_b128", true>; +defm SCRATCH_STORE_B8 : VSCRATCH_Real_AllAddr_gfx12<0x18, "SCRATCH_STORE_BYTE", "scratch_store_b8", true>; +defm SCRATCH_STORE_B16 : VSCRATCH_Real_AllAddr_gfx12<0x19, "SCRATCH_STORE_SHORT", "scratch_store_b16", true>; +defm SCRATCH_STORE_B32 : VSCRATCH_Real_AllAddr_gfx12<0x1a, "SCRATCH_STORE_DWORD", "scratch_store_b32", true>; +defm SCRATCH_STORE_B64 : VSCRATCH_Real_AllAddr_gfx12<0x1b, "SCRATCH_STORE_DWORDX2", "scratch_store_b64", true>; +defm SCRATCH_STORE_B96 : VSCRATCH_Real_AllAddr_gfx12<0x1c, "SCRATCH_STORE_DWORDX3", "scratch_store_b96", true>; +defm SCRATCH_STORE_B128 : VSCRATCH_Real_AllAddr_gfx12<0x1d, "SCRATCH_STORE_DWORDX4", "scratch_store_b128", true>; +defm SCRATCH_LOAD_D16_U8 : VSCRATCH_Real_AllAddr_gfx12<0x1e, "SCRATCH_LOAD_UBYTE_D16", "scratch_load_d16_u8">; +defm SCRATCH_LOAD_D16_I8 : VSCRATCH_Real_AllAddr_gfx12<0x1f, "SCRATCH_LOAD_SBYTE_D16", "scratch_load_d16_i8">; +defm SCRATCH_LOAD_D16_B16 : VSCRATCH_Real_AllAddr_gfx12<0x20, "SCRATCH_LOAD_SHORT_D16", "scratch_load_d16_b16">; +defm SCRATCH_LOAD_D16_HI_U8 : VSCRATCH_Real_AllAddr_gfx12<0x21, "SCRATCH_LOAD_UBYTE_D16_HI", "scratch_load_d16_hi_u8">; +defm SCRATCH_LOAD_D16_HI_I8 : VSCRATCH_Real_AllAddr_gfx12<0x22, "SCRATCH_LOAD_SBYTE_D16_HI", "scratch_load_d16_hi_i8">; +defm SCRATCH_LOAD_D16_HI_B16 : VSCRATCH_Real_AllAddr_gfx12<0x23, "SCRATCH_LOAD_SHORT_D16_HI", "scratch_load_d16_hi_b16">; +defm SCRATCH_STORE_D16_HI_B8 : VSCRATCH_Real_AllAddr_gfx12<0x24, "SCRATCH_STORE_BYTE_D16_HI", "scratch_store_d16_hi_b8">; +defm SCRATCH_STORE_D16_HI_B16 : VSCRATCH_Real_AllAddr_gfx12<0x25, "SCRATCH_STORE_SHORT_D16_HI", "scratch_store_d16_hi_b16">; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp index c9e0c6849568..05e10a95b157 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -25,7 +25,6 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringMap.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" @@ -72,8 +71,11 @@ public: auto *SecondMI = CI.SecondMI; unsigned Opc1 = FirstMI->getOpcode(); unsigned Opc2 = SecondMI->getOpcode(); - int NewOpcode = AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1), - AMDGPU::getVOPDOpcode(Opc2)); + unsigned EncodingFamily = + AMDGPU::getVOPDEncodingFamily(SII->getSubtarget()); + int NewOpcode = + AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1), + AMDGPU::getVOPDOpcode(Opc2), EncodingFamily); assert(NewOpcode != -1 && "Should have 
previously determined this as a possible VOPD\n"); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index 2592584b89c6..a75082268c77 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -191,6 +191,16 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { return &OldOpnd; } +[[maybe_unused]] static unsigned getOperandSize(MachineInstr &MI, unsigned Idx, + MachineRegisterInfo &MRI) { + int16_t RegClass = MI.getDesc().operands()[Idx].RegClass; + if (RegClass == -1) + return 0; + + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass)); +} + MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR, @@ -278,6 +288,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, } auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0); assert(Src0); + int Src0Idx = NumOperands; if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) { LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n"); Fail = true; @@ -301,7 +312,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, } auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); if (Src1) { - if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) { + int OpNum = NumOperands; + // If subtarget does not support SGPRs for src1 operand then the + // requirements are the same as for src0. We check src0 instead because + // pseudos are shared between subtargets and allow SGPR for src1 on all. + if (!ST->hasDPPSrc1SGPR()) { + assert(getOperandSize(*DPPInst, Src0Idx, *MRI) == + getOperandSize(*DPPInst, NumOperands, *MRI) && + "Src0 and Src1 operands should have the same size"); + OpNum = Src0Idx; + } + if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) { LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n"); Fail = true; break; @@ -505,7 +526,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) { auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl); assert(DppCtrl && DppCtrl->isImm()); - if (!AMDGPU::isLegal64BitDPPControl(DppCtrl->getImm())) { + if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) { LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported" " control value\n"); // Let it split, then control may become legal. 
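A note on the true16 operand decoding introduced in the AMDGPUDisassembler hunks earlier in this diff: the new decodeOperand_VSrcT16 helper reads a 10-bit source field in which bit 8 marks a VGPR source, bit 9 selects the high 16-bit half, and the low 8 bits give the VGPR index. The stand-alone C++ sketch below models only that bit layout; the VGPR16 struct, the decodeTrue16Vgpr name, and the printed "v5.h" syntax are illustrative inventions, not part of the patch.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Illustrative decode of the 10-bit VSrcT16-style source field described above.
struct VGPR16 {
  unsigned RegIdx; // low 8 bits of the encoding
  bool IsHi;       // bit 9: high 16-bit half of the 32-bit VGPR
};

static VGPR16 decodeTrue16Vgpr(uint32_t Imm) {
  assert(Imm < (1u << 10) && "10-bit encoding expected");
  assert((Imm & (1u << 8)) && "bit 8 set means the source is a VGPR");
  return {Imm & 0xffu, (Imm & (1u << 9)) != 0};
}

int main() {
  VGPR16 R = decodeTrue16Vgpr(0x305);                    // bits 9 and 8 set, index 5
  std::printf("v%u.%c\n", R.RegIdx, R.IsHi ? 'h' : 'l'); // prints v5.h
  return 0;
}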
@@ -728,7 +749,7 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) { ++NumDPPMovsCombined; } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) { - if (ST->has64BitDPP() && combineDPPMov(MI)) { + if (ST->hasDPALU_DPP() && combineDPPMov(MI)) { Changed = true; ++NumDPPMovsCombined; } else { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 2d53b2a70dbe..a7d8ff0242b8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -163,7 +163,9 @@ static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, static bool isPermlane(const MachineInstr &MI) { unsigned Opcode = MI.getOpcode(); return Opcode == AMDGPU::V_PERMLANE16_B32_e64 || - Opcode == AMDGPU::V_PERMLANEX16_B32_e64; + Opcode == AMDGPU::V_PERMLANEX16_B32_e64 || + Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 || + Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64; } static bool isLdsDma(const MachineInstr &MI) { @@ -271,7 +273,7 @@ GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const { const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI); assert(TSchedModel.getWriteProcResBegin(SC) != TSchedModel.getWriteProcResEnd(SC)); - return TSchedModel.getWriteProcResBegin(SC)->Cycles; + return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle; } void GCNHazardRecognizer::processBundle() { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index d89c9b1febde..cdc9de7f65e3 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -251,7 +251,7 @@ GCNIterativeScheduler::getRegionPressure(MachineBasicBlock::iterator Begin, assert(UPTracker.isValid() || (dbgs() << "Tracked region ", printRegion(dbgs(), Begin, End, LIS), false)); - return UPTracker.moveMaxPressure(); + return UPTracker.getMaxPressureAndReset(); } // returns max pressure for a tentative schedule @@ -272,7 +272,7 @@ GCNIterativeScheduler::getSchedulePressure(const Region &R, for (auto I = Schedule.end(), B = Schedule.begin(); I != B;) { RPTracker.recede(*getMachineInstr(*--I)); } - return RPTracker.moveMaxPressure(); + return RPTracker.getMaxPressureAndReset(); } void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overridden diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td index b9c9358f88b9..96af1a6aab3d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -9,11 +9,11 @@ // The code produced for "generic" is only useful for tests and cannot // reasonably be expected to execute on any particular target. 
def : ProcessorModel<"generic", NoSchedModel, - [FeatureWavefrontSize64] + [FeatureWavefrontSize64, FeatureGDS, FeatureGWS] >; def : ProcessorModel<"generic-hsa", NoSchedModel, - [FeatureWavefrontSize64, FeatureFlatAddressSpace] + [FeatureWavefrontSize64, FeatureGDS, FeatureGWS, FeatureFlatAddressSpace] >; //===------------------------------------------------------------===// @@ -279,3 +279,15 @@ def : ProcessorModel<"gfx1150", GFX11SpeedModel, def : ProcessorModel<"gfx1151", GFX11SpeedModel, FeatureISAVersion11_5_1.Features >; + +//===----------------------------------------------------------------------===// +// GCN GFX12. +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx1200", GFX12SpeedModel, + FeatureISAVersion12.Features +>; + +def : ProcessorModel<"gfx1201", GFX12SpeedModel, + FeatureISAVersion12.Features +>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 68cf97170369..fd8f0bebd3be 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "GCNRegPressure.h" +#include "AMDGPU.h" #include "llvm/CodeGen/RegisterPressure.h" using namespace llvm; @@ -31,7 +32,6 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1, return true; } - /////////////////////////////////////////////////////////////////////////////// // GCNRegPressure @@ -78,7 +78,9 @@ void GCNRegPressure::inc(unsigned Reg, if (PrevMask.none()) { assert(NewMask.any()); - Value[Kind] += Sign * MRI.getPressureSets(Reg).getWeight(); + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + Value[Kind] += + Sign * TRI->getRegClassWeight(MRI.getRegClass(Reg)).RegWeight; } break; @@ -133,8 +135,6 @@ bool GCNRegPressure::less(const GCNSubtarget &ST, O.getVGPRNum(ST.hasGFX90AInsts())); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) { return Printable([&RP, ST](raw_ostream &OS) { OS << "VGPRs: " << RP.Value[GCNRegPressure::VGPR32] << ' ' @@ -153,7 +153,6 @@ Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) { OS << '\n'; }); } -#endif static LaneBitmask getDefRegMask(const MachineOperand &MO, const MachineRegisterInfo &MRI) { @@ -167,66 +166,60 @@ static LaneBitmask getDefRegMask(const MachineOperand &MO, MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg()); } -static LaneBitmask getUsedRegMask(const MachineOperand &MO, - const MachineRegisterInfo &MRI, - const LiveIntervals &LIS) { - assert(MO.isUse() && MO.isReg() && MO.getReg().isVirtual()); - - if (auto SubReg = MO.getSubReg()) - return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg); - - auto MaxMask = MRI.getMaxLaneMaskForVReg(MO.getReg()); - if (SIRegisterInfo::getNumCoveredRegs(MaxMask) > 1) // cannot have subregs - return MaxMask; - - // For a tentative schedule LIS isn't updated yet but livemask should remain - // the same on any schedule. Subreg defs can be reordered but they all must - // dominate uses anyway. 
- auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex(); - return getLiveLaneMask(MO.getReg(), SI, LIS, MRI); -} - -static SmallVector<RegisterMaskPair, 8> -collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS, +static void +collectVirtualRegUses(SmallVectorImpl<RegisterMaskPair> &RegMaskPairs, + const MachineInstr &MI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI) { - SmallVector<RegisterMaskPair, 8> Res; + SlotIndex InstrSI; for (const auto &MO : MI.operands()) { if (!MO.isReg() || !MO.getReg().isVirtual()) continue; if (!MO.isUse() || !MO.readsReg()) continue; - auto const UsedMask = getUsedRegMask(MO, MRI, LIS); + Register Reg = MO.getReg(); + if (llvm::any_of(RegMaskPairs, [Reg](const RegisterMaskPair &RM) { + return RM.RegUnit == Reg; + })) + continue; - auto Reg = MO.getReg(); - auto I = llvm::find_if( - Res, [Reg](const RegisterMaskPair &RM) { return RM.RegUnit == Reg; }); - if (I != Res.end()) - I->LaneMask |= UsedMask; - else - Res.push_back(RegisterMaskPair(Reg, UsedMask)); + LaneBitmask UseMask; + auto &LI = LIS.getInterval(Reg); + if (!LI.hasSubRanges()) + UseMask = MRI.getMaxLaneMaskForVReg(Reg); + else { + // For a tentative schedule LIS isn't updated yet but livemask should + // remain the same on any schedule. Subreg defs can be reordered but they + // all must dominate uses anyway. + if (!InstrSI) + InstrSI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex(); + UseMask = getLiveLaneMask(LI, InstrSI, MRI); + } + + RegMaskPairs.emplace_back(Reg, UseMask); } - return Res; } /////////////////////////////////////////////////////////////////////////////// // GCNRPTracker -LaneBitmask llvm::getLiveLaneMask(unsigned Reg, - SlotIndex SI, +LaneBitmask llvm::getLiveLaneMask(unsigned Reg, SlotIndex SI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI) { + return getLiveLaneMask(LIS.getInterval(Reg), SI, MRI); +} + +LaneBitmask llvm::getLiveLaneMask(const LiveInterval &LI, SlotIndex SI, + const MachineRegisterInfo &MRI) { LaneBitmask LiveMask; - const auto &LI = LIS.getInterval(Reg); if (LI.hasSubRanges()) { for (const auto &S : LI.subranges()) if (S.liveAt(SI)) { LiveMask |= S.LaneMask; - assert(LiveMask < MRI.getMaxLaneMaskForVReg(Reg) || - LiveMask == MRI.getMaxLaneMaskForVReg(Reg)); + assert(LiveMask == (LiveMask & MRI.getMaxLaneMaskForVReg(LI.reg()))); } } else if (LI.liveAt(SI)) { - LiveMask = MRI.getMaxLaneMaskForVReg(Reg); + LiveMask = MRI.getMaxLaneMaskForVReg(LI.reg()); } return LiveMask; } @@ -262,9 +255,15 @@ void GCNRPTracker::reset(const MachineInstr &MI, MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); } -void GCNUpwardRPTracker::reset(const MachineInstr &MI, - const LiveRegSet *LiveRegsCopy) { - GCNRPTracker::reset(MI, LiveRegsCopy, true); +//////////////////////////////////////////////////////////////////////////////// +// GCNUpwardRPTracker + +void GCNUpwardRPTracker::reset(const MachineRegisterInfo &MRI_, + const LiveRegSet &LiveRegs_) { + MRI = &MRI_; + LiveRegs = LiveRegs_; + LastTrackedMI = nullptr; + MaxPressure = CurPressure = getRegPressure(MRI_, LiveRegs_); } void GCNUpwardRPTracker::recede(const MachineInstr &MI) { @@ -275,41 +274,61 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { if (MI.isDebugInstr()) return; - auto const RegUses = collectVirtualRegUses(MI, LIS, *MRI); + // Kill all defs. 
+ GCNRegPressure DefPressure, ECDefPressure; + bool HasECDefs = false; + for (const MachineOperand &MO : MI.all_defs()) { + if (!MO.getReg().isVirtual()) + continue; - // calc pressure at the MI (defs + uses) - auto AtMIPressure = CurPressure; - for (const auto &U : RegUses) { - auto LiveMask = LiveRegs[U.RegUnit]; - AtMIPressure.inc(U.RegUnit, LiveMask, LiveMask | U.LaneMask, *MRI); - } - // update max pressure - MaxPressure = max(AtMIPressure, MaxPressure); + Register Reg = MO.getReg(); + LaneBitmask DefMask = getDefRegMask(MO, *MRI); - for (const auto &MO : MI.all_defs()) { - if (!MO.getReg().isVirtual() || MO.isDead()) - continue; + // Treat a def as fully live at the moment of definition: keep a record. + if (MO.isEarlyClobber()) { + ECDefPressure.inc(Reg, LaneBitmask::getNone(), DefMask, *MRI); + HasECDefs = true; + } else + DefPressure.inc(Reg, LaneBitmask::getNone(), DefMask, *MRI); - auto Reg = MO.getReg(); auto I = LiveRegs.find(Reg); if (I == LiveRegs.end()) continue; - auto &LiveMask = I->second; - auto PrevMask = LiveMask; - LiveMask &= ~getDefRegMask(MO, *MRI); + + LaneBitmask &LiveMask = I->second; + LaneBitmask PrevMask = LiveMask; + LiveMask &= ~DefMask; CurPressure.inc(Reg, PrevMask, LiveMask, *MRI); if (LiveMask.none()) LiveRegs.erase(I); } - for (const auto &U : RegUses) { - auto &LiveMask = LiveRegs[U.RegUnit]; - auto PrevMask = LiveMask; + + // Update MaxPressure with defs pressure. + DefPressure += CurPressure; + if (HasECDefs) + DefPressure += ECDefPressure; + MaxPressure = max(DefPressure, MaxPressure); + + // Make uses alive. + SmallVector<RegisterMaskPair, 8> RegUses; + collectVirtualRegUses(RegUses, MI, LIS, *MRI); + for (const RegisterMaskPair &U : RegUses) { + LaneBitmask &LiveMask = LiveRegs[U.RegUnit]; + LaneBitmask PrevMask = LiveMask; LiveMask |= U.LaneMask; CurPressure.inc(U.RegUnit, PrevMask, LiveMask, *MRI); } + + // Update MaxPressure with uses plus early-clobber defs pressure. + MaxPressure = HasECDefs ? 
max(CurPressure + ECDefPressure, MaxPressure) + : max(CurPressure, MaxPressure); + assert(CurPressure == getRegPressure(*MRI, LiveRegs)); } +//////////////////////////////////////////////////////////////////////////////// +// GCNDownwardRPTracker + bool GCNDownwardRPTracker::reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy) { MRI = &MI.getParent()->getParent()->getRegInfo(); @@ -416,19 +435,17 @@ bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator Begin, return advance(End); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, const GCNRPTracker::LiveRegSet &TrackedLR, - const TargetRegisterInfo *TRI) { - return Printable([&LISLR, &TrackedLR, TRI](raw_ostream &OS) { + const TargetRegisterInfo *TRI, StringRef Pfx) { + return Printable([&LISLR, &TrackedLR, TRI, Pfx](raw_ostream &OS) { for (auto const &P : TrackedLR) { auto I = LISLR.find(P.first); if (I == LISLR.end()) { - OS << " " << printReg(P.first, TRI) << ":L" << PrintLaneMask(P.second) + OS << Pfx << printReg(P.first, TRI) << ":L" << PrintLaneMask(P.second) << " isn't found in LIS reported set\n"; } else if (I->second != P.second) { - OS << " " << printReg(P.first, TRI) + OS << Pfx << printReg(P.first, TRI) << " masks doesn't match: LIS reported " << PrintLaneMask(I->second) << ", tracked " << PrintLaneMask(P.second) << '\n'; } @@ -436,7 +453,7 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, for (auto const &P : LISLR) { auto I = TrackedLR.find(P.first); if (I == TrackedLR.end()) { - OS << " " << printReg(P.first, TRI) << ":L" << PrintLaneMask(P.second) + OS << Pfx << printReg(P.first, TRI) << ":L" << PrintLaneMask(P.second) << " isn't found in tracked set\n"; } } @@ -465,7 +482,6 @@ bool GCNUpwardRPTracker::isValid() const { return true; } -LLVM_DUMP_METHOD Printable llvm::print(const GCNRPTracker::LiveRegSet &LiveRegs, const MachineRegisterInfo &MRI) { return Printable([&LiveRegs, &MRI](raw_ostream &OS) { @@ -481,7 +497,163 @@ Printable llvm::print(const GCNRPTracker::LiveRegSet &LiveRegs, }); } -LLVM_DUMP_METHOD void GCNRegPressure::dump() const { dbgs() << print(*this); } -#endif +static cl::opt<bool> UseDownwardTracker( + "amdgpu-print-rp-downward", + cl::desc("Use GCNDownwardRPTracker for GCNRegPressurePrinter pass"), + cl::init(false), cl::Hidden); + +char llvm::GCNRegPressurePrinter::ID = 0; +char &llvm::GCNRegPressurePrinterID = GCNRegPressurePrinter::ID; + +INITIALIZE_PASS(GCNRegPressurePrinter, "amdgpu-print-rp", "", true, true) + +// Return lanemask of Reg's subregs that are live-through at [Begin, End] and +// are fully covered by Mask. 
+static LaneBitmask +getRegLiveThroughMask(const MachineRegisterInfo &MRI, const LiveIntervals &LIS, + Register Reg, SlotIndex Begin, SlotIndex End, + LaneBitmask Mask = LaneBitmask::getAll()) { + + auto IsInOneSegment = [Begin, End](const LiveRange &LR) -> bool { + auto *Segment = LR.getSegmentContaining(Begin); + return Segment && Segment->contains(End); + }; + + LaneBitmask LiveThroughMask; + const LiveInterval &LI = LIS.getInterval(Reg); + if (LI.hasSubRanges()) { + for (auto &SR : LI.subranges()) { + if ((SR.LaneMask & Mask) == SR.LaneMask && IsInOneSegment(SR)) + LiveThroughMask |= SR.LaneMask; + } + } else { + LaneBitmask RegMask = MRI.getMaxLaneMaskForVReg(Reg); + if ((RegMask & Mask) == RegMask && IsInOneSegment(LI)) + LiveThroughMask = RegMask; + } + + return LiveThroughMask; +} + +bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + const LiveIntervals &LIS = getAnalysis<LiveIntervals>(); + + auto &OS = dbgs(); + +// Leading spaces are important for YAML syntax. +#define PFX " " + + OS << "---\nname: " << MF.getName() << "\nbody: |\n"; + + auto printRP = [](const GCNRegPressure &RP) { + return Printable([&RP](raw_ostream &OS) { + OS << format(PFX " %-5d", RP.getSGPRNum()) + << format(" %-5d", RP.getVGPRNum(false)); + }); + }; + + auto ReportLISMismatchIfAny = [&](const GCNRPTracker::LiveRegSet &TrackedLR, + const GCNRPTracker::LiveRegSet &LISLR) { + if (LISLR != TrackedLR) { + OS << PFX " mis LIS: " << llvm::print(LISLR, MRI) + << reportMismatch(LISLR, TrackedLR, TRI, PFX " "); + } + }; + + // Register pressure before and at an instruction (in program order). + SmallVector<std::pair<GCNRegPressure, GCNRegPressure>, 16> RP; + + for (auto &MBB : MF) { + RP.clear(); + RP.reserve(MBB.size()); + + OS << PFX; + MBB.printName(OS); + OS << ":\n"; + + SlotIndex MBBStartSlot = LIS.getSlotIndexes()->getMBBStartIdx(&MBB); + SlotIndex MBBEndSlot = LIS.getSlotIndexes()->getMBBEndIdx(&MBB); + + GCNRPTracker::LiveRegSet LiveIn, LiveOut; + GCNRegPressure RPAtMBBEnd; + + if (UseDownwardTracker) { + if (MBB.empty()) { + LiveIn = LiveOut = getLiveRegs(MBBStartSlot, LIS, MRI); + RPAtMBBEnd = getRegPressure(MRI, LiveIn); + } else { + GCNDownwardRPTracker RPT(LIS); + RPT.reset(MBB.front()); + + LiveIn = RPT.getLiveRegs(); + + while (!RPT.advanceBeforeNext()) { + GCNRegPressure RPBeforeMI = RPT.getPressure(); + RPT.advanceToNext(); + RP.emplace_back(RPBeforeMI, RPT.getPressure()); + } + + LiveOut = RPT.getLiveRegs(); + RPAtMBBEnd = RPT.getPressure(); + } + } else { + GCNUpwardRPTracker RPT(LIS); + RPT.reset(MRI, MBBEndSlot); + + LiveOut = RPT.getLiveRegs(); + RPAtMBBEnd = RPT.getPressure(); + + for (auto &MI : reverse(MBB)) { + RPT.resetMaxPressure(); + RPT.recede(MI); + if (!MI.isDebugInstr()) + RP.emplace_back(RPT.getPressure(), RPT.getMaxPressure()); + } + + LiveIn = RPT.getLiveRegs(); + } + + OS << PFX " Live-in: " << llvm::print(LiveIn, MRI); + if (!UseDownwardTracker) + ReportLISMismatchIfAny(LiveIn, getLiveRegs(MBBStartSlot, LIS, MRI)); + + OS << PFX " SGPR VGPR\n"; + int I = 0; + for (auto &MI : MBB) { + if (!MI.isDebugInstr()) { + auto &[RPBeforeInstr, RPAtInstr] = + RP[UseDownwardTracker ? 
I : (RP.size() - 1 - I)]; + ++I; + OS << printRP(RPBeforeInstr) << '\n' << printRP(RPAtInstr) << " "; + } else + OS << PFX " "; + MI.print(OS); + } + OS << printRP(RPAtMBBEnd) << '\n'; + + OS << PFX " Live-out:" << llvm::print(LiveOut, MRI); + if (UseDownwardTracker) + ReportLISMismatchIfAny(LiveOut, getLiveRegs(MBBEndSlot, LIS, MRI)); + + GCNRPTracker::LiveRegSet LiveThrough; + for (auto [Reg, Mask] : LiveIn) { + LaneBitmask MaskIntersection = Mask & LiveOut.lookup(Reg); + if (MaskIntersection.any()) { + LaneBitmask LTMask = getRegLiveThroughMask( + MRI, LIS, Reg, MBBStartSlot, MBBEndSlot, MaskIntersection); + if (LTMask.any()) + LiveThrough[Reg] = LTMask; + } + } + OS << PFX " Live-thr:" << llvm::print(LiveThrough, MRI); + OS << printRP(getRegPressure(MRI, LiveThrough)) << '\n'; + } + OS << "...\n"; + return false; + +#undef PFX +}
\ No newline at end of file diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 72e18acc1b8e..4100970fe1a9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -85,6 +85,18 @@ struct GCNRegPressure { return !(*this == O); } + GCNRegPressure &operator+=(const GCNRegPressure &RHS) { + for (unsigned I = 0; I < TOTAL_KINDS; ++I) + Value[I] += RHS.Value[I]; + return *this; + } + + GCNRegPressure &operator-=(const GCNRegPressure &RHS) { + for (unsigned I = 0; I < TOTAL_KINDS; ++I) + Value[I] -= RHS.Value[I]; + return *this; + } + void dump() const; private: @@ -105,6 +117,20 @@ inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) { return Res; } +inline GCNRegPressure operator+(const GCNRegPressure &P1, + const GCNRegPressure &P2) { + GCNRegPressure Sum = P1; + Sum += P2; + return Sum; +} + +inline GCNRegPressure operator-(const GCNRegPressure &P1, + const GCNRegPressure &P2) { + GCNRegPressure Diff = P1; + Diff -= P2; + return Diff; +} + class GCNRPTracker { public: using LiveRegSet = DenseMap<unsigned, LaneBitmask>; @@ -128,32 +154,55 @@ public: void clearMaxPressure() { MaxPressure.clear(); } - // returns MaxPressure, resetting it - decltype(MaxPressure) moveMaxPressure() { - auto Res = MaxPressure; - MaxPressure.clear(); - return Res; - } + GCNRegPressure getPressure() const { return CurPressure; } decltype(LiveRegs) moveLiveRegs() { return std::move(LiveRegs); } }; +GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, + const MachineRegisterInfo &MRI); + class GCNUpwardRPTracker : public GCNRPTracker { public: GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} - // reset tracker to the point just below MI - // filling live regs upon this point using LIS - void reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr); + // reset tracker and set live register set to the specified value. + void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_); + + // reset tracker at the specified slot index. + void reset(const MachineRegisterInfo &MRI, SlotIndex SI) { + reset(MRI, llvm::getLiveRegs(SI, LIS, MRI)); + } + + // reset tracker to the end of the MBB. + void reset(const MachineBasicBlock &MBB) { + reset(MBB.getParent()->getRegInfo(), + LIS.getSlotIndexes()->getMBBEndIdx(&MBB)); + } + + // reset tracker to the point just after MI (in program order). + void reset(const MachineInstr &MI) { + reset(MI.getMF()->getRegInfo(), LIS.getInstructionIndex(MI).getDeadSlot()); + } - // move to the state just above the MI + // move to the state just before the MI (in program order). void recede(const MachineInstr &MI); // checks whether the tracker's state after receding MI corresponds - // to reported by LIS + // to reported by LIS. bool isValid() const; + + const GCNRegPressure &getMaxPressure() const { return MaxPressure; } + + void resetMaxPressure() { MaxPressure = CurPressure; } + + GCNRegPressure getMaxPressureAndReset() { + GCNRegPressure RP = MaxPressure; + resetMaxPressure(); + return RP; + } }; class GCNDownwardRPTracker : public GCNRPTracker { @@ -167,6 +216,13 @@ public: MachineBasicBlock::const_iterator getNext() const { return NextMI; } + // Return MaxPressure and clear it. 
+ GCNRegPressure moveMaxPressure() { + auto Res = MaxPressure; + MaxPressure.clear(); + return Res; + } + // Reset tracker to the point before the MI // filling live regs upon this point using LIS. // Returns false if block is empty except debug values. @@ -196,8 +252,10 @@ LaneBitmask getLiveLaneMask(unsigned Reg, const LiveIntervals &LIS, const MachineRegisterInfo &MRI); -GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, - const LiveIntervals &LIS, +LaneBitmask getLiveLaneMask(const LiveInterval &LI, SlotIndex SI, + const MachineRegisterInfo &MRI); + +GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI); /// creates a map MachineInstr -> LiveRegSet @@ -275,7 +333,22 @@ Printable print(const GCNRPTracker::LiveRegSet &LiveRegs, Printable reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, const GCNRPTracker::LiveRegSet &TrackedL, - const TargetRegisterInfo *TRI); + const TargetRegisterInfo *TRI, StringRef Pfx = " "); + +struct GCNRegPressurePrinter : public MachineFunctionPass { + static char ID; + +public: + GCNRegPressurePrinter() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp index 99db7e4af9fd..019b64dd871e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp @@ -101,17 +101,16 @@ private: /// find new regclass such that: /// 1. It has subregs obtained by shifting each OldSubReg by RShift number /// of bits to the right. Every "shifted" subreg should have the same - /// SubRegRC. SubRegRC can be null, in this case it initialized using - /// getSubRegisterClass. If CoverSubregIdx is not zero it's a subreg that - /// "covers" all other subregs in pairs. Basically such subreg becomes a - /// whole register. + /// SubRegRC. If CoverSubregIdx is not zero it's a subreg that "covers" + /// all other subregs in pairs. Basically such subreg becomes a whole + /// register. /// 2. Resulting register class contains registers of minimal size but not /// less than RegNumBits. /// /// SubRegs is map of OldSubReg -> [SubRegRC, NewSubReg] and is used as in/out /// parameter: /// OldSubReg - input parameter, - /// SubRegRC - in/out, should be changed for unknown regclass, + /// SubRegRC - input parameter (cannot be null), /// NewSubReg - output, contains shifted subregs on return. const TargetRegisterClass * getRegClassWithShiftedSubregs(const TargetRegisterClass *RC, unsigned RShift, @@ -228,19 +227,7 @@ GCNRewritePartialRegUses::getRegClassWithShiftedSubregs( BitVector ClassMask(getAllocatableAndAlignedRegClassMask(RCAlign)); for (auto &[OldSubReg, SRI] : SubRegs) { auto &[SubRegRC, NewSubReg] = SRI; - - // Register class may be unknown, for example: - // undef %0.sub4:sgpr_1024 = S_MOV_B32 01 - // %0.sub5:sgpr_1024 = S_MOV_B32 02 - // %1:vreg_64 = COPY %0.sub4_sub5 - // Register classes for subregs 'sub4' and 'sub5' are known from the - // description of destination operand of S_MOV_B32 instruction but the - // class for the subreg 'sub4_sub5' isn't specified by the COPY instruction. 
- if (!SubRegRC) - SubRegRC = TRI->getSubRegisterClass(RC, OldSubReg); - - if (!SubRegRC) - return nullptr; + assert(SubRegRC); LLVM_DEBUG(dbgs() << " " << TRI->getSubRegIndexName(OldSubReg) << ':' << TRI->getRegClassName(SubRegRC) @@ -248,6 +235,8 @@ GCNRewritePartialRegUses::getRegClassWithShiftedSubregs( << " -> "); if (OldSubReg == CoverSubregIdx) { + // Covering subreg will become a full register, RC should be allocatable. + assert(SubRegRC->isAllocatable()); NewSubReg = AMDGPU::NoSubRegister; LLVM_DEBUG(dbgs() << "whole reg"); } else { @@ -421,33 +410,42 @@ GCNRewritePartialRegUses::getOperandRegClass(MachineOperand &MO) const { bool GCNRewritePartialRegUses::rewriteReg(Register Reg) const { auto Range = MRI->reg_nodbg_operands(Reg); - if (Range.begin() == Range.end()) + if (Range.empty() || any_of(Range, [](MachineOperand &MO) { + return MO.getSubReg() == AMDGPU::NoSubRegister; // Whole reg used. [1] + })) return false; - for (MachineOperand &MO : Range) { - if (MO.getSubReg() == AMDGPU::NoSubRegister) // Whole reg used, quit. - return false; - } - auto *RC = MRI->getRegClass(Reg); LLVM_DEBUG(dbgs() << "Try to rewrite partial reg " << printReg(Reg, TRI) << ':' << TRI->getRegClassName(RC) << '\n'); - // Collect used subregs and constrained reg classes infered from instruction + // Collect used subregs and their reg classes infered from instruction // operands. SubRegMap SubRegs; - for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) { - assert(MO.getSubReg() != AMDGPU::NoSubRegister); - auto *OpDescRC = getOperandRegClass(MO); - const auto [I, Inserted] = SubRegs.try_emplace(MO.getSubReg(), OpDescRC); - if (!Inserted && OpDescRC) { - SubRegInfo &SRI = I->second; - SRI.RC = SRI.RC ? TRI->getCommonSubClass(SRI.RC, OpDescRC) : OpDescRC; - if (!SRI.RC) { - LLVM_DEBUG(dbgs() << " Couldn't find common target regclass\n"); - return false; + for (MachineOperand &MO : Range) { + const unsigned SubReg = MO.getSubReg(); + assert(SubReg != AMDGPU::NoSubRegister); // Due to [1]. 
+ LLVM_DEBUG(dbgs() << " " << TRI->getSubRegIndexName(SubReg) << ':'); + + const auto [I, Inserted] = SubRegs.try_emplace(SubReg); + const TargetRegisterClass *&SubRegRC = I->second.RC; + + if (Inserted) + SubRegRC = TRI->getSubRegisterClass(RC, SubReg); + + if (SubRegRC) { + if (const TargetRegisterClass *OpDescRC = getOperandRegClass(MO)) { + LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << " & " + << TRI->getRegClassName(OpDescRC) << " = "); + SubRegRC = TRI->getCommonSubClass(SubRegRC, OpDescRC); } } + + if (!SubRegRC) { + LLVM_DEBUG(dbgs() << "couldn't find target regclass\n"); + return false; + } + LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << '\n'); } auto *NewRC = getMinSizeReg(RC, SubRegs); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 994cfea1fd7d..342d518f38bf 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -32,12 +32,18 @@ using namespace llvm; -static cl::opt<bool> - DisableUnclusterHighRP("amdgpu-disable-unclustred-high-rp-reschedule", - cl::Hidden, - cl::desc("Disable unclustred high register pressure " - "reduction scheduling stage."), - cl::init(false)); +static cl::opt<bool> DisableUnclusterHighRP( + "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden, + cl::desc("Disable unclustered high register pressure " + "reduction scheduling stage."), + cl::init(false)); + +static cl::opt<bool> DisableClusteredLowOccupancy( + "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden, + cl::desc("Disable clustered low occupancy " + "rescheduling for ILP scheduling stage."), + cl::init(false)); + static cl::opt<unsigned> ScheduleMetricBias( "amdgpu-schedule-metric-bias", cl::Hidden, cl::desc( @@ -707,7 +713,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() { return false; SavedMutations.swap(DAG.Mutations); - DAG.addMutation(createIGroupLPDAGMutation()); + DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false)); InitialOccupancy = DAG.MinOccupancy; // Aggressivly try to reduce register pressure in the unclustered high RP @@ -727,6 +733,9 @@ bool UnclusteredHighRPStage::initGCNSchedStage() { } bool ClusteredLowOccStage::initGCNSchedStage() { + if (DisableClusteredLowOccupancy) + return false; + if (!GCNSchedStage::initGCNSchedStage()) return false; @@ -844,7 +853,9 @@ bool GCNSchedStage::initGCNRegion() { StageID != GCNSchedStageID::UnclusteredHighRPReschedule) { SavedMutations.clear(); SavedMutations.swap(DAG.Mutations); - DAG.addMutation(createIGroupLPDAGMutation()); + bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule || + StageID == GCNSchedStageID::ILPInitialSchedule; + DAG.addMutation(createIGroupLPDAGMutation(/*IsReentry=*/!IsInitialStage)); } return true; @@ -1116,7 +1127,7 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { } bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { - // If RP is not reduced in the unclustred reschedule stage, revert to the + // If RP is not reduced in the unclustered reschedule stage, revert to the // old schedule. 
if ((WavesAfter <= PressureBefore.getOccupancy(ST) && mayCauseSpilling(WavesAfter)) || @@ -1558,7 +1569,7 @@ void GCNPostScheduleDAGMILive::schedule() { if (HasIGLPInstrs) { SavedMutations.clear(); SavedMutations.swap(Mutations); - addMutation(createIGroupLPDAGMutation()); + addMutation(createIGroupLPDAGMutation(/*IsReentry=*/true)); } ScheduleDAGMI::schedule(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h index ef5470df876d..91a709303269 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -22,6 +22,7 @@ #include "SIInstrInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#include "llvm/Support/ErrorHandling.h" #define GET_SUBTARGETINFO_HEADER #include "AMDGPUGenSubtargetInfo.inc" @@ -77,6 +78,7 @@ protected: bool UnalignedAccessMode = false; bool HasApertureRegs = false; bool SupportsXNACK = false; + bool KernargPreload = false; // This should not be used directly. 'TargetID' tracks the dynamic settings // for XNACK. @@ -105,6 +107,7 @@ protected: bool GFX940Insts = false; bool GFX10Insts = false; bool GFX11Insts = false; + bool GFX12Insts = false; bool GFX10_3Insts = false; bool GFX7GFX8GFX9Insts = false; bool SGPRInitBug = false; @@ -116,6 +119,7 @@ protected: bool HasFmaMixInsts = false; bool HasMovrel = false; bool HasVGPRIndexMode = false; + bool HasScalarDwordx3Loads = false; bool HasScalarStores = false; bool HasScalarAtomics = false; bool HasSDWAOmod = false; @@ -125,7 +129,8 @@ protected: bool HasSDWAOutModsVOPC = false; bool HasDPP = false; bool HasDPP8 = false; - bool Has64BitDPP = false; + bool HasDPALU_DPP = false; + bool HasDPPSrc1SGPR = false; bool HasPackedFP32Ops = false; bool HasImageInsts = false; bool HasExtendedImageInsts = false; @@ -157,6 +162,7 @@ protected: bool HasAtomicFaddNoRtnInsts = false; bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false; bool HasAtomicBufferGlobalPkAddF16Insts = false; + bool HasAtomicCSubNoRtnInsts = false; bool HasAtomicGlobalPkAddBF16Inst = false; bool HasFlatAtomicFaddF32Inst = false; bool SupportsSRAMECC = false; @@ -180,6 +186,8 @@ protected: bool HasArchitectedFlatScratch = false; bool EnableFlatScratch = false; bool HasArchitectedSGPRs = false; + bool HasGDS = false; + bool HasGWS = false; bool AddNoCarryInsts = false; bool HasUnpackedD16VMem = false; bool LDSMisalignedBug = false; @@ -188,6 +196,10 @@ protected: bool UnalignedDSAccess = false; bool HasPackedTID = false; bool ScalarizeGlobal = false; + bool HasSALUFloatInsts = false; + bool HasVGPRSingleUseHintInsts = false; + bool HasPseudoScalarTrans = false; + bool HasRestrictedSOffset = false; bool HasVcmpxPermlaneHazard = false; bool HasVMEMtoScalarWriteHazard = false; @@ -201,6 +213,7 @@ protected: bool HasFlatSegmentOffsetBug = false; bool HasImageStoreD16Bug = false; bool HasImageGather4D16Bug = false; + bool HasMSAALoadDstSelBug = false; bool HasGFX11FullVGPRs = false; bool HasMADIntraFwdBug = false; bool HasVOPDInsts = false; @@ -667,6 +680,8 @@ public: return AddNoCarryInsts; } + bool hasScalarAddSub64() const { return getGeneration() >= GFX12; } + bool hasUnpackedD16VMem() const { return HasUnpackedD16VMem; } @@ -818,6 +833,11 @@ public: bool hasInstPrefetch() const { return getGeneration() >= GFX10; } + bool hasPrefetch() const { return GFX12Insts; } + + // Has s_cmpk_* instructions. 
+ bool hasSCmpK() const { return getGeneration() < GFX12; } + // Scratch is allocated in 256 dword per wave blocks for the entire // wavefront. When viewed from the perspective of an arbitrary workitem, this // is 4-byte aligned. @@ -853,7 +873,7 @@ public: unsigned NumRegionInstrs) const override; unsigned getMaxNumUserSGPRs() const { - return 16; + return AMDGPU::getMaxNumUserSGPRs(*this); } bool hasSMemRealTime() const { @@ -874,6 +894,8 @@ public: return getGeneration() >= VOLCANIC_ISLANDS; } + bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; } + bool hasScalarStores() const { return HasScalarStores; } @@ -906,14 +928,21 @@ public: return HasDPP8; } - bool has64BitDPP() const { - return Has64BitDPP; + bool hasDPALU_DPP() const { + return HasDPALU_DPP; } + bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; } + bool hasPackedFP32Ops() const { return HasPackedFP32Ops; } + // Has V_PK_MOV_B32 opcode + bool hasPkMovB32() const { + return GFX90AInsts; + } + bool hasFmaakFmamkF32Insts() const { return getGeneration() >= GFX10 || hasGFX940Insts(); } @@ -944,11 +973,15 @@ public: bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; } + bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; } + bool hasNSAEncoding() const { return HasNSAEncoding; } bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; } - unsigned getNSAMaxSize() const { return AMDGPU::getNSAMaxSize(*this); } + unsigned getNSAMaxSize(bool HasSampler = false) const { + return AMDGPU::getNSAMaxSize(*this, HasSampler); + } bool hasGFX10_AEncoding() const { return GFX10_AEncoding; @@ -1127,6 +1160,14 @@ public: // hasGFX90AInsts is also true. bool hasGFX940Insts() const { return GFX940Insts; } + bool hasSALUFloatInsts() const { return HasSALUFloatInsts; } + + bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; } + + bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; } + + bool hasRestrictedSOffset() const { return HasRestrictedSOffset; } + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; @@ -1155,6 +1196,12 @@ public: /// \returns true if the architected SGPRs are enabled. bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; } + /// \returns true if Global Data Share is supported. + bool hasGDS() const { return HasGDS; } + + /// \returns true if Global Wave Sync is supported. + bool hasGWS() const { return HasGWS; } + /// \returns true if the machine has merged shaders in which s0-s7 are /// reserved by the hardware and user SGPRs start at s8 bool hasMergedShaders() const { @@ -1164,6 +1211,37 @@ public: // \returns true if the target supports the pre-NGG legacy geometry path. bool hasLegacyGeometry() const { return getGeneration() < GFX11; } + // \returns true if preloading kernel arguments is supported. + bool hasKernargPreload() const { return KernargPreload; } + + // \returns true if we need to generate backwards compatible code when + // preloading kernel arguments. + bool needsKernargPreloadBackwardsCompatibility() const { + return hasKernargPreload() && !hasGFX940Insts(); + } + + // \returns true if the target has split barriers feature + bool hasSplitBarriers() const { return getGeneration() >= GFX12; } + + // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable. + bool hasCvtFP8VOP1Bug() const { return true; } + + // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a + // no-return form. 
+ bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; } + + // \returns true if the target has DX10_CLAMP kernel descriptor mode bit + bool hasDX10ClampMode() const { return getGeneration() < GFX12; } + + // \returns true if the target has IEEE kernel descriptor mode bit + bool hasIEEEMode() const { return getGeneration() < GFX12; } + + // \returns true if the target has IEEE fminimum/fmaximum instructions + bool hasIEEEMinMax() const { return getGeneration() >= GFX12; } + + // \returns true if the target has WG_RR_MODE kernel descriptor mode bit + bool hasRrWGMode() const { return getGeneration() >= GFX12; } + /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { return AMDGPU::IsaInfo::getSGPRAllocGranule(this); @@ -1362,6 +1440,91 @@ public: } }; +class GCNUserSGPRUsageInfo { +public: + bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; } + + bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; } + + bool hasDispatchPtr() const { return DispatchPtr; } + + bool hasQueuePtr() const { return QueuePtr; } + + bool hasKernargSegmentPtr() const { return KernargSegmentPtr; } + + bool hasDispatchID() const { return DispatchID; } + + bool hasFlatScratchInit() const { return FlatScratchInit; } + + unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; } + + unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; } + + unsigned getNumFreeUserSGPRs(); + + void allocKernargPreloadSGPRs(unsigned NumSGPRs); + + enum UserSGPRID : unsigned { + ImplicitBufferPtrID = 0, + PrivateSegmentBufferID = 1, + DispatchPtrID = 2, + QueuePtrID = 3, + KernargSegmentPtrID = 4, + DispatchIdID = 5, + FlatScratchInitID = 6, + PrivateSegmentSizeID = 7 + }; + + // Returns the size in number of SGPRs for preload user SGPR field. + static unsigned getNumUserSGPRForField(UserSGPRID ID) { + switch (ID) { + case ImplicitBufferPtrID: + return 2; + case PrivateSegmentBufferID: + return 4; + case DispatchPtrID: + return 2; + case QueuePtrID: + return 2; + case KernargSegmentPtrID: + return 2; + case DispatchIdID: + return 2; + case FlatScratchInitID: + return 2; + case PrivateSegmentSizeID: + return 1; + } + llvm_unreachable("Unknown UserSGPRID."); + } + + GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST); + +private: + const GCNSubtarget &ST; + + // Private memory buffer + // Compute directly in sgpr[0:1] + // Other shaders indirect 64-bits at sgpr[0:1] + bool ImplicitBufferPtr = false; + + bool PrivateSegmentBuffer = false; + + bool DispatchPtr = false; + + bool QueuePtr = false; + + bool KernargSegmentPtr = false; + + bool DispatchID = false; + + bool FlatScratchInit = false; + + unsigned NumKernargPreloadSGPRs = 0; + + unsigned NumUsedUserSGPRs = 0; +}; + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp index 29c9b9ccf276..33c208495c50 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp @@ -103,7 +103,13 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, return false; if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2) return false; - if (InstInfo.hasInvalidOperand(getVRegIdx)) + + // On GFX12 if both OpX and OpY are V_MOV_B32 then OPY uses SRC2 source-cache. 
+ bool SkipSrc = ST.getGeneration() >= AMDGPUSubtarget::GFX12 && + FirstMI.getOpcode() == AMDGPU::V_MOV_B32_e32 && + SecondMI.getOpcode() == AMDGPU::V_MOV_B32_e32; + + if (InstInfo.hasInvalidOperand(getVRegIdx, SkipSrc)) return false; LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI @@ -142,10 +148,10 @@ namespace { /// be turned into VOPD instructions /// Greedily pairs instruction candidates. O(n^2) algorithm. struct VOPDPairingMutation : ScheduleDAGMutation { - ShouldSchedulePredTy shouldScheduleAdjacent; // NOLINT: function pointer + MacroFusionPredTy shouldScheduleAdjacent; // NOLINT: function pointer VOPDPairingMutation( - ShouldSchedulePredTy shouldScheduleAdjacent) // NOLINT: function pointer + MacroFusionPredTy shouldScheduleAdjacent) // NOLINT: function pointer : shouldScheduleAdjacent(shouldScheduleAdjacent) {} void apply(ScheduleDAGInstrs *DAG) override { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index a1f8be403c44..c8ce1903d315 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -13,7 +13,7 @@ #include "AMDGPUCustomBehaviour.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIInstrInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "TargetInfo/AMDGPUTargetInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/WithColor.h" @@ -25,10 +25,12 @@ void AMDGPUInstrPostProcess::postProcessInstruction( std::unique_ptr<Instruction> &Inst, const MCInst &MCI) { switch (MCI.getOpcode()) { case AMDGPU::S_WAITCNT: + case AMDGPU::S_WAITCNT_soft: case AMDGPU::S_WAITCNT_EXPCNT: case AMDGPU::S_WAITCNT_LGKMCNT: case AMDGPU::S_WAITCNT_VMCNT: case AMDGPU::S_WAITCNT_VSCNT: + case AMDGPU::S_WAITCNT_VSCNT_soft: case AMDGPU::S_WAITCNT_EXPCNT_gfx10: case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: case AMDGPU::S_WAITCNT_VMCNT_gfx10: @@ -77,10 +79,12 @@ unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst, default: return 0; case AMDGPU::S_WAITCNT: // This instruction + case AMDGPU::S_WAITCNT_soft: case AMDGPU::S_WAITCNT_EXPCNT: case AMDGPU::S_WAITCNT_LGKMCNT: case AMDGPU::S_WAITCNT_VMCNT: - case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo. + case AMDGPU::S_WAITCNT_VSCNT: + case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo. 
case AMDGPU::S_WAITCNT_EXPCNT_gfx10: case AMDGPU::S_WAITCNT_LGKMCNT_gfx10: case AMDGPU::S_WAITCNT_VMCNT_gfx10: @@ -317,13 +321,15 @@ bool AMDGPUCustomBehaviour::hasModifiersSet( return true; } +// taken from SIInstrInfo::isGWS() +bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const { + const MCInstrDesc &MCID = MCII.get(Opcode); + return MCID.TSFlags & SIInstrFlags::GWS; +} + // taken from SIInstrInfo::isAlwaysGDS() bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const { - return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT || - Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR || - Opcode == AMDGPU::DS_GWS_SEMA_P || - Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || - Opcode == AMDGPU::DS_GWS_BARRIER; + return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode); } } // namespace mca diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h index cb1436d319c9..3a231758887b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h @@ -68,6 +68,8 @@ class AMDGPUCustomBehaviour : public CustomBehaviour { bool hasModifiersSet(const std::unique_ptr<Instruction> &Inst, unsigned OpName) const; /// Helper function used in generateWaitCntInfo() + bool isGWS(uint16_t Opcode) const; + /// Helper function used in generateWaitCntInfo() bool isAlwaysGDS(uint16_t Opcode) const; /// Helper function used in generateWaitCntInfo() bool isVMEM(const MCInstrDesc &MCID); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 44109b9d2919..f91f36ed851b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -28,7 +28,7 @@ namespace { class AMDGPUAsmBackend : public MCAsmBackend { public: - AMDGPUAsmBackend(const Target &T) : MCAsmBackend(support::little) {} + AMDGPUAsmBackend(const Target &T) : MCAsmBackend(llvm::endianness::little) {} unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; @@ -53,7 +53,8 @@ public: std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target) override; + const MCValue &Target, + const MCSubtargetInfo *STI) override; }; } //End anonymous namespace @@ -185,12 +186,15 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); return Infos[Kind - FirstTargetFixupKind]; } bool AMDGPUAsmBackend::shouldForceRelocation(const MCAssembler &, const MCFixup &Fixup, - const MCValue &) { + const MCValue &, + const MCSubtargetInfo *STI) { return Fixup.getKind() >= FirstLiteralRelocationKind; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 3f188478ca8b..58eed81e0755 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -63,6 +63,10 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AMDGPU_REL32_HI; case MCSymbolRefExpr::VK_AMDGPU_REL64: return ELF::R_AMDGPU_REL64; + case MCSymbolRefExpr::VK_AMDGPU_ABS32_LO: + return ELF::R_AMDGPU_ABS32_LO; + case MCSymbolRefExpr::VK_AMDGPU_ABS32_HI: + return ELF::R_AMDGPU_ABS32_HI; } MCFixupKind Kind = Fixup.getKind(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index ad55c73b22ea..edc244db613d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -10,13 +10,13 @@ #include "AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" -#include "SIRegisterInfo.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/TargetParser/TargetParser.h" @@ -24,12 +24,6 @@ using namespace llvm; using namespace llvm::AMDGPU; -static cl::opt<bool> Keep16BitSuffixes( - "amdgpu-keep-16-bit-reg-suffixes", - cl::desc("Keep .l and .h suffixes in asm for debugging purposes"), - cl::init(false), - cl::ReallyHidden); - void AMDGPUInstPrinter::printRegName(raw_ostream &OS, MCRegister Reg) const { // FIXME: The current implementation of // AsmParser::parseRegisterOrRegisterNumber in MC implies we either emit this @@ -103,28 +97,36 @@ void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - uint16_t Imm = MI->getOperand(OpNo).getImm(); + uint32_t Imm = MI->getOperand(OpNo).getImm(); if (Imm != 0) { O << " offset:"; - printU16ImmDecOperand(MI, OpNo, O); + + // GFX12 uses a 24-bit signed offset for VBUFFER. 
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + bool IsVBuffer = Desc.TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF); + if (AMDGPU::isGFX12(STI) && IsVBuffer) + O << formatDec(SignExtend32<24>(Imm)); + else + printU16ImmDecOperand(MI, OpNo, O); } } void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - uint16_t Imm = MI->getOperand(OpNo).getImm(); + uint32_t Imm = MI->getOperand(OpNo).getImm(); if (Imm != 0) { O << " offset:"; const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - bool IsFlatSeg = !(Desc.TSFlags & - (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch)); + bool AllowNegative = (Desc.TSFlags & (SIInstrFlags::FlatGlobal | + SIInstrFlags::FlatScratch)) || + AMDGPU::isGFX12(STI); - if (IsFlatSeg) // Unsigned offset - printU16ImmDecOperand(MI, OpNo, O); - else // Signed offset + if (AllowNegative) // Signed offset O << formatDec(SignExtend32(Imm, AMDGPU::getNumFlatOffsetBits(STI))); + else // Unsigned offset + printU16ImmDecOperand(MI, OpNo, O); } } @@ -174,6 +176,17 @@ void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { auto Imm = MI->getOperand(OpNo).getImm(); + + if (AMDGPU::isGFX12Plus(STI)) { + const int64_t TH = Imm & CPol::TH; + const int64_t Scope = Imm & CPol::SCOPE; + + printTH(MI, TH, Scope, O); + printScope(Scope, O); + + return; + } + if (Imm & CPol::GLC) O << ((AMDGPU::isGFX940(STI) && !(MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SMRD)) ? " sc0" @@ -188,6 +201,89 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo, O << " /* unexpected cache policy bit */"; } +void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope, + raw_ostream &O) { + // For th = 0 do not print this field + if (TH == 0) + return; + + const unsigned Opcode = MI->getOpcode(); + const MCInstrDesc &TID = MII.get(Opcode); + bool IsStore = TID.mayStore(); + bool IsAtomic = + TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet); + + O << " th:"; + + if (IsAtomic) { + O << "TH_ATOMIC_"; + if (TH & AMDGPU::CPol::TH_ATOMIC_CASCADE) { + if (Scope >= AMDGPU::CPol::SCOPE_DEV) + O << "CASCADE" << (TH & AMDGPU::CPol::TH_ATOMIC_NT ? "_NT" : "_RT"); + else + O << formatHex(TH); + } else if (TH & AMDGPU::CPol::TH_ATOMIC_NT) + O << "NT" << (TH & AMDGPU::CPol::TH_ATOMIC_RETURN ? "_RETURN" : ""); + else if (TH & AMDGPU::CPol::TH_ATOMIC_RETURN) + O << "RETURN"; + else + O << formatHex(TH); + } else { + if (!IsStore && TH == AMDGPU::CPol::TH_RESERVED) + O << formatHex(TH); + else { + // This will default to printing load variants when neither MayStore nor + // MayLoad flag is present which is the case with instructions like + // image_get_resinfo. + O << (IsStore ? "TH_STORE_" : "TH_LOAD_"); + switch (TH) { + case AMDGPU::CPol::TH_NT: + O << "NT"; + break; + case AMDGPU::CPol::TH_HT: + O << "HT"; + break; + case AMDGPU::CPol::TH_BYPASS: // or LU or RT_WB + O << (Scope == AMDGPU::CPol::SCOPE_SYS ? "BYPASS" + : (IsStore ? 
"RT_WB" : "LU")); + break; + case AMDGPU::CPol::TH_NT_RT: + O << "NT_RT"; + break; + case AMDGPU::CPol::TH_RT_NT: + O << "RT_NT"; + break; + case AMDGPU::CPol::TH_NT_HT: + O << "NT_HT"; + break; + case AMDGPU::CPol::TH_NT_WB: + O << "NT_WB"; + break; + default: + llvm_unreachable("unexpected th value"); + } + } + } +} + +void AMDGPUInstPrinter::printScope(int64_t Scope, raw_ostream &O) { + if (Scope == CPol::SCOPE_CU) + return; + + O << " scope:"; + + if (Scope == CPol::SCOPE_SE) + O << "SCOPE_SE"; + else if (Scope == CPol::SCOPE_DEV) + O << "SCOPE_DEV"; + else if (Scope == CPol::SCOPE_SYS) + O << "SCOPE_SYS"; + else + llvm_unreachable("unexpected scope policy value"); + + return; +} + void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { if (MI->getOperand(OpNo).getImm()) { @@ -278,12 +374,7 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, } #endif - StringRef RegName(getRegisterName(RegNo)); - if (!Keep16BitSuffixes) - if (!RegName.consume_back(".l")) - RegName.consume_back(".h"); - - O << RegName; + O << getRegisterName(RegNo); } void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, @@ -333,6 +424,15 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx11: case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx11: case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx11: + case AMDGPU::V_ADD_CO_CI_U32_e32_gfx12: + case AMDGPU::V_SUB_CO_CI_U32_e32_gfx12: + case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx12: + case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx12: + case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx12: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx12: + case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx12: + case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx12: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx12: printDefaultVccOperand(false, STI, O); break; } @@ -437,7 +537,7 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, - raw_ostream &O) { + raw_ostream &O, bool IsFP) { int64_t SImm = static_cast<int64_t>(Imm); if (SImm >= -16 && SImm <= 64) { O << SImm; @@ -465,7 +565,10 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, else if (Imm == 0x3fc45f306dc9c882 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494309189532"; - else { + else if (IsFP) { + assert(AMDGPU::isValid32BitLiteral(Imm, true)); + O << formatHex(static_cast<uint64_t>(Hi_32(Imm))); + } else { assert(isUInt<32>(Imm) || isInt<32>(Imm)); // In rare situations, we will have a 32-bit literal in a 64-bit @@ -532,21 +635,15 @@ void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand, void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - uint8_t Imm = MI->getOperand(OpNo).getImm(); - if (Imm != 0) { - O << " wait_vdst:"; - printU4ImmDecOperand(MI, OpNo, O); - } + O << " wait_vdst:"; + printU4ImmDecOperand(MI, OpNo, O); } void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - uint8_t Imm = MI->getOperand(OpNo).getImm(); - if (Imm != 0) { - O << " wait_exp:"; - printU4ImmDecOperand(MI, OpNo, O); - } + O << " wait_exp:"; + printU4ImmDecOperand(MI, OpNo, O); } bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc, @@ -619,14 +716,17 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: case 
MCOI::OPERAND_IMMEDIATE: + case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: printImmediate32(Op.getImm(), STI, O); break; case AMDGPU::OPERAND_REG_IMM_INT64: - case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: + printImmediate64(Op.getImm(), STI, O, false); + break; + case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - printImmediate64(Op.getImm(), STI, O); + printImmediate64(Op.getImm(), STI, O, true); break; case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: @@ -688,7 +788,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, if (RCBits == 32) printImmediate32(llvm::bit_cast<uint32_t>((float)Value), STI, O); else if (RCBits == 64) - printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O); + printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O, true); else llvm_unreachable("Invalid register class size"); } @@ -725,6 +825,18 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx11: case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx11: case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx11: + case AMDGPU::V_CNDMASK_B32_e32_gfx12: + case AMDGPU::V_ADD_CO_CI_U32_e32_gfx12: + case AMDGPU::V_SUB_CO_CI_U32_e32_gfx12: + case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx12: + case AMDGPU::V_CNDMASK_B32_dpp_gfx12: + case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx12: + case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx12: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx12: + case AMDGPU::V_CNDMASK_B32_dpp8_gfx12: + case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx12: + case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx12: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx12: case AMDGPU::V_CNDMASK_B32_e32_gfx6_gfx7: case AMDGPU::V_CNDMASK_B32_e32_vi: @@ -846,13 +958,9 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, unsigned Imm = MI->getOperand(OpNo).getImm(); const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src0); - if (Src0Idx >= 0 && - Desc.operands()[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID && - !AMDGPU::isLegal64BitDPPControl(Imm)) { - O << " /* 64 bit dpp only supports row_newbcast */"; + if (!AMDGPU::isLegalDPALU_DPPControl(Imm) && AMDGPU::isDPALU_DPP(Desc)) { + O << " /* DP ALU dpp only supports row_newbcast */"; return; } else if (Imm <= DppCtrl::QUAD_PERM_LAST) { O << "quad_perm:["; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 3b14faab136b..95c26de6299e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -66,6 +66,8 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printCPol(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printTH(const MCInst *MI, int64_t TH, int64_t Scope, raw_ostream &O); + void printScope(int64_t Scope, raw_ostream &O); void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -91,7 +93,7 @@ private: void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, - raw_ostream &O); + raw_ostream &O, bool IsFP); void printOperand(const MCInst *MI, 
unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printRegularOperand(const MCInst *MI, unsigned OpNo, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index 5e77a8caa04e..b403d69d9ff1 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -49,6 +49,14 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + void getMachineOpValueT16(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + void getMachineOpValueT16Lo128(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + /// Use a fixup to encode the simm16 field for SOPP branch /// instructions. void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, @@ -254,6 +262,7 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO, case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: + case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: return getLit32Encoding(static_cast<uint32_t>(Imm), STI); case AMDGPU::OPERAND_REG_IMM_INT64: @@ -345,7 +354,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, // However, dst is encoded as EXEC for compatibility with SP3. if (AMDGPU::isGFX10Plus(STI) && isVCMPX64(Desc)) { assert((Encoding & 0xFF) == 0); - Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO); + Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO) & + AMDGPU::HWEncoding::REG_IDX_MASK; } for (unsigned i = 0; i < bytes; i++) { @@ -403,7 +413,10 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, } else if (!Op.isExpr()) // Exprs will be replaced with a fixup value. llvm_unreachable("Must be immediate or expr"); - support::endian::write<uint32_t>(CB, Imm, support::endianness::little); + if (Desc.operands()[i].OperandType == AMDGPU::OPERAND_REG_IMM_FP64) + Imm = Hi_32(Imm); + + support::endian::write<uint32_t>(CB, Imm, llvm::endianness::little); // Only one literal value allowed break; @@ -488,11 +501,14 @@ void AMDGPUMCCodeEmitter::getAVOperandEncoding( const MCInst &MI, unsigned OpNo, APInt &Op, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { unsigned Reg = MI.getOperand(OpNo).getReg(); - uint64_t Enc = MRI.getEncodingValue(Reg); + unsigned Enc = MRI.getEncodingValue(Reg); + unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; + bool IsVGPROrAGPR = Enc & AMDGPU::HWEncoding::IS_VGPR_OR_AGPR; // VGPR and AGPR have the same encoding, but SrcA and SrcB operands of mfma // instructions use acc[0:1] modifier bits to distinguish. These bits are // encoded as a virtual 9th bit of the register for these operands. 
+ bool IsAGPR = false; if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_96RegClassID).contains(Reg) || @@ -507,9 +523,9 @@ void AMDGPUMCCodeEmitter::getAVOperandEncoding( MRI.getRegClass(AMDGPU::AReg_384RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg)) - Enc |= 512; + IsAGPR = true; - Op = Enc; + Op = Idx | (IsVGPROrAGPR << 8) | (IsAGPR << 9); } static bool needsPCRel(const MCExpr *Expr) { @@ -540,13 +556,38 @@ void AMDGPUMCCodeEmitter::getMachineOpValue(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { if (MO.isReg()){ - Op = MRI.getEncodingValue(MO.getReg()); + unsigned Enc = MRI.getEncodingValue(MO.getReg()); + unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; + bool IsVGPR = Enc & AMDGPU::HWEncoding::IS_VGPR_OR_AGPR; + Op = Idx | (IsVGPR << 8); return; } unsigned OpNo = &MO - MI.begin(); getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI); } +void AMDGPUMCCodeEmitter::getMachineOpValueT16( + const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { + llvm_unreachable("TODO: Implement getMachineOpValueT16()."); +} + +void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128( + const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg()) { + uint16_t Encoding = MRI.getEncodingValue(MO.getReg()); + unsigned RegIdx = Encoding & AMDGPU::HWEncoding::REG_IDX_MASK; + bool IsHi = Encoding & AMDGPU::HWEncoding::IS_HI; + bool IsVGPR = Encoding & AMDGPU::HWEncoding::IS_VGPR_OR_AGPR; + assert((!IsVGPR || isUInt<7>(RegIdx)) && "VGPR0-VGPR127 expected!"); + Op = (IsVGPR ? 0x100 : 0) | (IsHi ? 
0x80 : 0) | RegIdx; + return; + } + getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI); +} + void AMDGPUMCCodeEmitter::getMachineOpValueCommon( const MCInst &MI, const MCOperand &MO, unsigned OpNo, APInt &Op, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 1bd3cdc67800..a855cf585205 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -66,8 +66,8 @@ bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) { StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { AMDGPU::GPUKind AK; + // clang-format off switch (ElfMach) { - default: llvm_unreachable("Unhandled ELF::EF_AMDGPU type"); case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break; case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break; case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break; @@ -126,8 +126,12 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: AK = GK_GFX1103; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150: AK = GK_GFX1150; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: AK = GK_GFX1151; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: AK = GK_GFX1200; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: AK = GK_GFX1201; break; case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; + default: AK = GK_NONE; break; } + // clang-format on StringRef GPUName = getArchNameAMDGCN(AK); if (GPUName != "") @@ -140,6 +144,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { if (AK == AMDGPU::GPUKind::GK_NONE) AK = parseArchR600(GPU); + // clang-format off switch (AK) { case GK_R600: return ELF::EF_AMDGPU_MACH_R600_R600; case GK_R630: return ELF::EF_AMDGPU_MACH_R600_R630; @@ -199,8 +204,11 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX1103: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103; case GK_GFX1150: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150; case GK_GFX1151: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151; + case GK_GFX1200: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200; + case GK_GFX1201: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201; case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE; } + // clang-format on llvm_unreachable("unknown GPU"); } @@ -368,6 +376,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + if (hasKernargPreload(STI)) { + PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_preload_length ", KD, + kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_LENGTH); + PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_preload_offset ", KD, + kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_OFFSET); + } PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); @@ -418,9 +432,6 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( switch (CodeObjectVersion) { default: break; - case AMDGPU::AMDHSA_COV2: - break; - case AMDGPU::AMDHSA_COV3: case AMDGPU::AMDHSA_COV4: case AMDGPU::AMDHSA_COV5: if (getTargetID()->isXnackSupported()) @@ -440,16 +451,16 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, 
".amdhsa_float_denorm_mode_16_64", KD, compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64); - PRINT_FIELD(OS, ".amdhsa_dx10_clamp", KD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP); - PRINT_FIELD(OS, ".amdhsa_ieee_mode", KD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE); + if (IVersion.Major < 12) { + PRINT_FIELD(OS, ".amdhsa_dx10_clamp", KD, compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP); + PRINT_FIELD(OS, ".amdhsa_ieee_mode", KD, compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE); + } if (IVersion.Major >= 9) PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD, compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL); + amdhsa::COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL); if (AMDGPU::isGFX90A(STI)) PRINT_FIELD(OS, ".amdhsa_tg_split", KD, compute_pgm_rsrc3, @@ -457,16 +468,19 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( if (IVersion.Major >= 10) { PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD, compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE); + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE); PRINT_FIELD(OS, ".amdhsa_memory_ordered", KD, compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED); + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED); PRINT_FIELD(OS, ".amdhsa_forward_progress", KD, compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_FWD_PROGRESS); + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS); PRINT_FIELD(OS, ".amdhsa_shared_vgpr_count", KD, compute_pgm_rsrc3, amdhsa::COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT); } + if (IVersion.Major >= 12) + PRINT_FIELD(OS, ".amdhsa_round_robin_scheduling", KD, compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN); PRINT_FIELD( OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, compute_pgm_rsrc2, @@ -539,7 +553,7 @@ void AMDGPUTargetELFStreamer::EmitNote( unsigned NoteFlags = 0; // TODO Apparently, this is currently needed for OpenCL as mentioned in // https://reviews.llvm.org/D74995 - if (STI.getTargetTriple().getOS() == Triple::AMDHSA) + if (isHsaAbi(STI)) NoteFlags = ELF::SHF_ALLOC; S.pushSection(); @@ -598,11 +612,10 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsUnknownOS() { } unsigned AMDGPUTargetELFStreamer::getEFlagsAMDHSA() { - assert(STI.getTargetTriple().getOS() == Triple::AMDHSA); + assert(isHsaAbi(STI)); if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) { switch (*HsaAbiVer) { - case ELF::ELFABIVERSION_AMDGPU_HSA_V2: case ELF::ELFABIVERSION_AMDGPU_HSA_V3: return getEFlagsV3(); case ELF::ELFABIVERSION_AMDGPU_HSA_V4: @@ -827,6 +840,24 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata( return true; } +bool AMDGPUTargetAsmStreamer::EmitKernargPreloadHeader( + const MCSubtargetInfo &STI) { + for (int i = 0; i < 64; ++i) { + OS << "\ts_nop 0\n"; + } + return true; +} + +bool AMDGPUTargetELFStreamer::EmitKernargPreloadHeader( + const MCSubtargetInfo &STI) { + const uint32_t Encoded_s_nop = 0xbf800000; + MCStreamer &OS = getStreamer(); + for (int i = 0; i < 64; ++i) { + OS.emitInt32(Encoded_s_nop); + } + return true; +} + bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { const uint32_t Encoded_s_code_end = 0xbf9f0000; const uint32_t Encoded_s_nop = 0xbf800000; @@ -906,6 +937,7 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc1); Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc2); Streamer.emitInt16(KernelDescriptor.kernel_code_properties); - for (uint8_t Res : 
KernelDescriptor.reserved2) + Streamer.emitInt16(KernelDescriptor.kernarg_preload); + for (uint8_t Res : KernelDescriptor.reserved3) Streamer.emitInt8(Res); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index db43de8fcc5f..55b5246c9210 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -90,6 +90,11 @@ public: /// \returns True on success, false on failure. virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) { return true; } + /// \returns True on success, false on failure. + virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) { + return true; + } + virtual void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, @@ -154,6 +159,9 @@ public: /// \returns True on success, false on failure. bool EmitCodeEnd(const MCSubtargetInfo &STI) override; + /// \returns True on success, false on failure. + bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override; + void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, @@ -215,6 +223,9 @@ public: /// \returns True on success, false on failure. bool EmitCodeEnd(const MCSubtargetInfo &STI) override; + /// \returns True on success, false on failure. + bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override; + void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index bbbfbe4faa0f..6c539df7677e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -142,11 +142,11 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, } void R600MCCodeEmitter::emit(uint32_t Value, SmallVectorImpl<char> &CB) const { - support::endian::write(CB, Value, support::little); + support::endian::write(CB, Value, llvm::endianness::little); } void R600MCCodeEmitter::emit(uint64_t Value, SmallVectorImpl<char> &CB) const { - support::endian::write(CB, Value, support::little); + support::endian::write(CB, Value, llvm::endianness::little); } unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td index d924f733624a..240366c8e7da 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -25,6 +25,7 @@ def MIMGEncGfx10Default : MIMGEncoding; def MIMGEncGfx10NSA : MIMGEncoding; def MIMGEncGfx11Default : MIMGEncoding; def MIMGEncGfx11NSA : MIMGEncoding; +def MIMGEncGfx12 : MIMGEncoding; def MIMGEncoding : GenericEnum { let FilterClass = "MIMGEncoding"; @@ -95,11 +96,13 @@ def MIMG { int NOP = -1; } -class mimgopc <int gfx11, int gfx10m, int vi = gfx10m, int si = gfx10m> { +class mimgopc <int gfx12, int gfx11, int gfx10m, int vi = gfx10m, int si = gfx10m> { + field bits<8> 
GFX12 = gfx12; field bits<8> GFX11 = gfx11; field bits<8> GFX10M = gfx10m; // GFX10minus for all but atomics field bits<8> VI = vi; // VI is only used for atomic/sampler/gather instructions field bits<8> SI = si; // SI is only used for atomic instructions + bit HAS_GFX12 = !ne(gfx12, MIMG.NOP); bit HAS_GFX11 = !ne(gfx11, MIMG.NOP); bit HAS_GFX10M = !ne(gfx10m, MIMG.NOP); bit HAS_VI = !ne(vi, MIMG.NOP); @@ -218,6 +221,16 @@ class MIMG <dag outs, string dns = ""> bits<8> VAddrOperands; } +class VIMAGE <dag outs, string dns = ""> : MIMG<outs, dns> { + let MIMG = 0; + let VIMAGE = 1; +} + +class VSAMPLE <dag outs, string dns = ""> : MIMG<outs, dns> { + let MIMG = 0; + let VSAMPLE = 1; +} + def MIMGInfoTable : GenericTable { let FilterClass = "MIMG"; let CppTypeName = "MIMGInfo"; @@ -327,8 +340,8 @@ class MIMG_nsa_gfx10<int op, dag outs, int num_addrs, string dns=""> // Base class of all non-NSA gfx11 MIMG instructions. class MIMG_gfx11<int op, dag outs, string dns = ""> : MIMG<outs, dns>, MIMGe_gfx11<op> { - let SubtargetPredicate = isGFX11Plus; - let AssemblerPredicate = isGFX11Plus; + let SubtargetPredicate = isGFX11Only; + let AssemblerPredicate = isGFX11Only; let MIMGEncoding = MIMGEncGfx11Default; let VAddrOperands = 1; @@ -343,8 +356,8 @@ class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="", list<RegisterClass> addr_types=[], RegisterClass LastAddrRC = VGPR_32> : MIMG<outs, dns>, MIMGe_gfx11<op> { - let SubtargetPredicate = isGFX11Plus; - let AssemblerPredicate = isGFX11Plus; + let SubtargetPredicate = isGFX11Only; + let AssemblerPredicate = isGFX11Only; let MIMGEncoding = MIMGEncGfx11NSA; let VAddrOperands = num_addrs; @@ -359,6 +372,48 @@ class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="", let nsa = nsah.NSA; } +class VIMAGE_gfx12<int op, dag outs, int num_addrs, string dns="", + list<RegisterClass> addr_types=[]> + : VIMAGE<outs, dns>, VIMAGEe<op> { + let SubtargetPredicate = isGFX12Plus; + let AssemblerPredicate = isGFX12Plus; + + let MIMGEncoding = MIMGEncGfx12; + let VAddrOperands = num_addrs; + + MIMGNSAHelper nsah = !if(!empty(addr_types), + MIMGNSAHelper<num_addrs>, + MIMGNSAHelper<num_addrs, addr_types>); + dag AddrIns = nsah.AddrIns; + string AddrAsm = !if(!eq(num_addrs, 1), "$vaddr0", nsah.AddrAsm); + + let d16 = !if(BaseOpcode.HasD16, ?, 0); + let vaddr1 = !if(!lt(num_addrs, 2), 0, ?); + let vaddr2 = !if(!lt(num_addrs, 3), 0, ?); + let vaddr3 = !if(!lt(num_addrs, 4), 0, ?); + let vaddr4 = !if(!lt(num_addrs, 5), 0, ?); +} + +class VSAMPLE_gfx12<int op, dag outs, int num_addrs, string dns="", + RegisterClass Addr3RC> + : VSAMPLE<outs, dns>, VSAMPLEe<op> { + let SubtargetPredicate = isGFX12Plus; + let AssemblerPredicate = isGFX12Plus; + + let MIMGEncoding = MIMGEncGfx12; + let VAddrOperands = num_addrs; + + PartialNSAHelper nsah = PartialNSAHelper<num_addrs, 4, Addr3RC>; + + dag AddrIns = nsah.AddrIns; + string AddrAsm = !if(!eq(num_addrs, 1), "$vaddr0", nsah.AddrAsm); + + let d16 = !if(BaseOpcode.HasD16, ?, 0); + let vaddr1 = !if(!lt(num_addrs, 2), 0, ?); + let vaddr2 = !if(!lt(num_addrs, 3), 0, ?); + let vaddr3 = !if(!lt(num_addrs, 4), 0, ?); +} + class MIMG_NoSampler_Helper <mimgopc op, string asm, RegisterClass dst_rc, RegisterClass addr_rc, @@ -435,12 +490,41 @@ class MIMG_NoSampler_nsa_gfx11<mimgopc op, string opcode, #!if(BaseOpcode.HasD16, "$d16", ""); } +class VIMAGE_NoSampler_gfx12<mimgopc op, string opcode, + RegisterClass DataRC, int num_addrs, + string dns=""> + : VIMAGE_gfx12<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> { + let 
InOperandList = !con(AddrIns, + (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim, + CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class VSAMPLE_Sampler_gfx12<mimgopc op, string opcode, RegisterClass DataRC, + int num_addrs, RegisterClass Addr3RC = VGPR_32, + string dns=""> + : VSAMPLE_gfx12<op.GFX12, (outs DataRC:$vdata), num_addrs, dns, Addr3RC> { + let InOperandList = !con(AddrIns, + (ins SReg_256:$rsrc), + !if(BaseOpcode.Sampler, (ins SReg_128:$samp), (ins)), + (ins DMask:$dmask, Dim:$dim, UNorm:$unorm, + CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, + LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc" + #!if(BaseOpcode.Sampler, ", $samp", "") + #"$dmask$dim$unorm$cpol$r128$a16$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm, - RegisterClass dst_rc, - bit enableDisasm, - bit ExtendedImageInst = 1> { - let ssamp = 0 in { - let VAddrDwords = 1 in { + RegisterClass dst_rc, bit enableDisasm, + bit ExtendedImageInst = 1, + bit isVSample = 0> { + let VAddrDwords = 1 in { + let ssamp = 0 in { if op.HAS_GFX10M then { def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32, !if(enableDisasm, "AMDGPU", "")>; @@ -455,8 +539,19 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm, !if(enableDisasm, "AMDGPU", "")>; } } - - let VAddrDwords = 2 in { + if op.HAS_GFX12 then { + if isVSample then { + let samp = 0 in + def _V1_gfx12 : VSAMPLE_Sampler_gfx12<op, asm, dst_rc, 1>; + } + else { + def _V1_gfx12 : VIMAGE_NoSampler_gfx12<op, asm, dst_rc, 1, + !if(enableDisasm, "GFX12", "")>; + } + } + } + let VAddrDwords = 2 in { + let ssamp = 0 in { if op.HAS_GFX10M then { def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>; if !not(ExtendedImageInst) then @@ -469,8 +564,18 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm, def _V2_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11<op, asm, dst_rc, 2>; } } - - let VAddrDwords = 3 in { + if op.HAS_GFX12 then { + if isVSample then { + let samp = 0 in + def _V2_gfx12 : VSAMPLE_Sampler_gfx12<op, asm, dst_rc, 2>; + } + else { + def _V2_gfx12 : VIMAGE_NoSampler_gfx12<op, asm, dst_rc, 2>; + } + } + } + let VAddrDwords = 3 in { + let ssamp = 0 in { if op.HAS_GFX10M then { def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>; if !not(ExtendedImageInst) then @@ -483,8 +588,18 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm, def _V3_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11<op, asm, dst_rc, 3>; } } - - let VAddrDwords = 4 in { + if op.HAS_GFX12 then { + if isVSample then { + let samp = 0 in + def _V3_gfx12 : VSAMPLE_Sampler_gfx12<op, asm, dst_rc, 3>; + } + else { + def _V3_gfx12 : VIMAGE_NoSampler_gfx12<op, asm, dst_rc, 3>; + } + } + } + let VAddrDwords = 4 in { + let ssamp = 0 in { if op.HAS_GFX10M then { def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>; if !not(ExtendedImageInst) then @@ -499,6 +614,17 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm, !if(enableDisasm, "AMDGPU", "")>; } } + if op.HAS_GFX12 then { + if isVSample then { + let samp = 0 in + def _V4_gfx12 : VSAMPLE_Sampler_gfx12<op, asm, dst_rc, 4, VGPR_32, + !if(enableDisasm, "GFX12", "")>; + } + else { + def _V4_gfx12 : VIMAGE_NoSampler_gfx12<op, asm, dst_rc, 4, + !if(enableDisasm, "GFX12", "")>; + } + } } } @@ -606,62 +732,97 @@ class 
MIMG_Store_nsa_gfx11<mimgopc op, string opcode, #!if(BaseOpcode.HasD16, "$d16", ""); } +class VIMAGE_Store_gfx12<mimgopc op, string opcode, + RegisterClass DataRC, int num_addrs, + string dns=""> + : VIMAGE_gfx12<op.GFX12, (outs), num_addrs, dns> { + let InOperandList = !con((ins DataRC:$vdata), + AddrIns, + (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim, + CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm, RegisterClass data_rc, bit enableDisasm> { let mayLoad = 0, mayStore = 1, hasSideEffects = 0, hasPostISelHook = 0, - DisableWQM = 1, ssamp = 0 in { + DisableWQM = 1 in { let VAddrDwords = 1 in { - if op.HAS_GFX10M then { - def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32, - !if(enableDisasm, "AMDGPU", "")>; - let hasPostISelHook = 1 in - def _V1_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VGPR_32, - !if(enableDisasm, "GFX90A", "")>; - def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32, - !if(enableDisasm, "AMDGPU", "")>; + let ssamp = 0 in { + if op.HAS_GFX10M then { + def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + let hasPostISelHook = 1 in + def _V1_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VGPR_32, + !if(enableDisasm, "GFX90A", "")>; + def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + } + if op.HAS_GFX11 then { + def _V1_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + } } - if op.HAS_GFX11 then { - def _V1_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VGPR_32, - !if(enableDisasm, "AMDGPU", "")>; + if op.HAS_GFX12 then { + def _V1_gfx12 : VIMAGE_Store_gfx12 <op, asm, data_rc, 1, + !if(enableDisasm, "GFX12", "")>; } } let VAddrDwords = 2 in { - if op.HAS_GFX10M then { - def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>; - def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64>; - def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>; - def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>; + let ssamp = 0 in { + if op.HAS_GFX10M then { + def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>; + def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64>; + def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>; + def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>; + } + if op.HAS_GFX11 then { + def _V2_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_64>; + def _V2_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 2>; + } } - if op.HAS_GFX11 then { - def _V2_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_64>; - def _V2_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 2>; + if op.HAS_GFX12 then { + def _V2_gfx12 : VIMAGE_Store_gfx12 <op, asm, data_rc, 2>; } } let VAddrDwords = 3 in { - if op.HAS_GFX10M then { - def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>; - def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96>; - def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>; - def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>; + let ssamp = 0 in { + if op.HAS_GFX10M then { + def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>; + def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96>; + def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>; + def 
_V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>; + } + if op.HAS_GFX11 then { + def _V3_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_96>; + def _V3_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 3>; + } } - if op.HAS_GFX11 then { - def _V3_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_96>; - def _V3_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 3>; + if op.HAS_GFX12 then { + def _V3_gfx12 : VIMAGE_Store_gfx12 <op, asm, data_rc, 3>; } } let VAddrDwords = 4 in { - if op.HAS_GFX10M then { - def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>; - def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128>; - def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>; - def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4, - !if(enableDisasm, "AMDGPU", "")>; + let ssamp = 0 in { + if op.HAS_GFX10M then { + def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>; + def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128>; + def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>; + def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4, + !if(enableDisasm, "AMDGPU", "")>; + } + if op.HAS_GFX11 then { + def _V4_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_128>; + def _V4_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 4, + !if(enableDisasm, "AMDGPU", "")>; + } } - if op.HAS_GFX11 then { - def _V4_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_128>; - def _V4_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 4, - !if(enableDisasm, "AMDGPU", "")>; + if op.HAS_GFX12 then { + def _V4_gfx12 : VIMAGE_Store_gfx12 <op, asm, data_rc, 4, + !if(enableDisasm, "GFX12", "")>; } } } @@ -788,84 +949,137 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } +class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterClass DataRC, + int num_addrs, bit enableDisasm = 0> + : VIMAGE_gfx12<!cast<int>(op.GFX12), (outs DataRC:$vdst), num_addrs, + !if(enableDisasm, "GFX12", "")> { + let Constraints = "$vdst = $vdata"; + + let InOperandList = !con((ins DataRC:$vdata), + AddrIns, + (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim, + CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe)); + let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe"; +} + +class VIMAGE_Atomic_gfx12_Renamed<mimgopc op, string opcode, string renamed, + RegisterClass DataRC, int num_addrs, + bit enableDisasm = 0> + : VIMAGE_Atomic_gfx12<op, renamed, DataRC, num_addrs, enableDisasm>, + MnemonicAlias<opcode, renamed>, Requires<[isGFX12Plus]>; + multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, RegisterClass data_rc, bit enableDasm = 0, - bit isFP = 0> { + bit isFP = 0, + string renamed = ""> { let hasSideEffects = 1, // FIXME: remove this mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1, - ssamp = 0, FPAtomic = isFP in { + FPAtomic = isFP in { let VAddrDwords = 1 in { - if op.HAS_SI then { - def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>; - } - if op.HAS_VI then { - def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>; - let hasPostISelHook = 1 in - def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>; + let ssamp = 0 in { + if op.HAS_SI then { + def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>; + } + if op.HAS_VI then { + def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>; + let hasPostISelHook = 1 in + def _V1_gfx90a : 
MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>; + } + if op.HAS_GFX10M then { + def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>; + } + if op.HAS_GFX11 then { + def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>; + } } - if op.HAS_GFX10M then { - def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>; - } - if op.HAS_GFX11 then { - def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>; + if op.HAS_GFX12 then { + if !empty(renamed) then + def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, enableDasm>; + else + def _V1_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, asm, renamed, data_rc, 1, enableDasm>; } } let VAddrDwords = 2 in { - if op.HAS_SI then { - def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>; + let ssamp = 0 in { + if op.HAS_SI then { + def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>; + } + if op.HAS_VI then { + def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>; + def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64, 0>; + } + if op.HAS_GFX10M then { + def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>; + def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>; + } + if op.HAS_GFX11 then { + def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>; + def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>; + } } - if op.HAS_VI then { - def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>; - def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64, 0>; - } - if op.HAS_GFX10M then { - def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>; - } - if op.HAS_GFX11 then { - def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>; + if op.HAS_GFX12 then { + if !empty(renamed) then + def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, 0>; + else + def _V2_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, asm, renamed, data_rc, 2, 0>; } } let VAddrDwords = 3 in { - if op.HAS_SI then { - def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>; - } - if op.HAS_VI then { - def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>; - def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96, 0>; - } - if op.HAS_GFX10M then { - def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>; + let ssamp = 0 in { + if op.HAS_SI then { + def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>; + } + if op.HAS_VI then { + def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>; + def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96, 0>; + } + if op.HAS_GFX10M then { + def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>; + def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>; + } + if op.HAS_GFX11 then { + def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>; + def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>; + } } - if op.HAS_GFX11 then { - def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>; + if op.HAS_GFX12 then { + if !empty(renamed) then + def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, 0>; + else + def _V3_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, asm, renamed, data_rc, 3, 0>; } } let VAddrDwords = 4 in 
{ - if op.HAS_SI then { - def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>; - } - if op.HAS_VI then { - def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>; - def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128, 0>; + let ssamp = 0 in { + if op.HAS_SI then { + def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>; + } + if op.HAS_VI then { + def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>; + def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128, 0>; + } + if op.HAS_GFX10M then { + def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>; + def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>; + } + if op.HAS_GFX11 then { + def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>; + def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>; + } } - if op.HAS_GFX10M then { - def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>; - } - if op.HAS_GFX11 then { - def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>; + if op.HAS_GFX12 then { + if !empty(renamed) then + def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, enableDasm>; + else + def _V4_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, asm, renamed, data_rc, 4, enableDasm>; } } } } -multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0> { // 64-bit atomics +multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, + string renamed = ""> { // 64-bit atomics let IsAtomicRet = 1 in { def "" : MIMGBaseOpcode { let Atomic = 1; @@ -877,13 +1091,17 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0> // using dmask and tfe. Only 32-bit variant is registered with disassembler. // Other variants are reconstructed by disassembler using dmask and tfe. let VDataDwords = !if(isCmpSwap, 2, 1) in - defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1, isFP>; + defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1, isFP, renamed>; let VDataDwords = !if(isCmpSwap, 4, 2) in - defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64), 0, isFP>; + defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64), 0, isFP, renamed>; } } // End IsAtomicRet = 1 } +multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed, + bit isCmpSwap = 0, bit isFP = 0> + : MIMG_Atomic <op, asm, isCmpSwap, isFP, renamed>; + class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc, RegisterClass src_rc, string dns=""> : MIMG_gfx6789 <op.VI, (outs dst_rc:$vdata), dns> { @@ -1006,7 +1224,7 @@ class MIMGAddrSizes_dw_range<list<int> range> { } class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16, - int nsa_max_addr = 5> { + int nsa_max_addr = 5, bit includeNSA1 = 0> { // List of all possible numbers of address words, taking all combinations of // A16 and image dimension into account (note: no MSAA, since this is for // sample/gather ops). @@ -1061,8 +1279,10 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16, // it is the only one that could have a register other than VGPR32. 
int EnableDisasmNum = !foldl(!head(AllNumAddrWords), !tail(AllNumAddrWords), acc, var, !if(!le(var, nsa_max_addr), var, acc)); + list<int> PossibleVariants = + !listconcat([12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2], !if(includeNSA1, [1], [])); list<LastVAddrSize> PartialNSAInstrs = - !foldl([]<LastVAddrSize>, [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2], lhs, dw, + !foldl([]<LastVAddrSize>, PossibleVariants, lhs, dw, !if(isIntInList<dw, AllNumAddrWords>.ret, !listconcat(lhs, [LastVAddrSize<dw, !sub(nsa_max_addr, 1), !eq(dw, EnableDisasmNum)>]), @@ -1114,6 +1334,16 @@ multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm, } } } + + foreach addr = MIMG_Sampler_AddrSizes<sample, isG16, 4/*MaxNSASize*/, 1>.PartialNSAInstrs in { + let VAddrDwords = addr.NumWords in { + if op.HAS_GFX12 then { + def _V # addr.NumWords # _gfx12 + : VSAMPLE_Sampler_gfx12<op, asm, dst_rc, addr.NumWords, addr.RegClass, + !if(!and(enableDisasm, addr.Disassemble), "GFX12", "")>; + } + } + } } class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample> @@ -1177,12 +1407,12 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16> { RegisterClass RegClass = MIMGAddrSize<num_addrs, 0>.RegClass; int VAddrDwords = !srl(RegClass.Size, 5); - int gfx11_nsa_addrs = !if(IsA16, 4, 5); + int GFX11PlusNSAAddrs = !if(IsA16, 4, 5); RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32); - list<RegisterClass> gfx11_addr_types = - !if(IsA16, - [node_ptr_type, VGPR_32, VReg_96, VReg_96], - [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]); + list<RegisterClass> GFX11PlusAddrTypes = + !if(IsA16, + [node_ptr_type, VGPR_32, VReg_96, VReg_96], + [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]); } class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC> @@ -1215,6 +1445,14 @@ class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs, let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16"; } +class VIMAGE_IntersectRay_gfx12<mimgopc op, string opcode, int num_addrs, + list<RegisterClass> addr_types> + : VIMAGE_gfx12<op.GFX12, (outs VReg_128:$vdata), + num_addrs, "GFX12", addr_types> { + let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$rsrc, A16:$a16)); + let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $rsrc$a16"; +} + multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit IsA16> { defvar info = MIMG_IntersectRay_Helper<Is64, IsA16>; def "" : MIMGBaseOpcode { @@ -1222,30 +1460,39 @@ multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit IsA16> { let A16 = IsA16; } let dmask = 0xf, - unorm = 1, d16 = 0, cpol = 0, tfe = 0, - lwe = 0, r128 = 1, - ssamp = 0, dim = {0, 0, 0}, a16 = IsA16, d16 = 0, BaseOpcode = !cast<MIMGBaseOpcode>(NAME), VDataDwords = 4 in { - def _sa_gfx10 : MIMG_IntersectRay_gfx10<op, opcode, info.RegClass> { - let VAddrDwords = info.VAddrDwords; - } - def _sa_gfx11 : MIMG_IntersectRay_gfx11<op, opcode, info.RegClass> { - let VAddrDwords = info.VAddrDwords; - } - def _nsa_gfx10 : MIMG_IntersectRay_nsa_gfx10<op, opcode, info.num_addrs> { - let VAddrDwords = info.num_addrs; + let unorm = 1, + lwe = 0, + ssamp = 0 in { + if op.HAS_GFX10M then + def _sa_gfx10 : MIMG_IntersectRay_gfx10<op, opcode, info.RegClass> { + let VAddrDwords = info.VAddrDwords; + } + if op.HAS_GFX11 then + def _sa_gfx11 : MIMG_IntersectRay_gfx11<op, opcode, info.RegClass> { + let VAddrDwords = info.VAddrDwords; + } + if op.HAS_GFX10M then + def _nsa_gfx10 : MIMG_IntersectRay_nsa_gfx10<op, opcode, info.num_addrs> { + let VAddrDwords = info.num_addrs; + } + if op.HAS_GFX11 then + def 
_nsa_gfx11 : MIMG_IntersectRay_nsa_gfx11<op, opcode, + info.GFX11PlusNSAAddrs, + info.GFX11PlusAddrTypes> { + let VAddrDwords = info.num_addrs; + } } - def _nsa_gfx11 : MIMG_IntersectRay_nsa_gfx11<op, opcode, - info.gfx11_nsa_addrs, - info.gfx11_addr_types> { + def _gfx12 : VIMAGE_IntersectRay_gfx12<op, opcode, info.GFX11PlusNSAAddrs, + info.GFX11PlusAddrTypes> { let VAddrDwords = info.num_addrs; } } @@ -1261,13 +1508,13 @@ multiclass MIMG_MSAA_Load <mimgopc op, string asm> { let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), Gather4 = 1, hasPostISelHook = 0, mayLoad = 1 in { let VDataDwords = 2 in - defm _V2 : MIMG_NoSampler_Src_Helper<op, asm, VReg_64, 0>; /* packed D16 */ + defm _V2 : MIMG_NoSampler_Src_Helper<op, asm, VReg_64, 0, 0, 1>; /* packed D16 */ let VDataDwords = 3 in - defm _V3 : MIMG_NoSampler_Src_Helper<op, asm, VReg_96, 0>; /* packed D16 + tfe */ + defm _V3 : MIMG_NoSampler_Src_Helper<op, asm, VReg_96, 0, 0, 1>; /* packed D16 + tfe */ let VDataDwords = 4 in - defm _V4 : MIMG_NoSampler_Src_Helper<op, asm, VReg_128, 1>; + defm _V4 : MIMG_NoSampler_Src_Helper<op, asm, VReg_128, 1, 0, 1>; let VDataDwords = 5 in - defm _V5 : MIMG_NoSampler_Src_Helper<op, asm, VReg_160, 0>; + defm _V5 : MIMG_NoSampler_Src_Helper<op, asm, VReg_160, 0, 0, 1>; } } @@ -1276,143 +1523,143 @@ multiclass MIMG_MSAA_Load <mimgopc op, string asm> { //===----------------------------------------------------------------------===// let OtherPredicates = [HasImageInsts] in { -defm IMAGE_LOAD : MIMG_NoSampler <mimgopc<0x00, 0x00>, "image_load", 1>; -defm IMAGE_LOAD_MIP : MIMG_NoSampler <mimgopc<0x01, 0x01>, "image_load_mip", 1, 1>; -defm IMAGE_LOAD_PCK : MIMG_NoSampler <mimgopc<0x02, 0x02>, "image_load_pck", 0>; -defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <mimgopc<0x03, 0x03>, "image_load_pck_sgn", 0>; -defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <mimgopc<0x04, 0x04>, "image_load_mip_pck", 0, 1>; -defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <mimgopc<0x05, 0x05>, "image_load_mip_pck_sgn", 0, 1>; -defm IMAGE_STORE : MIMG_Store <mimgopc<0x06, 0x08>, "image_store", 1>; -defm IMAGE_STORE_MIP : MIMG_Store <mimgopc<0x07, 0x09>, "image_store_mip", 1, 1>; -defm IMAGE_STORE_PCK : MIMG_Store <mimgopc<0x08, 0x0a>, "image_store_pck", 0>; -defm IMAGE_STORE_MIP_PCK : MIMG_Store <mimgopc<0x09, 0x0b>, "image_store_mip_pck", 0, 1>; - -defm IMAGE_GET_RESINFO : MIMG_NoSampler <mimgopc<0x17, 0x0e>, "image_get_resinfo", 0, 1, 1>; - -defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimgopc<0x0a, 0x0f, 0x10, 0x0f>, "image_atomic_swap">; -defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimgopc<0x0b, 0x10, 0x11, 0x10>, "image_atomic_cmpswap", 1>; -defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimgopc<0x0c, 0x11, 0x12, 0x11>, "image_atomic_add">; -defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimgopc<0x0d, 0x12, 0x13, 0x12>, "image_atomic_sub">; -defm IMAGE_ATOMIC_RSUB : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, MIMG.NOP, 0x13>, "image_atomic_rsub">; -defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimgopc<0x0e, 0x14>, "image_atomic_smin">; -defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimgopc<0x0f, 0x15>, "image_atomic_umin">; -defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimgopc<0x10, 0x16>, "image_atomic_smax">; -defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimgopc<0x11, 0x17>, "image_atomic_umax">; -defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimgopc<0x12, 0x18>, "image_atomic_and">; -defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimgopc<0x13, 0x19>, "image_atomic_or">; -defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimgopc<0x14, 0x1a>, "image_atomic_xor">; -defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimgopc<0x15, 0x1b>, "image_atomic_inc">; 
-defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimgopc<0x16, 0x1c>, "image_atomic_dec">; -defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<MIMG.NOP, 0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 1, 1>; -defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<MIMG.NOP, 0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>; -defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<MIMG.NOP, 0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>; - -defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x20>, AMDGPUSample>; +defm IMAGE_LOAD : MIMG_NoSampler <mimgopc<0x00, 0x00, 0x00>, "image_load", 1>; +defm IMAGE_LOAD_MIP : MIMG_NoSampler <mimgopc<0x01, 0x01, 0x01>, "image_load_mip", 1, 1>; +defm IMAGE_LOAD_PCK : MIMG_NoSampler <mimgopc<0x02, 0x02, 0x02>, "image_load_pck", 0>; +defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <mimgopc<0x03, 0x03, 0x03>, "image_load_pck_sgn", 0>; +defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <mimgopc<0x04, 0x04, 0x04>, "image_load_mip_pck", 0, 1>; +defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <mimgopc<0x05, 0x05, 0x05>, "image_load_mip_pck_sgn", 0, 1>; +defm IMAGE_STORE : MIMG_Store <mimgopc<0x06, 0x06, 0x08>, "image_store", 1>; +defm IMAGE_STORE_MIP : MIMG_Store <mimgopc<0x07, 0x07, 0x09>, "image_store_mip", 1, 1>; +defm IMAGE_STORE_PCK : MIMG_Store <mimgopc<0x08, 0x08, 0x0a>, "image_store_pck", 0>; +defm IMAGE_STORE_MIP_PCK : MIMG_Store <mimgopc<0x09, 0x09, 0x0b>, "image_store_mip_pck", 0, 1>; + +defm IMAGE_GET_RESINFO : MIMG_NoSampler <mimgopc<0x17, 0x17, 0x0e, 0x0e, 0x0e>, "image_get_resinfo", 0, 1, 1>; + +defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimgopc<0x0a, 0x0a, 0x0f, 0x10, 0x0f>, "image_atomic_swap">; +defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimgopc<0x0b, 0x0b, 0x10, 0x11, 0x10>, "image_atomic_cmpswap", 1>; +defm IMAGE_ATOMIC_ADD : MIMG_Atomic_Renamed <mimgopc<0x0c, 0x0c, 0x11, 0x12, 0x11>, "image_atomic_add", "image_atomic_add_uint">; +defm IMAGE_ATOMIC_SUB : MIMG_Atomic_Renamed <mimgopc<0x0d, 0x0d, 0x12, 0x13, 0x12>, "image_atomic_sub", "image_atomic_sub_uint">; +defm IMAGE_ATOMIC_RSUB : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, MIMG.NOP, MIMG.NOP, 0x13>, "image_atomic_rsub">; +defm IMAGE_ATOMIC_SMIN : MIMG_Atomic_Renamed <mimgopc<0x0e, 0x0e, 0x14>, "image_atomic_smin", "image_atomic_min_int">; +defm IMAGE_ATOMIC_UMIN : MIMG_Atomic_Renamed <mimgopc<0x0f, 0x0f, 0x15>, "image_atomic_umin", "image_atomic_min_uint">; +defm IMAGE_ATOMIC_SMAX : MIMG_Atomic_Renamed <mimgopc<0x10, 0x10, 0x16>, "image_atomic_smax", "image_atomic_max_int">; +defm IMAGE_ATOMIC_UMAX : MIMG_Atomic_Renamed <mimgopc<0x11, 0x11, 0x17>, "image_atomic_umax", "image_atomic_max_uint">; +defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimgopc<0x12, 0x12, 0x18>, "image_atomic_and">; +defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimgopc<0x13, 0x13, 0x19>, "image_atomic_or">; +defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimgopc<0x14, 0x14, 0x1a>, "image_atomic_xor">; +defm IMAGE_ATOMIC_INC : MIMG_Atomic_Renamed <mimgopc<0x15, 0x15, 0x1b>, "image_atomic_inc", "image_atomic_inc_uint">; +defm IMAGE_ATOMIC_DEC : MIMG_Atomic_Renamed <mimgopc<0x16, 0x16, 0x1c>, "image_atomic_dec", "image_atomic_dec_uint">; +defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 1, 1>; +defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>; +defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>; + +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x1b, 0x20>, AMDGPUSample>; let OtherPredicates = [HasExtendedImageInsts] in { -defm 
IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <mimgopc<0x40, 0x21>, AMDGPUSample_cl>; -defm IMAGE_SAMPLE_D : MIMG_Sampler <mimgopc<0x1c, 0x22>, AMDGPUSample_d>; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <mimgopc<0x41, 0x23>, AMDGPUSample_d_cl>; -defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x1d, 0x24>, AMDGPUSample_l>; -defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <mimgopc<0x1e, 0x25>, AMDGPUSample_b>; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <mimgopc<0x42, 0x26>, AMDGPUSample_b_cl>; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler <mimgopc<0x1f, 0x27>, AMDGPUSample_lz>; -defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <mimgopc<0x20, 0x28>, AMDGPUSample_c>; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <mimgopc<0x43, 0x29>, AMDGPUSample_c_cl>; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <mimgopc<0x21, 0x2a>, AMDGPUSample_c_d>; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <mimgopc<0x44, 0x2b>, AMDGPUSample_c_d_cl>; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <mimgopc<0x22, 0x2c>, AMDGPUSample_c_l>; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <mimgopc<0x23, 0x2d>, AMDGPUSample_c_b>; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <mimgopc<0x45, 0x2e>, AMDGPUSample_c_b_cl>; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <mimgopc<0x24, 0x2f>, AMDGPUSample_c_lz>; -defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <mimgopc<0x25, 0x30>, AMDGPUSample_o>; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <mimgopc<0x46, 0x31>, AMDGPUSample_cl_o>; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler <mimgopc<0x26, 0x32>, AMDGPUSample_d_o>; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <mimgopc<0x47, 0x33>, AMDGPUSample_d_cl_o>; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler <mimgopc<0x27, 0x34>, AMDGPUSample_l_o>; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <mimgopc<0x28, 0x35>, AMDGPUSample_b_o>; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x48, 0x36>, AMDGPUSample_b_cl_o>; -defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <mimgopc<0x29, 0x37>, AMDGPUSample_lz_o>; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <mimgopc<0x2a, 0x38>, AMDGPUSample_c_o>; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <mimgopc<0x49, 0x39>, AMDGPUSample_c_cl_o>; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <mimgopc<0x2b, 0x3a>, AMDGPUSample_c_d_o>; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <mimgopc<0x4a, 0x3b>, AMDGPUSample_c_d_cl_o>; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <mimgopc<0x2c, 0x3c>, AMDGPUSample_c_l_o>; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x4b, 0x3e>, AMDGPUSample_c_b_cl_o>; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <mimgopc<0x2d, 0x3d>, AMDGPUSample_c_b_o>; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <mimgopc<0x2e, 0x3f>, AMDGPUSample_c_lz_o>; -defm IMAGE_GATHER4 : MIMG_Gather_WQM <mimgopc<0x2f, 0x40>, AMDGPUSample>; -defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <mimgopc<0x60, 0x41>, AMDGPUSample_cl>; -defm IMAGE_GATHER4_L : MIMG_Gather <mimgopc<0x30, 0x44>, AMDGPUSample_l>; -defm IMAGE_GATHER4_B : MIMG_Gather_WQM <mimgopc<0x31, 0x45>, AMDGPUSample_b>; -defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <mimgopc<0x61, 0x46>, AMDGPUSample_b_cl>; -defm IMAGE_GATHER4_LZ : MIMG_Gather <mimgopc<0x32, 0x47>, AMDGPUSample_lz>; -defm IMAGE_GATHER4_C : MIMG_Gather_WQM <mimgopc<0x33, 0x48>, AMDGPUSample_c>; -defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <mimgopc<0x62, 0x49>, AMDGPUSample_c_cl>; -defm IMAGE_GATHER4_C_L : MIMG_Gather <mimgopc<0x63, 0x4c>, AMDGPUSample_c_l>; -defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <mimgopc<0x64, 0x4d>, AMDGPUSample_c_b>; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <mimgopc<0x65, 0x4e>, AMDGPUSample_c_b_cl>; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <mimgopc<0x34, 0x4f>, AMDGPUSample_c_lz>; -defm IMAGE_GATHER4_O : 
MIMG_Gather_WQM <mimgopc<0x35, 0x50>, AMDGPUSample_o>; -defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x51>, AMDGPUSample_cl_o>; -defm IMAGE_GATHER4_L_O : MIMG_Gather <mimgopc<MIMG.NOP, 0x54>, AMDGPUSample_l_o>; -defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x55>, AMDGPUSample_b_o>; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <mimgopc<MIMG.NOP, 0x56>, AMDGPUSample_b_cl_o>; -defm IMAGE_GATHER4_LZ_O : MIMG_Gather <mimgopc<0x36, 0x57>, AMDGPUSample_lz_o>; -defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x58>, AMDGPUSample_c_o>; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x59>, AMDGPUSample_c_cl_o>; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <mimgopc<MIMG.NOP, 0x5c>, AMDGPUSample_c_l_o>; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x5d>, AMDGPUSample_c_b_o>; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x5e>, AMDGPUSample_c_b_cl_o>; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <mimgopc<0x37, 0x5f>, AMDGPUSample_c_lz_o>; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <mimgopc<0x40, 0x40, 0x21>, AMDGPUSample_cl>; +defm IMAGE_SAMPLE_D : MIMG_Sampler <mimgopc<0x1c, 0x1c, 0x22>, AMDGPUSample_d>; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <mimgopc<0x41, 0x41, 0x23>, AMDGPUSample_d_cl>; +defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x1d, 0x1d, 0x24>, AMDGPUSample_l>; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <mimgopc<0x1e, 0x1e, 0x25>, AMDGPUSample_b>; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <mimgopc<0x42, 0x42, 0x26>, AMDGPUSample_b_cl>; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <mimgopc<0x1f, 0x1f, 0x27>, AMDGPUSample_lz>; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <mimgopc<0x20, 0x20, 0x28>, AMDGPUSample_c>; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <mimgopc<0x43, 0x43, 0x29>, AMDGPUSample_c_cl>; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <mimgopc<0x21, 0x21, 0x2a>, AMDGPUSample_c_d>; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <mimgopc<0x44, 0x44, 0x2b>, AMDGPUSample_c_d_cl>; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <mimgopc<0x22, 0x22, 0x2c>, AMDGPUSample_c_l>; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <mimgopc<0x23, 0x23, 0x2d>, AMDGPUSample_c_b>; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <mimgopc<0x45, 0x45, 0x2e>, AMDGPUSample_c_b_cl>; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <mimgopc<0x24, 0x24, 0x2f>, AMDGPUSample_c_lz>; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <mimgopc<0x25, 0x25, 0x30>, AMDGPUSample_o>; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <mimgopc<0x46, 0x46, 0x31>, AMDGPUSample_cl_o>; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <mimgopc<0x26, 0x26, 0x32>, AMDGPUSample_d_o>; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <mimgopc<0x47, 0x47, 0x33>, AMDGPUSample_d_cl_o>; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <mimgopc<0x27, 0x27, 0x34>, AMDGPUSample_l_o>; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <mimgopc<0x28, 0x28, 0x35>, AMDGPUSample_b_o>; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x48, 0x48, 0x36>, AMDGPUSample_b_cl_o>; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <mimgopc<0x29, 0x29, 0x37>, AMDGPUSample_lz_o>; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <mimgopc<0x2a, 0x2a, 0x38>, AMDGPUSample_c_o>; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <mimgopc<0x49, 0x49, 0x39>, AMDGPUSample_c_cl_o>; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <mimgopc<0x2b, 0x2b, 0x3a>, AMDGPUSample_c_d_o>; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <mimgopc<0x4a, 0x4a, 0x3b>, AMDGPUSample_c_d_cl_o>; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <mimgopc<0x2c, 0x2c, 0x3c>, AMDGPUSample_c_l_o>; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x4b, 
0x4b, 0x3e>, AMDGPUSample_c_b_cl_o>; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <mimgopc<0x2d, 0x2d, 0x3d>, AMDGPUSample_c_b_o>; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <mimgopc<0x2e, 0x2e, 0x3f>, AMDGPUSample_c_lz_o>; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <mimgopc<0x2f, 0x2f, 0x40>, AMDGPUSample>; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <mimgopc<0x60, 0x60, 0x41>, AMDGPUSample_cl>; +defm IMAGE_GATHER4_L : MIMG_Gather <mimgopc<0x30, 0x30, 0x44>, AMDGPUSample_l>; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <mimgopc<0x31, 0x31, 0x45>, AMDGPUSample_b>; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <mimgopc<0x61, 0x61, 0x46>, AMDGPUSample_b_cl>; +defm IMAGE_GATHER4_LZ : MIMG_Gather <mimgopc<0x32, 0x32, 0x47>, AMDGPUSample_lz>; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <mimgopc<0x33, 0x33, 0x48>, AMDGPUSample_c>; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <mimgopc<0x62, 0x62, 0x49>, AMDGPUSample_c_cl>; +defm IMAGE_GATHER4_C_L : MIMG_Gather <mimgopc<0x63, 0x63, 0x4c>, AMDGPUSample_c_l>; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <mimgopc<0x64, 0x64, 0x4d>, AMDGPUSample_c_b>; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <mimgopc<0x65, 0x65, 0x4e>, AMDGPUSample_c_b_cl>; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <mimgopc<0x34, 0x34, 0x4f>, AMDGPUSample_c_lz>; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <mimgopc<0x35, 0x35, 0x50>, AMDGPUSample_o>; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, MIMG.NOP, 0x51>, AMDGPUSample_cl_o>; +defm IMAGE_GATHER4_L_O : MIMG_Gather <mimgopc<MIMG.NOP, MIMG.NOP, 0x54>, AMDGPUSample_l_o>; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, MIMG.NOP, 0x55>, AMDGPUSample_b_o>; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <mimgopc<MIMG.NOP, MIMG.NOP, 0x56>, AMDGPUSample_b_cl_o>; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <mimgopc<0x36, 0x36, 0x57>, AMDGPUSample_lz_o>; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, MIMG.NOP, 0x58>, AMDGPUSample_c_o>; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, MIMG.NOP, 0x59>, AMDGPUSample_c_cl_o>; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <mimgopc<MIMG.NOP, MIMG.NOP, 0x5c>, AMDGPUSample_c_l_o>; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, MIMG.NOP, 0x5d>, AMDGPUSample_c_b_o>; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, MIMG.NOP, 0x5e>, AMDGPUSample_c_b_cl_o>; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <mimgopc<0x37, 0x37, 0x5f>, AMDGPUSample_c_lz_o>; let SubtargetPredicate = isGFX9Plus in -defm IMAGE_GATHER4H : MIMG_Gather <mimgopc<0x90, 0x61, 0x42>, AMDGPUSample, 1, "image_gather4h">; - -defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x38, 0x60>, AMDGPUSample, 1, 0, 1, "image_get_lod">; - -defm IMAGE_SAMPLE_CD : MIMG_Sampler <mimgopc<MIMG.NOP, 0x68>, AMDGPUSample_cd>; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <mimgopc<MIMG.NOP, 0x69>, AMDGPUSample_cd_cl>; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6a>, AMDGPUSample_c_cd>; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6b>, AMDGPUSample_c_cd_cl>; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6c>, AMDGPUSample_cd_o>; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6d>, AMDGPUSample_cd_cl_o>; -defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6e>, AMDGPUSample_c_cd_o>; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6f>, AMDGPUSample_c_cd_cl_o>; +defm IMAGE_GATHER4H : MIMG_Gather <mimgopc<0x90, 0x90, 0x61, 0x42>, AMDGPUSample, 1, "image_gather4h">; + +defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x38, 0x38, 0x60>, AMDGPUSample, 1, 0, 1, 
"image_get_lod">; + +defm IMAGE_SAMPLE_CD : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x68>, AMDGPUSample_cd>; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x69>, AMDGPUSample_cd_cl>; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6a>, AMDGPUSample_c_cd>; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6b>, AMDGPUSample_c_cd_cl>; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6c>, AMDGPUSample_cd_o>; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6d>, AMDGPUSample_cd_cl_o>; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6e>, AMDGPUSample_c_cd_o>; +defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6f>, AMDGPUSample_c_cd_cl_o>; } // End OtherPredicates = [HasExtendedImageInsts] let OtherPredicates = [HasExtendedImageInsts,HasG16] in { -defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0x39, 0xa2>, AMDGPUSample_d, 0, 1>; -defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0x5f, 0xa3>, AMDGPUSample_d_cl, 0, 1>; -defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0x3a, 0xaa>, AMDGPUSample_c_d, 0, 1>; -defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0x54, 0xab>, AMDGPUSample_c_d_cl, 0, 1>; -defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0x3b, 0xb2>, AMDGPUSample_d_o, 0, 1>; -defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x55, 0xb3>, AMDGPUSample_d_cl_o, 0, 1>; -defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0x3c, 0xba>, AMDGPUSample_c_d_o, 0, 1>; -defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x56, 0xbb>, AMDGPUSample_c_d_cl_o, 0, 1>; -defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xe8>, AMDGPUSample_cd, 0, 1>; -defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xe9>, AMDGPUSample_cd_cl, 0, 1>; -defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xea>, AMDGPUSample_c_cd, 0, 1>; -defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xeb>, AMDGPUSample_c_cd_cl, 0, 1>; -defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xec>, AMDGPUSample_cd_o, 0, 1>; -defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xed>, AMDGPUSample_cd_cl_o, 0, 1>; -defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xee>, AMDGPUSample_c_cd_o, 0, 1>; -defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>; +defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0x39, 0x39, 0xa2>, AMDGPUSample_d, 0, 1>; +defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0x5f, 0x5f, 0xa3>, AMDGPUSample_d_cl, 0, 1>; +defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0x3a, 0x3a, 0xaa>, AMDGPUSample_c_d, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0x54, 0x54, 0xab>, AMDGPUSample_c_d_cl, 0, 1>; +defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0x3b, 0x3b, 0xb2>, AMDGPUSample_d_o, 0, 1>; +defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x55, 0x55, 0xb3>, AMDGPUSample_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0x3c, 0x3c, 0xba>, AMDGPUSample_c_d_o, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x56, 0x56, 0xbb>, AMDGPUSample_c_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe8>, AMDGPUSample_cd, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe9>, AMDGPUSample_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xea>, AMDGPUSample_c_cd, 0, 1>; +defm 
IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xeb>, AMDGPUSample_c_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xec>, AMDGPUSample_cd_o, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xed>, AMDGPUSample_cd_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xee>, AMDGPUSample_c_cd_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>; } // End OtherPredicates = [HasExtendedImageInsts,HasG16] //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", mimgopc<0x7e>>; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", mimgopc<0x7f>>; let SubtargetPredicate = isGFX10Only, OtherPredicates = [HasGFX10_AEncoding] in -defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler <mimgopc<MIMG.NOP, 0x80>, "image_msaa_load", 1, 0, 0, 1>; +defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x80>, "image_msaa_load", 1, 0, 0, 1>; let OtherPredicates = [HasGFX10_AEncoding] in -defm IMAGE_MSAA_LOAD : MIMG_MSAA_Load <mimgopc<0x18, MIMG.NOP>, "image_msaa_load">; +defm IMAGE_MSAA_LOAD : MIMG_MSAA_Load <mimgopc<0x18, 0x18, MIMG.NOP>, "image_msaa_load">; let OtherPredicates = [HasGFX10_AEncoding] in { -defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x19, 0xe6>, "image_bvh_intersect_ray", 0, 0>; -defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x19, 0xe6>, "image_bvh_intersect_ray", 0, 1>; -defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 0>; -defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 1>; +defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x19, 0x19, 0xe6>, "image_bvh_intersect_ray", 0, 0>; +defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x19, 0x19, 0xe6>, "image_bvh_intersect_ray", 0, 1>; +defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x1a, 0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 0>; +defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x1a, 0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 1>; } // End OtherPredicates = [HasGFX10_AEncoding] } // End let OtherPredicates = [HasImageInsts] diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600.h index 2b0a887c61fa..6c40c2813e20 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600.h @@ -27,7 +27,7 @@ FunctionPass *createR600ClauseMergePass(); FunctionPass *createR600Packetizer(); FunctionPass *createR600ControlFlowFinalizer(); FunctionPass *createR600MachineCFGStructurizerPass(); -FunctionPass *createR600ISelDag(TargetMachine &TM, CodeGenOpt::Level OptLevel); +FunctionPass *createR600ISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel); ModulePass *createR600OpenCLImageTypeLoweringPass(); void initializeR600ClauseMergePassPass(PassRegistry &); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp index 20c2ff8a4fd7..293db13f34f6 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp @@ -30,7 +30,7 @@ class R600DAGToDAGISel : public AMDGPUDAGToDAGISel { public: R600DAGToDAGISel() = delete; - explicit R600DAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel) 
+ explicit R600DAGToDAGISel(TargetMachine &TM, CodeGenOptLevel OptLevel) : AMDGPUDAGToDAGISel(TM, OptLevel) {} void Select(SDNode *N) override; @@ -183,6 +183,6 @@ bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, /// This pass converts a legalized DAG into a R600-specific // DAG, ready for instruction scheduling. FunctionPass *llvm::createR600ISelDag(TargetMachine &TM, - CodeGenOpt::Level OptLevel) { + CodeGenOptLevel OptLevel) { return new R600DAGToDAGISel(TM, OptLevel); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index ad072cfe23b1..c1ba9c514874 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -101,7 +101,7 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSUB, MVT::f32, Expand); - setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR}, + setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR}, MVT::f64, Custom); setOperationAction(ISD::SELECT_CC, {MVT::f32, MVT::i32}, Custom); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td index f4dfbe8adc75..f82bd55beccc 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -782,7 +782,7 @@ def SETNE_DX10 : R600_2OP < def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>; def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; -def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>; +def RNDNE : R600_1OP_Helper <0x13, "RNDNE", froundeven>; def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>; def MOV : R600_1OP <0x19, "MOV", []>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp index 2a15c0123b74..195dc4f9a0f4 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp @@ -163,11 +163,11 @@ class R600OpenCLImageTypeLoweringPass : public ModulePass { Value *Replacement = nullptr; StringRef Name = F->getName(); - if (Name.startswith(GetImageResourceIDFunc)) { + if (Name.starts_with(GetImageResourceIDFunc)) { Replacement = ConstantInt::get(Int32Type, ResourceID); - } else if (Name.startswith(GetImageSizeFunc)) { + } else if (Name.starts_with(GetImageSizeFunc)) { Replacement = &ImageSizeArg; - } else if (Name.startswith(GetImageFormatFunc)) { + } else if (Name.starts_with(GetImageFormatFunc)) { Replacement = &ImageFormatArg; } else { continue; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp index 36840587d219..6cd4fd42444d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp @@ -53,7 +53,7 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, TargetOptions Options, std::optional<Reloc::Model> RM, std::optional<CodeModel::Model> CM, - CodeGenOpt::Level OL, bool JIT) + CodeGenOptLevel OL, bool JIT) : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) { 
setRequiresStructuredCFG(true); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.h index f0e3cd352642..3fe54c778fe1 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.h @@ -33,7 +33,7 @@ public: R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, std::optional<Reloc::Model> RM, - std::optional<CodeModel::Model> CM, CodeGenOpt::Level OL, + std::optional<CodeModel::Model> CM, CodeGenOptLevel OL, bool JIT); TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index b87cd8c66cc8..932c0d6216ce 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" @@ -206,9 +207,12 @@ bool SIAnnotateControlFlow::openIf(BranchInst *Term) { if (isUniform(Term)) return false; - Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); - Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); - push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); + IRBuilder<> IRB(Term); + Value *IfCall = IRB.CreateCall(If, {Term->getCondition()}); + Value *Cond = IRB.CreateExtractValue(IfCall, {0}); + Value *Mask = IRB.CreateExtractValue(IfCall, {1}); + Term->setCondition(Cond); + push(Term->getSuccessor(1), Mask); return true; } @@ -217,15 +221,24 @@ bool SIAnnotateControlFlow::insertElse(BranchInst *Term) { if (isUniform(Term)) { return false; } - Value *Ret = CallInst::Create(Else, popSaved(), "", Term); - Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); - push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); + + IRBuilder<> IRB(Term); + Value *ElseCall = IRB.CreateCall(Else, {popSaved()}); + Value *Cond = IRB.CreateExtractValue(ElseCall, {0}); + Value *Mask = IRB.CreateExtractValue(ElseCall, {1}); + Term->setCondition(Cond); + push(Term->getSuccessor(1), Mask); return true; } /// Recursively handle the condition leading to a loop Value *SIAnnotateControlFlow::handleLoopCondition( Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term) { + + auto CreateBreak = [this, Cond, Broken](Instruction *I) -> CallInst * { + return IRBuilder<>(I).CreateCall(IfBreak, {Cond, Broken}); + }; + if (Instruction *Inst = dyn_cast<Instruction>(Cond)) { BasicBlock *Parent = Inst->getParent(); Instruction *Insert; @@ -235,8 +248,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition( Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime(); } - Value *Args[] = { Cond, Broken }; - return CallInst::Create(IfBreak, Args, "", Insert); + return CreateBreak(Insert); } // Insert IfBreak in the loop header TERM for constant COND other than true. @@ -244,14 +256,12 @@ Value *SIAnnotateControlFlow::handleLoopCondition( Instruction *Insert = Cond == BoolTrue ? 
Term : L->getHeader()->getTerminator(); - Value *Args[] = { Cond, Broken }; - return CallInst::Create(IfBreak, Args, "", Insert); + return CreateBreak(Insert); } if (isa<Argument>(Cond)) { Instruction *Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime(); - Value *Args[] = { Cond, Broken }; - return CallInst::Create(IfBreak, Args, "", Insert); + return CreateBreak(Insert); } llvm_unreachable("Unhandled loop condition!"); @@ -268,7 +278,8 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) { return false; BasicBlock *Target = Term->getSuccessor(1); - PHINode *Broken = PHINode::Create(IntMask, 0, "phi.broken", &Target->front()); + PHINode *Broken = PHINode::Create(IntMask, 0, "phi.broken"); + Broken->insertBefore(Target->begin()); Value *Cond = Term->getCondition(); Term->setCondition(BoolTrue); @@ -286,7 +297,8 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) { Broken->addIncoming(PHIValue, Pred); } - Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); + CallInst *LoopCall = IRBuilder<>(Term).CreateCall(Loop, {Arg}); + Term->setCondition(LoopCall); push(Term->getSuccessor(0), Arg); @@ -325,7 +337,7 @@ bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { // Split edge to make Def dominate Use FirstInsertionPt = &*SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt(); } - CallInst::Create(EndCf, Exec, "", FirstInsertionPt); + IRBuilder<>(FirstInsertionPt).CreateCall(EndCf, {Exec}); } return true; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h index cd1818285e3e..b291400a947c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h @@ -44,6 +44,7 @@ enum { GFX90A = 8, GFX940 = 9, GFX11 = 10, + GFX12 = 11, }; } @@ -80,19 +81,21 @@ enum : uint64_t { MTBUF = 1 << 18, SMRD = 1 << 19, MIMG = 1 << 20, - EXP = 1 << 21, - FLAT = 1 << 22, - DS = 1 << 23, + VIMAGE = 1 << 21, + VSAMPLE = 1 << 22, + EXP = 1 << 23, + FLAT = 1 << 24, + DS = 1 << 25, // Pseudo instruction formats. - VGPRSpill = 1 << 24, - SGPRSpill = 1 << 25, + VGPRSpill = 1 << 26, + SGPRSpill = 1 << 27, // LDSDIR instruction format. - LDSDIR = 1 << 26, + LDSDIR = 1 << 28, // VINTERP instruction format. - VINTERP = 1 << 27, + VINTERP = 1 << 29, // High bits - other information. VM_CNT = UINT64_C(1) << 32, @@ -161,6 +164,9 @@ enum : uint64_t { // Is never uniform. IsNeverUniform = UINT64_C(1) << 61, + + // ds_gws_* instructions. + GWS = UINT64_C(1) << 62, }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. @@ -207,6 +213,9 @@ enum OperandType : unsigned { OPERAND_REG_INLINE_C_V2INT32, OPERAND_REG_INLINE_C_V2FP32, + // Operand for split barrier inline constant + OPERAND_INLINE_SPLIT_BARRIER_INT32, + /// Operand with 32-bit immediate that uses the constant bus. OPERAND_KIMM32, OPERAND_KIMM16, @@ -326,13 +335,20 @@ enum : unsigned { LITERAL_CONST = 255, VGPR_MIN = 256, VGPR_MAX = 511, - IS_VGPR = 256 // Indicates VGPR or AGPR + IS_VGPR = 256, // Indicates VGPR or AGPR }; } // namespace EncValues -} // namespace AMDGPU -namespace AMDGPU { +// Register codes as defined in the TableGen's HWEncoding field. +namespace HWEncoding { +enum : unsigned { + REG_IDX_MASK = 0xff, + IS_VGPR_OR_AGPR = 1 << 8, + IS_HI = 1 << 9, // High 16-bit register. 
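Editor's note (illustrative sketch, not part of the upstream patch): the new HWEncoding masks just above describe how TableGen packs a register's hardware encoding into one value. A minimal decoder under that bit layout; the struct and helper are hypothetical, and the raw constants mirror REG_IDX_MASK, IS_VGPR_OR_AGPR and IS_HI.

#include <cstdint>

// Hypothetical decoder for a SIDefines.h HWEncoding value.
struct DecodedHWEncoding {
  unsigned RegIdx;     // low 8 bits: register index
  bool IsVGPROrAGPR;   // bit 8: vector (VGPR/AGPR) vs. scalar register
  bool IsHi;           // bit 9: high 16-bit half of a register
};

static DecodedHWEncoding decodeHWEncoding(uint32_t Enc) {
  DecodedHWEncoding D;
  D.RegIdx = Enc & 0xff;            // HWEncoding::REG_IDX_MASK
  D.IsVGPROrAGPR = Enc & (1u << 8); // HWEncoding::IS_VGPR_OR_AGPR
  D.IsHi = Enc & (1u << 9);         // HWEncoding::IS_HI
  return D;
}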
+}; +} // namespace HWEncoding + namespace CPol { enum CPol { @@ -343,7 +359,47 @@ enum CPol { SC0 = GLC, SC1 = SCC, NT = SLC, - ALL = GLC | SLC | DLC | SCC + ALL_pregfx12 = GLC | SLC | DLC | SCC, + SWZ_pregfx12 = 8, + + // Below are GFX12+ cache policy bits + + // Temporal hint + TH = 0x7, // All TH bits + TH_RT = 0, // regular + TH_NT = 1, // non-temporal + TH_HT = 2, // high-temporal + TH_LU = 3, // last use + TH_RT_WB = 3, // regular (CU, SE), high-temporal with write-back (MALL) + TH_NT_RT = 4, // non-temporal (CU, SE), regular (MALL) + TH_RT_NT = 5, // regular (CU, SE), non-temporal (MALL) + TH_NT_HT = 6, // non-temporal (CU, SE), high-temporal (MALL) + TH_NT_WB = 7, // non-temporal (CU, SE), high-temporal with write-back (MALL) + TH_BYPASS = 3, // only to be used with scope = 3 + + TH_RESERVED = 7, // unused value for load insts + + // Bits of TH for atomics + TH_ATOMIC_RETURN = GLC, // Returning vs non-returning + TH_ATOMIC_NT = SLC, // Non-temporal vs regular + TH_ATOMIC_CASCADE = 4, // Cascading vs regular + + // Scope + SCOPE = 0x3 << 3, // All Scope bits + SCOPE_CU = 0 << 3, + SCOPE_SE = 1 << 3, + SCOPE_DEV = 2 << 3, + SCOPE_SYS = 3 << 3, + + SWZ = 1 << 6, // Swizzle bit + + ALL = TH | SCOPE, + + // Helper bits + TH_TYPE_LOAD = 1 << 7, // TH_LOAD policy + TH_TYPE_STORE = 1 << 8, // TH_STORE policy + TH_TYPE_ATOMIC = 1 << 9, // TH_ATOMIC policy + TH_REAL_BYPASS = 1 << 10, // is TH=3 bypass policy or not }; } // namespace CPol @@ -360,8 +416,8 @@ enum Id { // Message ID, width(4) [3:0]. ID_DEALLOC_VGPRS_GFX11Plus = 3, // reused in GFX11 ID_SAVEWAVE = 4, // added in GFX8, removed in GFX11 - ID_STALL_WAVE_GEN = 5, // added in GFX9 - ID_HALT_WAVES = 6, // added in GFX9 + ID_STALL_WAVE_GEN = 5, // added in GFX9, removed in GFX12 + ID_HALT_WAVES = 6, // added in GFX9, removed in GFX12 ID_ORDERED_PS_DONE = 7, // added in GFX9, removed in GFX11 ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10 ID_GS_ALLOC_REQ = 9, // added in GFX9 @@ -375,6 +431,7 @@ enum Id { // Message ID, width(4) [3:0]. 
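Editor's note (illustrative sketch, not part of the upstream patch): on GFX12 the cache-policy operand is composed from the TH (temporal hint) and SCOPE fields added to CPol in the hunk above, with TH in bits 0-2 and SCOPE in bits 3-4. A minimal sketch of composing and splitting such a value; the raw constants mirror TH_NT, SCOPE_SYS, TH and SCOPE, and the values are examples only.

#include <cstdint>

// Compose a GFX12 cache policy: non-temporal hint at system scope.
static uint32_t makeNonTemporalSysPolicy() {
  const uint32_t TH_NT = 1;          // CPol::TH_NT
  const uint32_t SCOPE_SYS = 3u << 3; // CPol::SCOPE_SYS
  return TH_NT | SCOPE_SYS;
}

// Split a policy value back into its two fields.
static void splitPolicy(uint32_t CPolVal, uint32_t &TH, uint32_t &Scope) {
  TH = CPolVal & 0x7;            // CPol::TH
  Scope = CPolVal & (0x3u << 3); // CPol::SCOPE
}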
ID_RTN_GET_REALTIME = 131, ID_RTN_SAVE_WAVE = 132, ID_RTN_GET_TBA = 133, + ID_RTN_GET_SE_AID_ID = 134, ID_MASK_PreGFX11_ = 0xF, ID_MASK_GFX11Plus_ = 0xFF @@ -425,6 +482,9 @@ enum Id { // HwRegCode, (6) [5:0] ID_GPR_ALLOC = 5, ID_LDS_ALLOC = 6, ID_IB_STS = 7, + ID_PERF_SNAPSHOT_DATA_gfx12 = 10, + ID_PERF_SNAPSHOT_PC_LO_gfx12 = 11, + ID_PERF_SNAPSHOT_PC_HI_gfx12 = 12, ID_MEM_BASES = 15, ID_TBA_LO = 16, ID_TBA_HI = 17, @@ -436,12 +496,23 @@ enum Id { // HwRegCode, (6) [5:0] ID_HW_ID1 = 23, ID_HW_ID2 = 24, ID_POPS_PACKER = 25, - ID_PERF_SNAPSHOT_DATA = 27, + ID_PERF_SNAPSHOT_DATA_gfx11 = 27, ID_SHADER_CYCLES = 29, - - // Register numbers reused in GFX11+ - ID_PERF_SNAPSHOT_PC_LO = 18, - ID_PERF_SNAPSHOT_PC_HI = 19, + ID_SHADER_CYCLES_HI = 30, + ID_DVGPR_ALLOC_LO = 31, + ID_DVGPR_ALLOC_HI = 32, + + // Register numbers reused in GFX11 + ID_PERF_SNAPSHOT_PC_LO_gfx11 = 18, + ID_PERF_SNAPSHOT_PC_HI_gfx11 = 19, + + // Register numbers reused in GFX12+ + ID_STATE_PRIV = 4, + ID_PERF_SNAPSHOT_DATA1 = 15, + ID_PERF_SNAPSHOT_DATA2 = 16, + ID_EXCP_FLAG_PRIV = 17, + ID_EXCP_FLAG_USER = 18, + ID_TRAP_CTRL = 19, // GFX940 specific registers ID_XCC_ID = 20, @@ -958,6 +1029,14 @@ enum Register_Flag : uint8_t { } // namespace AMDGPU +namespace AMDGPU { +namespace Barrier { +enum Type { TRAP = -2, WORKGROUP = -1 }; +} // namespace Barrier +} // namespace AMDGPU + +// clang-format off + #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 #define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) #define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) @@ -1050,6 +1129,9 @@ enum Register_Flag : uint8_t { #define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21) #define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1) #define C_00B848_DX10_CLAMP 0xFFDFFFFF +#define S_00B848_RR_WG_MODE(x) (((x) & 0x1) << 21) +#define G_00B848_RR_WG_MODE(x) (((x) >> 21) & 0x1) +#define C_00B848_RR_WG_MODE 0xFFDFFFFF #define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22) #define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1) #define C_00B848_DEBUG_MODE 0xFFBFFFFF @@ -1066,7 +1148,6 @@ enum Register_Flag : uint8_t { #define G_00B848_FWD_PROGRESS(x) (((x) >> 31) & 0x1) #define C_00B848_FWD_PROGRESS 0x7FFFFFFF - // Helpers for setting FLOAT_MODE #define FP_ROUND_ROUND_TO_NEAREST 0 #define FP_ROUND_ROUND_TO_INF 1 @@ -1108,6 +1189,9 @@ enum Register_Flag : uint8_t { #define R_SPILLED_SGPRS 0x4 #define R_SPILLED_VGPRS 0x8 + +// clang-format on + } // End namespace llvm #endif diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index db323465c153..86980ee851bb 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -88,7 +88,7 @@ public: // VGPR to SGPR copy being processed MachineInstr *Copy; // All SALU instructions reachable from this copy in SSA graph - DenseSet<MachineInstr *> SChain; + SetVector<MachineInstr *> SChain; // Number of SGPR to VGPR copies that are used to put the SALU computation // results back to VALU. 
unsigned NumSVCopies; @@ -125,7 +125,7 @@ class SIFixSGPRCopies : public MachineFunctionPass { SmallVector<MachineInstr*, 4> PHINodes; SmallVector<MachineInstr*, 4> S2VCopies; unsigned NextVGPRToSGPRCopyID; - DenseMap<unsigned, V2SCopyInfo> V2SCopies; + MapVector<unsigned, V2SCopyInfo> V2SCopies; DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty; public: @@ -152,6 +152,13 @@ public: void processPHINode(MachineInstr &MI); + // Check if MO is an immediate materialized into a VGPR, and if so replace it + // with an SGPR immediate. The VGPR immediate is also deleted if it does not + // have any other uses. + bool tryMoveVGPRConstToSGPR(MachineOperand &MO, Register NewDst, + MachineBasicBlock *BlockToInsertTo, + MachineBasicBlock::iterator PointToInsertTo); + StringRef getPassName() const override { return "SI Fix SGPR copies"; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -350,7 +357,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, return false; // FIXME: Handle copies with sub-regs. - if (Copy->getOperand(0).getSubReg()) + if (Copy->getOperand(1).getSubReg()) return false; switch (MoveImm->getOpcode()) { @@ -360,7 +367,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, SMovOp = AMDGPU::S_MOV_B32; break; case AMDGPU::V_MOV_B64_PSEUDO: - SMovOp = AMDGPU::S_MOV_B64; + SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO; break; } Imm = ImmOp->getImm(); @@ -662,13 +669,17 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { : MBB; MachineBasicBlock::iterator PointToInsertCopy = MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I; - MachineInstr *NewCopy = - BuildMI(*BlockToInsertCopy, PointToInsertCopy, - PointToInsertCopy->getDebugLoc(), - TII->get(AMDGPU::COPY), NewDst) - .addReg(MO.getReg()); - MO.setReg(NewDst); - analyzeVGPRToSGPRCopy(NewCopy); + + if (!tryMoveVGPRConstToSGPR(MO, NewDst, BlockToInsertCopy, + PointToInsertCopy)) { + MachineInstr *NewCopy = + BuildMI(*BlockToInsertCopy, PointToInsertCopy, + PointToInsertCopy->getDebugLoc(), + TII->get(AMDGPU::COPY), NewDst) + .addReg(MO.getReg()); + MO.setReg(NewDst); + analyzeVGPRToSGPRCopy(NewCopy); + } } } } @@ -765,7 +776,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { for (auto MI : PHINodes) { processPHINode(*MI); } - if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge) + if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge) hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII); SiblingPenalty.clear(); @@ -829,6 +840,32 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { } } +bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR( + MachineOperand &MaybeVGPRConstMO, Register DstReg, + MachineBasicBlock *BlockToInsertTo, + MachineBasicBlock::iterator PointToInsertTo) { + + MachineInstr *DefMI = MRI->getVRegDef(MaybeVGPRConstMO.getReg()); + if (!DefMI || !DefMI->isMoveImmediate()) + return false; + + MachineOperand *SrcConst = TII->getNamedOperand(*DefMI, AMDGPU::OpName::src0); + if (SrcConst->isReg()) + return false; + + const TargetRegisterClass *SrcRC = + MRI->getRegClass(MaybeVGPRConstMO.getReg()); + unsigned MoveSize = TRI->getRegSizeInBits(*SrcRC); + unsigned MoveOp = MoveSize == 64 ? 
AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; + BuildMI(*BlockToInsertTo, PointToInsertTo, PointToInsertTo->getDebugLoc(), + TII->get(MoveOp), DstReg) + .add(*SrcConst); + if (MRI->hasOneUse(MaybeVGPRConstMO.getReg())) + DefMI->eraseFromParent(); + MaybeVGPRConstMO.setReg(DstReg); + return true; +} + bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, MachineBasicBlock::iterator &I) { Register DstReg = MI.getOperand(0).getReg(); @@ -846,25 +883,10 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) .add(MI.getOperand(1)); MI.getOperand(1).setReg(TmpReg); - } else { - MachineInstr *DefMI = MRI->getVRegDef(SrcReg); - if (DefMI && DefMI->isMoveImmediate()) { - MachineOperand SrcConst = DefMI->getOperand(AMDGPU::getNamedOperandIdx( - DefMI->getOpcode(), AMDGPU::OpName::src0)); - if (!SrcConst.isReg()) { - const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg); - unsigned MoveSize = TRI->getRegSizeInBits(*SrcRC); - unsigned MoveOp = - MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(MoveOp), - DstReg) - .add(SrcConst); - I = std::next(I); - if (MRI->hasOneUse(SrcReg)) - DefMI->eraseFromParent(); - MI.eraseFromParent(); - } - } + } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(), + MI)) { + I = std::next(I); + MI.eraseFromParent(); } return true; } @@ -966,7 +988,7 @@ bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) { for (auto J : Info->Siblings) { auto InfoIt = V2SCopies.find(J); if (InfoIt != V2SCopies.end()) { - MachineInstr *SiblingCopy = InfoIt->getSecond().Copy; + MachineInstr *SiblingCopy = InfoIt->second.Copy; if (SiblingCopy->isImplicitDef()) // the COPY has already been MoveToVALUed continue; @@ -1001,15 +1023,15 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { unsigned CurID = LoweringWorklist.pop_back_val(); auto CurInfoIt = V2SCopies.find(CurID); if (CurInfoIt != V2SCopies.end()) { - V2SCopyInfo C = CurInfoIt->getSecond(); + V2SCopyInfo C = CurInfoIt->second; LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump()); for (auto S : C.Siblings) { auto SibInfoIt = V2SCopies.find(S); if (SibInfoIt != V2SCopies.end()) { - V2SCopyInfo &SI = SibInfoIt->getSecond(); + V2SCopyInfo &SI = SibInfoIt->second; LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump()); if (!SI.NeedToBeConvertedToVALU) { - set_subtract(SI.SChain, C.SChain); + SI.SChain.set_subtract(C.SChain); if (needToBeConvertedToVALU(&SI)) LoweringWorklist.push_back(SI.ID); } @@ -1018,6 +1040,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { } LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy << " is being turned to VALU\n"); + // TODO: MapVector::erase is inefficient. Do bulk removal with remove_if + // instead. 
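Editor's note (illustrative sketch, not part of the upstream patch): the SChain and V2SCopies containers in SIFixSGPRCopies above were switched from DenseSet/DenseMap to SetVector/MapVector so that iteration follows insertion order and the pass behaves deterministically across runs. A minimal sketch of that property, with made-up values.

#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"

static void iterationOrderExample() {
  llvm::SetVector<int> Chain;
  Chain.insert(30);
  Chain.insert(10);
  Chain.insert(30); // duplicate, ignored
  for (int ID : Chain) {
    (void)ID; // always visits 30 then 10, regardless of hashing
  }

  llvm::MapVector<unsigned, const char *> Copies;
  Copies.insert({7, "first"});
  Copies.insert({2, "second"});
  for (auto &Entry : Copies) {
    (void)Entry; // visits key 7 then key 2, in insertion order
  }
}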
V2SCopies.erase(C.ID); Copies.insert(C.Copy); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 9f1d6038f1b6..709de612d81d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -80,6 +80,10 @@ public: bool updateOperand(FoldCandidate &Fold) const; + bool canUseImmWithOpSel(FoldCandidate &Fold) const; + + bool tryFoldImmWithOpSel(FoldCandidate &Fold) const; + bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo, MachineOperand *OpToFold) const; @@ -196,61 +200,86 @@ FunctionPass *llvm::createSIFoldOperandsPass() { return new SIFoldOperands(); } -bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { +bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const { MachineInstr *MI = Fold.UseMI; MachineOperand &Old = MI->getOperand(Fold.UseOpNo); - assert(Old.isReg()); + const uint64_t TSFlags = MI->getDesc().TSFlags; + assert(Old.isReg() && Fold.isImm()); - const uint64_t TSFlags = MI->getDesc().TSFlags; - if (Fold.isImm()) { - if (TSFlags & SIInstrFlags::IsPacked && !(TSFlags & SIInstrFlags::IsMAI) && - (!ST->hasDOTOpSelHazard() || !(TSFlags & SIInstrFlags::IsDOT)) && - AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, - ST->hasInv2PiInlineImm())) { - // Set op_sel/op_sel_hi on this operand or bail out if op_sel is - // already set. - unsigned Opcode = MI->getOpcode(); - int OpNo = MI->getOperandNo(&Old); - int ModIdx = -1; - if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) - ModIdx = AMDGPU::OpName::src0_modifiers; - else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) - ModIdx = AMDGPU::OpName::src1_modifiers; - else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) - ModIdx = AMDGPU::OpName::src2_modifiers; - assert(ModIdx != -1); - ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx); - MachineOperand &Mod = MI->getOperand(ModIdx); - unsigned Val = Mod.getImm(); - if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) { - // Only apply the following transformation if that operand requires - // a packed immediate. - switch (TII->get(Opcode).operands()[OpNo].OperandType) { - case AMDGPU::OPERAND_REG_IMM_V2FP16: - case AMDGPU::OPERAND_REG_IMM_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - // If upper part is all zero we do not need op_sel_hi. 
- if (!isUInt<16>(Fold.ImmToFold)) { - if (!(Fold.ImmToFold & 0xffff)) { - Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); - return true; - } - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); - return true; - } - break; - default: - break; - } - } - } + if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) || + (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) || + isUInt<16>(Fold.ImmToFold) || + !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm())) + return false; + + unsigned Opcode = MI->getOpcode(); + int OpNo = MI->getOperandNo(&Old); + uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType; + switch (OpType) { + default: + return false; + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + break; } + return true; +} + +bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const { + MachineInstr *MI = Fold.UseMI; + MachineOperand &Old = MI->getOperand(Fold.UseOpNo); + unsigned Opcode = MI->getOpcode(); + int OpNo = MI->getOperandNo(&Old); + + // Set op_sel/op_sel_hi on this operand or bail out if op_sel is + // already set. + int ModIdx = -1; + if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) + ModIdx = AMDGPU::OpName::src0_modifiers; + else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) + ModIdx = AMDGPU::OpName::src1_modifiers; + else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) + ModIdx = AMDGPU::OpName::src2_modifiers; + assert(ModIdx != -1); + ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx); + MachineOperand &Mod = MI->getOperand(ModIdx); + unsigned Val = Mod.getImm(); + if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1)) + return false; + + // Only apply the following transformation if that operand requires + // a packed immediate. + // If upper part is all zero we do not need op_sel_hi. 
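Editor's note (illustrative sketch, not part of the upstream patch): tryFoldImmWithOpSel, continued below, handles a packed v2f16/v2i16 immediate that does not fit in 16 bits by folding a single 16-bit half as the literal and encoding the lane selection in the op_sel modifier bits. A tiny worked example of the "low half is zero" case handled immediately below; the constant is chosen for illustration only.

#include <cstdint>

// Packed immediate <hi = 0x3C00 (1.0 half), lo = 0x0000>.
static void packedImmSplitExample() {
  uint32_t Imm = 0x3C000000u;
  if ((Imm & 0xffffu) == 0) {
    uint16_t Literal = uint16_t((Imm >> 16) & 0xffffu); // 0x3C00 becomes the folded operand
    (void)Literal;
    // The patch then sets OP_SEL_0 and clears OP_SEL_1 on the corresponding
    // source modifier, mirroring the code that follows.
  }
}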
+ if (!(Fold.ImmToFold & 0xffff)) { + MachineOperand New = + MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff); + if (!TII->isOperandLegal(*MI, OpNo, &New)) + return false; + Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); + Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); + Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + return true; + } + MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff); + if (!TII->isOperandLegal(*MI, OpNo, &New)) + return false; + Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); + Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); + return true; +} + +bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { + MachineInstr *MI = Fold.UseMI; + MachineOperand &Old = MI->getOperand(Fold.UseOpNo); + assert(Old.isReg()); + + if (Fold.isImm() && canUseImmWithOpSel(Fold)) + return tryFoldImmWithOpSel(Fold); + if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) { MachineBasicBlock *MBB = MI->getParent(); auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16); @@ -345,9 +374,50 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList, bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo, MachineOperand *OpToFold) const { - if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) { + const unsigned Opc = MI->getOpcode(); + + auto tryToFoldAsFMAAKorMK = [&]() { + if (!OpToFold->isImm()) + return false; + + const bool TryAK = OpNo == 3; + const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32; + MI->setDesc(TII->get(NewOpc)); + + // We have to fold into operand which would be Imm not into OpNo. + bool FoldAsFMAAKorMK = + tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold); + if (FoldAsFMAAKorMK) { + // Untie Src2 of fmac. + MI->untieRegOperand(3); + // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1. + if (OpNo == 1) { + MachineOperand &Op1 = MI->getOperand(1); + MachineOperand &Op2 = MI->getOperand(2); + Register OldReg = Op1.getReg(); + // Operand 2 might be an inlinable constant + if (Op2.isImm()) { + Op1.ChangeToImmediate(Op2.getImm()); + Op2.ChangeToRegister(OldReg, false); + } else { + Op1.setReg(Op2.getReg()); + Op2.setReg(OldReg); + } + } + return true; + } + MI->setDesc(TII->get(Opc)); + return false; + }; + + bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold); + if (!IsLegal && OpToFold->isImm()) { + FoldCandidate Fold(MI, OpNo, OpToFold); + IsLegal = canUseImmWithOpSel(Fold); + } + + if (!IsLegal) { // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2 - unsigned Opc = MI->getOpcode(); unsigned NewOpc = macToMad(Opc); if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) { // Check if changing this to a v_mad_{f16, f32} instruction will allow us @@ -367,6 +437,13 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, MI->setDesc(TII->get(Opc)); } + // Special case for s_fmac_f32 if we are trying to fold into Src2. + // By transforming into fmaak we can untie Src2 and make folding legal. 
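Editor's note (illustrative sketch, not part of the upstream patch): the tryToFoldAsFMAAKorMK path above and the checks below rewrite S_FMAC_F32 into S_FMAAK_F32 or S_FMAMK_F32 so that a literal can be folded and the tied Src2 untied. A minimal sketch of the arithmetic each form performs, using plain C++ floats as a stand-in for the SALU ops; this is the editor's reading of the operand order, not normative ISA documentation.

// D is the destination, S0/S1 register sources, K a 32-bit literal.
static float s_fmac_f32(float D, float S0, float S1)  { return S0 * S1 + D; }  // Src2 tied to D
static float s_fmaak_f32(float S0, float S1, float K) { return S0 * S1 + K; }  // literal addend
static float s_fmamk_f32(float S0, float K, float S1) { return S0 * K + S1; }  // literal multiplicand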
+ if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) { + if (tryToFoldAsFMAAKorMK()) + return true; + } + // Special case for s_setreg_b32 if (OpToFold->isImm()) { unsigned ImmOpc = 0; @@ -387,66 +464,72 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, if (isUseMIInFoldList(FoldList, MI)) return false; - unsigned CommuteOpNo = OpNo; - // Operand is not legal, so try to commute the instruction to // see if this makes it possible to fold. - unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex; - unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex; - bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1); - - if (CanCommute) { - if (CommuteIdx0 == OpNo) - CommuteOpNo = CommuteIdx1; - else if (CommuteIdx1 == OpNo) - CommuteOpNo = CommuteIdx0; - } - + unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex; + bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo); + if (!CanCommute) + return false; // One of operands might be an Imm operand, and OpNo may refer to it after // the call of commuteInstruction() below. Such situations are avoided // here explicitly as OpNo must be a register operand to be a candidate // for memory folding. - if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() || - !MI->getOperand(CommuteIdx1).isReg())) + if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg()) return false; - if (!CanCommute || - !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1)) + if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo)) return false; + int Op32 = -1; if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) { - if ((Opc == AMDGPU::V_ADD_CO_U32_e64 || - Opc == AMDGPU::V_SUB_CO_U32_e64 || - Opc == AMDGPU::V_SUBREV_CO_U32_e64) && // FIXME - (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) { - - // Verify the other operand is a VGPR, otherwise we would violate the - // constant bus restriction. - unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0; - MachineOperand &OtherOp = MI->getOperand(OtherIdx); - if (!OtherOp.isReg() || - !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg())) - return false; - - assert(MI->getOperand(1).isDef()); + if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 && + Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME + (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) { + TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo); + return false; + } - // Make sure to get the 32-bit version of the commuted opcode. - unsigned MaybeCommutedOpc = MI->getOpcode(); - int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc); + // Verify the other operand is a VGPR, otherwise we would violate the + // constant bus restriction. + MachineOperand &OtherOp = MI->getOperand(OpNo); + if (!OtherOp.isReg() || + !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg())) + return false; - appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32); - return true; - } + assert(MI->getOperand(1).isDef()); - TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1); - return false; + // Make sure to get the 32-bit version of the commuted opcode. 
+ unsigned MaybeCommutedOpc = MI->getOpcode(); + Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc); } - appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true); + appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32); return true; } + // Inlineable constant might have been folded into Imm operand of fmaak or + // fmamk and we are trying to fold a non-inlinable constant. + if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) && + !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) { + unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2; + MachineOperand &OpImm = MI->getOperand(ImmIdx); + if (!OpImm.isReg() && + TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm)) + return tryToFoldAsFMAAKorMK(); + } + + // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1. + // By changing into fmamk we can untie Src2. + // If folding for Src0 happens first and it is identical operand to Src1 we + // should avoid transforming into fmamk which requires commuting as it would + // cause folding into Src1 to fail later on due to wrong OpNo used. + if (Opc == AMDGPU::S_FMAC_F32 && + (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) { + if (tryToFoldAsFMAAKorMK()) + return true; + } + // Check the case where we might introduce a second constant operand to a // scalar instruction if (TII->isSALU(MI->getOpcode())) { @@ -458,7 +541,8 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, // Otherwise check for another constant for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) { auto &Op = MI->getOperand(i); - if (OpNo != i && !Op.isReg() && !TII->isInlineConstant(Op, OpInfo)) + if (OpNo != i && !Op.isReg() && + !TII->isInlineConstant(Op, InstDesc.operands()[i])) return false; } } @@ -516,13 +600,10 @@ bool SIFoldOperands::tryToFoldACImm( if (UseOpIdx >= Desc.getNumOperands()) return false; - uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType; - if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST || - OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) && - (OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST || - OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST)) + if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx)) return false; + uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType; if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) && TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) { UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm()); @@ -671,24 +752,6 @@ void SIFoldOperands::foldOperand( const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg); if (!DestReg.isPhysical()) { - if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) { - SmallVector<FoldCandidate, 4> CopyUses; - for (auto &Use : MRI->use_nodbg_operands(DestReg)) { - // There's no point trying to fold into an implicit operand. - if (Use.isImplicit()) - continue; - - CopyUses.emplace_back(Use.getParent(), - Use.getParent()->getOperandNo(&Use), - &UseMI->getOperand(1)); - } - - for (auto &F : CopyUses) { - foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, - CopiesToReplace); - } - } - if (DestRC == &AMDGPU::AGPR_32RegClass && TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) { UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64)); @@ -1035,6 +1098,9 @@ SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const { // selection. // TODO: See if a frame index with a fixed offset can fold. 
bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const { + if (!MI->allImplicitDefsAreDead()) + return false; + unsigned Opc = MI->getOpcode(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); @@ -1340,6 +1406,7 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const { case AMDGPU::V_MAX_F32_e64: case AMDGPU::V_MAX_F16_e64: case AMDGPU::V_MAX_F16_t16_e64: + case AMDGPU::V_MAX_F16_fake16_e64: case AMDGPU::V_MAX_F64_e64: case AMDGPU::V_PK_MAX_F16: { if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm()) @@ -1435,7 +1502,8 @@ static int getOModValue(unsigned Opc, int64_t Val) { } } case AMDGPU::V_MUL_F16_e64: - case AMDGPU::V_MUL_F16_t16_e64: { + case AMDGPU::V_MUL_F16_t16_e64: + case AMDGPU::V_MUL_F16_fake16_e64: { switch (static_cast<uint16_t>(Val)) { case 0x3800: // 0.5 return SIOutMods::DIV2; @@ -1462,12 +1530,14 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const { case AMDGPU::V_MUL_F64_e64: case AMDGPU::V_MUL_F32_e64: case AMDGPU::V_MUL_F16_t16_e64: + case AMDGPU::V_MUL_F16_fake16_e64: case AMDGPU::V_MUL_F16_e64: { // If output denormals are enabled, omod is ignored. if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) || ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 || - Op == AMDGPU::V_MUL_F16_t16_e64) && + Op == AMDGPU::V_MUL_F16_t16_e64 || + Op == AMDGPU::V_MUL_F16_fake16_e64) && MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign)) return std::pair(nullptr, SIOutMods::NONE); @@ -1497,12 +1567,14 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const { case AMDGPU::V_ADD_F64_e64: case AMDGPU::V_ADD_F32_e64: case AMDGPU::V_ADD_F16_e64: - case AMDGPU::V_ADD_F16_t16_e64: { + case AMDGPU::V_ADD_F16_t16_e64: + case AMDGPU::V_ADD_F16_fake16_e64: { // If output denormals are enabled, omod is ignored. if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) || ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 || - Op == AMDGPU::V_ADD_F16_t16_e64) && + Op == AMDGPU::V_ADD_F16_t16_e64 || + Op == AMDGPU::V_ADD_F16_fake16_e64) && MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign)) return std::pair(nullptr, SIOutMods::NONE); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 903e726c667d..0f89df144486 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -11,7 +11,7 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/Target/TargetMachine.h" @@ -26,13 +26,17 @@ static cl::opt<bool> EnableSpillVGPRToAGPR( cl::ReallyHidden, cl::init(true)); -// Find a register matching \p RC from \p LiveRegs which is unused and available -// throughout the function. On failure, returns AMDGPU::NoRegister. +// Find a register matching \p RC from \p LiveUnits which is unused and +// available throughout the function. On failure, returns AMDGPU::NoRegister. +// TODO: Rewrite the loop here to iterate over MCRegUnits instead of +// MCRegisters. This should reduce the number of iterations and avoid redundant +// checking. 
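Editor's note (illustrative sketch, not part of the upstream patch): the SIFrameLowering changes that follow migrate from LivePhysRegs to LiveRegUnits. LiveRegUnits::available() takes only the register and does not consult MachineRegisterInfo, which is why an explicit isReserved() check is added next to each call. A minimal sketch of the combined test, mirroring the availability check used by findUnusedRegister below.

#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

// Returns true if Reg is neither live at this point nor reserved.
static bool isUsableScratchReg(const llvm::MachineRegisterInfo &MRI,
                               const llvm::LiveRegUnits &LiveUnits,
                               llvm::MCRegister Reg) {
  return LiveUnits.available(Reg) && !MRI.isReserved(Reg);
}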
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI, - const LivePhysRegs &LiveRegs, + const LiveRegUnits &LiveUnits, const TargetRegisterClass &RC) { for (MCRegister Reg : RC) { - if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg)) + if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) && + !MRI.isReserved(Reg)) return Reg; } return MCRegister(); @@ -42,22 +46,21 @@ static MCRegister findUnusedRegister(MachineRegisterInfo &MRI, // callee-save registers since they may appear to be free when this is called // from canUseAsPrologue (during shrink wrapping), but then no longer be free // when this is called from emitPrologue. -static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, - LivePhysRegs &LiveRegs, - const TargetRegisterClass &RC, - bool Unused = false) { +static MCRegister findScratchNonCalleeSaveRegister( + MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits, + const TargetRegisterClass &RC, bool Unused = false) { // Mark callee saved registers as used so we will not choose them. const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); for (unsigned i = 0; CSRegs[i]; ++i) - LiveRegs.addReg(CSRegs[i]); + LiveUnits.addReg(CSRegs[i]); // We are looking for a register that can be used throughout the entire // function, so any use is unacceptable. if (Unused) - return findUnusedRegister(MRI, LiveRegs, RC); + return findUnusedRegister(MRI, LiveUnits, RC); for (MCRegister Reg : RC) { - if (LiveRegs.available(MRI, Reg)) + if (LiveUnits.available(Reg) && !MRI.isReserved(Reg)) return Reg; } @@ -65,9 +68,9 @@ static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, } /// Query target location for spilling SGPRs -/// \p IncludeScratchCopy : Also look for free scratch SGPRs +/// \p IncludeScratchCopy : Also look for free scratch SGPRs static void getVGPRSpillLaneOrTempRegister( - MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR, + MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR, const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass, bool IncludeScratchCopy = true) { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -81,11 +84,11 @@ static void getVGPRSpillLaneOrTempRegister( // We need to save and restore the given SGPR. Register ScratchSGPR; - // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs - // should have all the callee saved registers marked as used. For certain - // cases we skip copy to scratch SGPR. + // 1: Try to save the given register into an unused scratch SGPR. The + // LiveUnits should have all the callee saved registers marked as used. For + // certain cases we skip copy to scratch SGPR. 
if (IncludeScratchCopy) - ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); + ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC); if (!ScratchSGPR) { int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr, @@ -99,10 +102,10 @@ static void getVGPRSpillLaneOrTempRegister( SGPR, PrologEpilogSGPRSaveRestoreInfo( SGPRSaveKind::SPILL_TO_VGPR_LANE, FI)); - LLVM_DEBUG( - auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front(); - dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " - << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';); + LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front(); + dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " + << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane + << '\n';); } else { // Remove dead <FI> index MF.getFrameInfo().RemoveStackObject(FI); @@ -118,7 +121,7 @@ static void getVGPRSpillLaneOrTempRegister( MFI->addToPrologEpilogSGPRSpills( SGPR, PrologEpilogSGPRSaveRestoreInfo( SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR)); - LiveRegs.addReg(ScratchSGPR); + LiveUnits.addReg(ScratchSGPR); LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to " << printReg(ScratchSGPR, TRI) << '\n'); } @@ -129,7 +132,7 @@ static void getVGPRSpillLaneOrTempRegister( // use. static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &FuncInfo, - LivePhysRegs &LiveRegs, MachineFunction &MF, + LiveRegUnits &LiveUnits, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SpillReg, int FI, Register FrameReg, @@ -142,18 +145,18 @@ static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI), FrameInfo.getObjectAlign(FI)); - LiveRegs.addReg(SpillReg); + LiveUnits.addReg(SpillReg); bool IsKill = !MBB.isLiveIn(SpillReg); TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg, - DwordOff, MMO, nullptr, &LiveRegs); + DwordOff, MMO, nullptr, &LiveUnits); if (IsKill) - LiveRegs.removeReg(SpillReg); + LiveUnits.removeReg(SpillReg); } static void buildEpilogRestore(const GCNSubtarget &ST, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &FuncInfo, - LivePhysRegs &LiveRegs, MachineFunction &MF, + LiveRegUnits &LiveUnits, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SpillReg, int FI, @@ -167,7 +170,7 @@ static void buildEpilogRestore(const GCNSubtarget &ST, PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI), FrameInfo.getObjectAlign(FI)); TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg, - DwordOff, MMO, nullptr, &LiveRegs); + DwordOff, MMO, nullptr, &LiveUnits); } static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -195,18 +198,18 @@ static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addReg(GitPtrLo); } -static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI, - const SIMachineFunctionInfo *FuncInfo, - MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, bool IsProlog) { - if (LiveRegs.empty()) { - LiveRegs.init(TRI); +static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI, + const SIMachineFunctionInfo *FuncInfo, + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, bool 
IsProlog) { + if (LiveUnits.empty()) { + LiveUnits.init(TRI); if (IsProlog) { - LiveRegs.addLiveIns(MBB); + LiveUnits.addLiveIns(MBB); } else { // In epilog. - LiveRegs.addLiveOuts(MBB); - LiveRegs.stepBackward(*MBBI); + LiveUnits.addLiveOuts(MBB); + LiveUnits.stepBackward(*MBBI); } } } @@ -228,7 +231,7 @@ class PrologEpilogSGPRSpillBuilder { const SIRegisterInfo &TRI; Register SuperReg; const PrologEpilogSGPRSaveRestoreInfo SI; - LivePhysRegs &LiveRegs; + LiveRegUnits &LiveUnits; const DebugLoc &DL; Register FrameReg; ArrayRef<int16_t> SplitParts; @@ -239,10 +242,10 @@ class PrologEpilogSGPRSpillBuilder { MachineRegisterInfo &MRI = MF.getRegInfo(); assert(!MFI.isDeadObjectIndex(FI)); - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true); + initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true); MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + MRI, LiveUnits, AMDGPU::VGPR_32RegClass); if (!TmpVGPR) report_fatal_error("failed to find free scratch register"); @@ -253,7 +256,7 @@ class PrologEpilogSGPRSpillBuilder { BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(SubReg); - buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR, + buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR, FI, FrameReg, DwordOff); DwordOff += 4; } @@ -264,14 +267,15 @@ class PrologEpilogSGPRSpillBuilder { assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef<SIRegisterInfo::SpilledReg> Spill = - FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); + FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI); assert(Spill.size() == NumSubRegs); for (unsigned I = 0; I < NumSubRegs; ++I) { Register SubReg = NumSubRegs == 1 ? SuperReg : Register(TRI.getSubReg(SuperReg, SplitParts[I])); - BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[I].VGPR) + BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR), + Spill[I].VGPR) .addReg(SubReg) .addImm(Spill[I].Lane) .addReg(Spill[I].VGPR, RegState::Undef); @@ -287,9 +291,9 @@ class PrologEpilogSGPRSpillBuilder { void restoreFromMemory(const int FI) { MachineRegisterInfo &MRI = MF.getRegInfo(); - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false); + initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false); MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + MRI, LiveUnits, AMDGPU::VGPR_32RegClass); if (!TmpVGPR) report_fatal_error("failed to find free scratch register"); @@ -298,8 +302,8 @@ class PrologEpilogSGPRSpillBuilder { ? SuperReg : Register(TRI.getSubReg(SuperReg, SplitParts[I])); - buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR, - FI, FrameReg, DwordOff); + buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, + TmpVGPR, FI, FrameReg, DwordOff); BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) .addReg(TmpVGPR, RegState::Kill); DwordOff += 4; @@ -309,14 +313,14 @@ class PrologEpilogSGPRSpillBuilder { void restoreFromVGPRLane(const int FI) { assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef<SIRegisterInfo::SpilledReg> Spill = - FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); + FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI); assert(Spill.size() == NumSubRegs); for (unsigned I = 0; I < NumSubRegs; ++I) { Register SubReg = NumSubRegs == 1 ? 
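Editor's note (illustrative sketch, not part of the upstream patch): buildScratchExecCopy, shown below, saves the exec mask and then either enables all lanes or only the previously inactive lanes, depending on EnableInactiveLanes. A minimal model of the two SaveExec opcodes it selects, on a 64-bit (wave64) mask; this is the editor's summary of their effect, not ISA documentation.

#include <cstdint>

struct SaveExec { uint64_t SavedExec; uint64_t NewExec; };

// s_or_saveexec dst, -1 : save exec, then activate every lane.
static SaveExec orSaveExecAllOnes(uint64_t Exec)  { return {Exec, Exec | ~0ull}; }

// s_xor_saveexec dst, -1 : save exec, then activate only the lanes that were
// inactive before (the complement of the saved mask).
static SaveExec xorSaveExecAllOnes(uint64_t Exec) { return {Exec, Exec ^ ~0ull}; }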
SuperReg : Register(TRI.getSubReg(SuperReg, SplitParts[I])); - BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg) + BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) .addReg(Spill[I].VGPR) .addImm(Spill[I].Lane); } @@ -335,11 +339,12 @@ public: MachineBasicBlock::iterator MI, const DebugLoc &DL, const SIInstrInfo *TII, const SIRegisterInfo &TRI, - LivePhysRegs &LiveRegs, Register FrameReg) + LiveRegUnits &LiveUnits, Register FrameReg) : MI(MI), MBB(MBB), MF(*MBB.getParent()), ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()), FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), - SuperReg(Reg), SI(SI), LiveRegs(LiveRegs), DL(DL), FrameReg(FrameReg) { + SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL), + FrameReg(FrameReg) { const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg); SplitParts = TRI.getRegSplitParts(RC, EltSize); NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); @@ -396,9 +401,9 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( if (ST.isAmdPalOS()) { // Extract the scratch offset from the descriptor in the GIT - LivePhysRegs LiveRegs; - LiveRegs.init(*TRI); - LiveRegs.addLiveIns(MBB); + LiveRegUnits LiveUnits; + LiveUnits.init(*TRI); + LiveUnits.addLiveIns(MBB); // Find unused reg to load flat scratch init into MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -409,8 +414,8 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded)); Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); for (MCPhysReg Reg : AllSGPR64s) { - if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) && - !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { + if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) && + MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { FlatScrInit = Reg; break; } @@ -692,7 +697,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } bool NeedsFlatScratchInit = - MFI->hasFlatScratchInit() && + MFI->getUserSGPRInfo().hasFlatScratchInit() && (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); @@ -775,7 +780,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( // Use relocations to get the pointer, and setup the other bits manually. uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - if (MFI->hasImplicitBufferPtr()) { + if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) { Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { @@ -814,7 +819,6 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( BuildMI(MBB, I, DL, SMovB32, Rsrc1) .addExternalSymbol("SCRATCH_RSRC_DWORD1") .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - } BuildMI(MBB, I, DL, SMovB32, Rsrc2) @@ -873,7 +877,7 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { // Activate only the inactive lanes when \p EnableInactiveLanes is true. // Otherwise, activate all lanes. It returns the saved exec. 
-static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, +static Register buildScratchExecCopy(LiveRegUnits &LiveUnits, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -886,14 +890,14 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI = TII->getRegisterInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); + initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); ScratchExecCopy = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, *TRI.getWaveMaskRegClass()); + MRI, LiveUnits, *TRI.getWaveMaskRegClass()); if (!ScratchExecCopy) report_fatal_error("failed to find free scratch register"); - LiveRegs.addReg(ScratchExecCopy); + LiveUnits.addReg(ScratchExecCopy); const unsigned SaveExecOpc = ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32 @@ -909,7 +913,7 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, void SIFrameLowering::emitCSRSpillStores( MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits, Register FrameReg, Register FramePtrRegScratchCopy) const { SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -924,7 +928,7 @@ void SIFrameLowering::emitCSRSpillStores( FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); if (!WWMScratchRegs.empty()) ScratchExecCopy = - buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, + buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true, /*EnableInactiveLanes*/ true); auto StoreWWMRegisters = @@ -932,7 +936,7 @@ void SIFrameLowering::emitCSRSpillStores( for (const auto &Reg : WWMRegs) { Register VGPR = Reg.first; int FI = Reg.second; - buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, + buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL, VGPR, FI, FrameReg); } }; @@ -943,7 +947,7 @@ void SIFrameLowering::emitCSRSpillStores( unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); } else { - ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, + ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true, /*EnableInactiveLanes*/ false); } @@ -955,7 +959,7 @@ void SIFrameLowering::emitCSRSpillStores( unsigned ExecMov = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) .addReg(ScratchExecCopy, RegState::Kill); - LiveRegs.addReg(ScratchExecCopy); + LiveUnits.addReg(ScratchExecCopy); } Register FramePtrReg = FuncInfo->getFrameOffsetReg(); @@ -971,7 +975,7 @@ void SIFrameLowering::emitCSRSpillStores( continue; PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, - LiveRegs, FrameReg); + LiveUnits, FrameReg); SB.save(); } @@ -986,16 +990,16 @@ void SIFrameLowering::emitCSRSpillStores( MBB.sortUniqueLiveIns(); } - if (!LiveRegs.empty()) { + if (!LiveUnits.empty()) { for (MCPhysReg Reg : ScratchSGPRs) - LiveRegs.addReg(Reg); + LiveUnits.addReg(Reg); } } } void SIFrameLowering::emitCSRSpillRestores( MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits, Register FrameReg, Register FramePtrRegScratchCopy) const { const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -1015,7 +1019,7 @@ void SIFrameLowering::emitCSRSpillRestores( continue; PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, - LiveRegs, FrameReg); + LiveUnits, FrameReg); SB.restore(); } @@ -1027,7 +1031,7 @@ void SIFrameLowering::emitCSRSpillRestores( FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); if (!WWMScratchRegs.empty()) ScratchExecCopy = - buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, + buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ false, /*EnableInactiveLanes*/ true); auto RestoreWWMRegisters = @@ -1035,7 +1039,7 @@ void SIFrameLowering::emitCSRSpillRestores( for (const auto &Reg : WWMRegs) { Register VGPR = Reg.first; int FI = Reg.second; - buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, + buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL, VGPR, FI, FrameReg); } }; @@ -1046,7 +1050,7 @@ void SIFrameLowering::emitCSRSpillRestores( unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); } else { - ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, + ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ false, /*EnableInactiveLanes*/ false); } @@ -1079,13 +1083,25 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, Register FramePtrReg = FuncInfo->getFrameOffsetReg(); Register BasePtrReg = TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); - LivePhysRegs LiveRegs; + LiveRegUnits LiveUnits; MachineBasicBlock::iterator MBBI = MBB.begin(); // DebugLoc must be unknown since the first instruction with DebugLoc is used // to determine the end of the prologue. DebugLoc DL; + if (FuncInfo->isChainFunction()) { + // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but + // are free to set one up if they need it. 
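// [Editor's note: illustrative sketch, not part of this patch.] The immediate
// materialized into the stack pointer just below is the frame size scaled for
// the scratch addressing in use. The scale factor shown here assumes the
// getScratchScaleFactor() helper in this file returns 1 with flat-scratch
// enabled and the wavefront size otherwise; treat that as an assumption, not
// a quote of the patch.
#include <cstdint>

static unsigned scratchScaleFactorModel(bool EnableFlatScratch,
                                        unsigned WavefrontSize) {
  return EnableFlatScratch ? 1u : WavefrontSize;
}

// Initial SP a chain function sets up when it actually needs a stack,
// e.g. a 1 KiB frame on a wave64 target without flat-scratch -> 65536.
static uint64_t initialChainStackPointer(uint64_t StackSizeBytes,
                                         bool EnableFlatScratch,
                                         unsigned WavefrontSize) {
  return StackSizeBytes *
         scratchScaleFactorModel(EnableFlatScratch, WavefrontSize);
}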
+ bool UseSP = requiresStackPointerReference(MF); + if (UseSP) { + assert(StackPtrReg != AMDGPU::SP_REG); + + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg) + .addImm(MFI.getStackSize() * getScratchScaleFactor(ST)); + } + } + bool HasFP = false; bool HasBP = false; uint32_t NumBytes = MFI.getStackSize(); @@ -1097,14 +1113,15 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, Register FramePtrRegScratchCopy; if (!HasFP && !hasFP(MF)) { // Emit the CSR spill stores with SP base register. - emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg, + emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, + FuncInfo->isChainFunction() ? Register() : StackPtrReg, FramePtrRegScratchCopy); } else { // CSR spill stores will use FP as base register. Register SGPRForFPSaveRestoreCopy = FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); + initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); if (SGPRForFPSaveRestoreCopy) { // Copy FP to the scratch register now and emit the CFI entry. It avoids // the extra FP copy needed in the other two cases when FP is spilled to @@ -1112,18 +1129,18 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, PrologEpilogSGPRSpillBuilder SB( FramePtrReg, FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI, - DL, TII, TRI, LiveRegs, FramePtrReg); + DL, TII, TRI, LiveUnits, FramePtrReg); SB.save(); - LiveRegs.addReg(SGPRForFPSaveRestoreCopy); + LiveUnits.addReg(SGPRForFPSaveRestoreCopy); } else { // Copy FP into a new scratch register so that its previous value can be // spilled after setting up the new frame. FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass); + MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass); if (!FramePtrRegScratchCopy) report_fatal_error("failed to find free scratch register"); - LiveRegs.addReg(FramePtrRegScratchCopy); + LiveUnits.addReg(FramePtrRegScratchCopy); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy) .addReg(FramePtrReg); } @@ -1133,9 +1150,9 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, const unsigned Alignment = MFI.getMaxAlign().value(); RoundedSize += Alignment; - if (LiveRegs.empty()) { - LiveRegs.init(TRI); - LiveRegs.addLiveIns(MBB); + if (LiveUnits.empty()) { + LiveUnits.init(TRI); + LiveUnits.addLiveIns(MBB); } // s_add_i32 s33, s32, NumBytes @@ -1158,10 +1175,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, // If FP is used, emit the CSR spills with FP base register. if (HasFP) { - emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg, + emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg, FramePtrRegScratchCopy); if (FramePtrRegScratchCopy) - LiveRegs.removeReg(FramePtrRegScratchCopy); + LiveUnits.removeReg(FramePtrRegScratchCopy); } // If we need a base pointer, set it up here. It's whatever the value of @@ -1210,7 +1227,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - LivePhysRegs LiveRegs; + LiveRegUnits LiveUnits; // Get the insert location for the epilogue. If there were no terminators in // the block, get the last instruction. 
MachineBasicBlock::iterator MBBI = MBB.end(); @@ -1240,19 +1257,19 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP // into a new scratch register and copy to FP later when other registers are // restored from the current stack frame. - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); + initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); if (SGPRForFPSaveRestoreCopy) { - LiveRegs.addReg(SGPRForFPSaveRestoreCopy); + LiveUnits.addReg(SGPRForFPSaveRestoreCopy); } else { FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass); + MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass); if (!FramePtrRegScratchCopy) report_fatal_error("failed to find free scratch register"); - LiveRegs.addReg(FramePtrRegScratchCopy); + LiveUnits.addReg(FramePtrRegScratchCopy); } - emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg, + emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg, FramePtrRegScratchCopy); } @@ -1275,7 +1292,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, MIB.setMIFlag(MachineInstr::FrameDestroy); } else { // Insert the CSR spill restores with SP as the base register. - emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg, + emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg, FramePtrRegScratchCopy); } } @@ -1318,7 +1335,11 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); // Allocate spill slots for WWM reserved VGPRs. - if (!FuncInfo->isEntryFunction()) { + // For chain functions, we only need to do this if we have calls to + // llvm.amdgcn.cs.chain. + bool IsChainWithoutCalls = + FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall(); + if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) { for (Register Reg : FuncInfo->getWWMReservedRegs()) { const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC), @@ -1353,8 +1374,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, TRI->isAGPR(MRI, VReg))) { assert(RS != nullptr); - // FIXME: change to enterBasicBlockEnd() - RS->enterBasicBlock(MBB); + RS->enterBasicBlockEnd(MBB); + RS->backward(std::next(MI.getIterator())); TRI->eliminateFrameIndex(MI, 0, FIOp, RS); SpillFIs.set(FI); continue; @@ -1472,30 +1493,30 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves( SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - LivePhysRegs LiveRegs; - LiveRegs.init(*TRI); + LiveRegUnits LiveUnits; + LiveUnits.init(*TRI); // Initially mark callee saved registers as used so we will not choose them // while looking for scratch SGPRs. 
const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); for (unsigned I = 0; CSRegs[I]; ++I) - LiveRegs.addReg(CSRegs[I]); + LiveUnits.addReg(CSRegs[I]); const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass(); if (NeedExecCopyReservedReg) { Register ReservedReg = MFI->getSGPRForEXECCopy(); assert(ReservedReg && "Should have reserved an SGPR for EXEC copy."); - Register UnusedScratchReg = findUnusedRegister(MRI, LiveRegs, RC); + Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC); if (UnusedScratchReg) { // If found any unused scratch SGPR, reserve the register itself for Exec // copy and there is no need for any spills in that case. MFI->setSGPRForEXECCopy(UnusedScratchReg); - LiveRegs.addReg(UnusedScratchReg); + LiveUnits.addReg(UnusedScratchReg); } else { // Needs spill. assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) && "Re-reserving spill slot for EXEC copy register"); - getVGPRSpillLaneOrTempRegister(MF, LiveRegs, ReservedReg, RC, + getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedReg, RC, /*IncludeScratchCopy=*/false); } } @@ -1516,14 +1537,14 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves( Register FramePtrReg = MFI->getFrameOffsetReg(); assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) && "Re-reserving spill slot for FP"); - getVGPRSpillLaneOrTempRegister(MF, LiveRegs, FramePtrReg); + getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg); } if (TRI->hasBasePointer(MF)) { Register BasePtrReg = TRI->getBaseRegister(); assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) && "Re-reserving spill slot for BP"); - getVGPRSpillLaneOrTempRegister(MF, LiveRegs, BasePtrReg); + getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg); } } @@ -1531,8 +1552,15 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves( void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedVGPRs, RegScavenger *RS) const { - TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // If this is a function with the amdgpu_cs_chain[_preserve] calling + // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then + // we don't need to save and restore anything. + if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall()) + return; + + TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); if (MFI->isEntryFunction()) return; @@ -1551,17 +1579,17 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, // TODO: Handle this elsewhere at an early point. Walking through all MBBs // here would be a bad heuristic. A better way should be by calling // allocateWWMSpill during the regalloc pipeline whenever a physical - // register is allocated for the intended virtual registers. That will - // also help excluding the general use of WRITELANE/READLANE intrinsics - // that won't really need any such special handling. - if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32) + // register is allocated for the intended virtual registers. 
+ if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg()); - else if (MI.getOpcode() == AMDGPU::V_READLANE_B32) + else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR) MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg()); else if (TII->isWWMRegSpillOpcode(MI.getOpcode())) NeedExecCopyReservedReg = true; else if (MI.getOpcode() == AMDGPU::SI_RETURN || - MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { + MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + (MFI->isChainFunction() && + TII->isChainCallOpcode(MI.getOpcode()))) { // We expect all return to be the same size. assert(!ReturnMI || (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) == @@ -1695,6 +1723,7 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const MachineFrameInfo &MFI = MF.getFrameInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); uint64_t EstStackSize = MFI.estimateStackSize(MF); uint64_t MaxOffset = EstStackSize - 1; @@ -1706,12 +1735,11 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( // rather than allocating as close to possible. This could save a lot of space // on frames with alignment requirements. if (ST.enableFlatScratch()) { - const SIInstrInfo *TII = ST.getInstrInfo(); if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch)) return false; } else { - if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset)) + if (TII->isLegalMUBUFImmOffset(MaxOffset)) return false; } @@ -1770,10 +1798,11 @@ static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) { bool SIFrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); - // For entry functions we can use an immediate offset in most cases, so the - // presence of calls doesn't imply we need a distinct frame pointer. + // For entry & chain functions we can use an immediate offset in most cases, + // so the presence of calls doesn't imply we need a distinct frame pointer. if (MFI.hasCalls() && - !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) { + !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() && + !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) { // All offsets are unsigned, so need to be addressed in the same direction // as stack growth. @@ -1793,11 +1822,14 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const { // register. We may need to initialize the stack pointer depending on the frame // properties, which logically overlaps many of the cases where an ordinary // function would require an FP. +// Also used for chain functions. While not technically entry functions, chain +// functions may need to set up a stack pointer in some situations. bool SIFrameLowering::requiresStackPointerReference( const MachineFunction &MF) const { // Callable functions always require a stack pointer reference. 
- assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() && - "only expected to call this for entry points"); + assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() || + MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) && + "only expected to call this for entry points and chain functions"); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h index 0060fc0be431..b3feb759ed81 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -38,11 +38,11 @@ public: bool NeedExecCopyReservedReg) const; void emitCSRSpillStores(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, - LivePhysRegs &LiveRegs, Register FrameReg, + LiveRegUnits &LiveUnits, Register FrameReg, Register FramePtrRegScratchCopy) const; void emitCSRSpillRestores(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, - LivePhysRegs &LiveRegs, Register FrameReg, + LiveRegUnits &LiveUnits, Register FrameReg, Register FramePtrRegScratchCopy) const; bool assignCalleeSavedSpillSlots(MachineFunction &MF, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b7b90e23e895..34826809c1a6 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" @@ -28,6 +29,7 @@ #include "llvm/CodeGen/ByteProvider.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -146,8 +148,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024)); if (Subtarget->has16BitInsts()) { - addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); + if (Subtarget->useRealTrue16Insts()) { + addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass); + addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass); + } else { + addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); + } // Unless there are also VOP3P operations, not operations are really legal. 
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass); @@ -158,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass); addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass); + addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass); + addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass); } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); @@ -219,7 +228,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); - setOperationAction(ISD::FSQRT, MVT::f64, Custom); + setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom); setOperationAction(ISD::SELECT_CC, {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand); @@ -262,13 +271,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : - {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32, - MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, - MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, - MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, - MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16, - MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64, - MVT::v32i32, MVT::v32f32}) { + {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32, + MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, + MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, + MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, + MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16, + MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64, + MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -420,6 +429,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->has16BitInsts()) { setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote); setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom); + } else { + setOperationAction(ISD::FSQRT, MVT::f16, Custom); } if (Subtarget->hasMadMacF32Insts()) @@ -470,9 +481,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, {MVT::f32, MVT::f64}, Legal); if (Subtarget->haveRoundOpsF64()) - setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FRINT}, MVT::f64, Legal); + setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64, + Legal); else - setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR}, + setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR}, MVT::f64, Custom); setOperationAction(ISD::FFLOOR, MVT::f64, Legal); @@ -544,8 +556,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (STI.hasMadF16()) setOperationAction(ISD::FMAD, MVT::f16, Legal); - for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, - MVT::v8f16, MVT::v16i16, MVT::v16f16}) { + for (MVT VT : + {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, + MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16, MVT::v32f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -631,6 +644,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v16f16, Promote); 
AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32); + setOperationAction(ISD::LOAD, MVT::v32i16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32); + setOperationAction(ISD::LOAD, MVT::v32f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32); + + setOperationAction(ISD::STORE, MVT::v32i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32); + setOperationAction(ISD::STORE, MVT::v32f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32); + setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, MVT::v2i32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); @@ -653,12 +676,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal); setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, - {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom); + {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, + Custom); setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, - {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand); + {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, + Expand); - for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) { + for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, + MVT::v32i16, MVT::v32f16}) { setOperationAction( {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR}, Vec16, Custom); @@ -681,10 +707,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16, - MVT::v16f16, MVT::v16i16}, + MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16}, Custom); - for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16}) + for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16}) // Split vector operations. setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB, ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, @@ -692,7 +718,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::SSUBSAT}, VT, Custom); - for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16}) + for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}) // Split vector operations. 
setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE}, VT, Custom); @@ -728,7 +754,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, - MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}, + MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, + MVT::v32i16, MVT::v32f16}, Custom); setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); @@ -736,6 +763,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasMad64_32()) setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); + if (Subtarget->hasPrefetch()) + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + + if (Subtarget->hasIEEEMinMax()) + setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, + {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, MVT::v2i16, MVT::v2f16, MVT::i128}, @@ -753,16 +787,28 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, MVT::i8, MVT::i128}, Custom); + setOperationAction(ISD::STACKSAVE, MVT::Other, Custom); + setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); + + // TODO: Could move this to custom lowering, could benefit from combines on + // extract of relevant bits. + setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal); + + setOperationAction(ISD::MUL, MVT::i1, Promote); + setTargetDAGCombine({ISD::ADD, ISD::UADDO_CARRY, ISD::SUB, ISD::USUBO_CARRY, ISD::FADD, ISD::FSUB, + ISD::FDIV, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, + ISD::FMINIMUM, + ISD::FMAXIMUM, ISD::FMA, ISD::SMIN, ISD::SMAX, @@ -772,6 +818,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::AND, ISD::OR, ISD::XOR, + ISD::FSHR, ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FCANONICALIZE, @@ -1002,12 +1049,20 @@ static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) { MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const { if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160) return MVT::v5i32; + if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS && + DL.getPointerSizeInBits(AS) == 192) + return MVT::v6i32; return AMDGPUTargetLowering::getPointerTy(DL, AS); } /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka /// v8i32 when padding is added. +/// The in-memory representation of a p9 is {p8, i32, i32}, which is +/// also v8i32 with padding. 
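// [Editor's note: illustrative sketch, not part of this patch.] Summary of the
// two pointer-type overrides that follow, as a standalone table: buffer fat
// pointers (p7, 160-bit) and buffer strided pointers (p9, 192-bit) occupy 5
// and 6 dwords in registers, but both are padded out to 8 dwords (v8i32) in
// memory. Names below are illustrative only.
enum class BufferPtrKind { Fat160, Strided192 };

static unsigned pointerRegisterDwords(BufferPtrKind K) {
  return K == BufferPtrKind::Fat160 ? 5 : 6; // v5i32 vs v6i32
}

static unsigned pointerMemoryDwords(BufferPtrKind) {
  return 8; // both are stored as v8i32 once padding is added
}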
MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { - if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160) + if ((AMDGPUAS::BUFFER_FAT_POINTER == AS && + DL.getPointerSizeInBits(AS) == 160) || + (AMDGPUAS::BUFFER_STRIDED_POINTER == AS && + DL.getPointerSizeInBits(AS) == 192)) return MVT::v8i32; return AMDGPUTargetLowering::getPointerMemTy(DL, AS); } @@ -1186,9 +1241,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_global_atomic_fadd: case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_global_atomic_fmin_num: + case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fmin_num: + case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: { Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1271,6 +1330,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fmin_num: + case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: case Intrinsic::amdgcn_global_atomic_csub: { @@ -1284,7 +1345,9 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, } } -bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { +bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM, + unsigned AddrSpace, + uint64_t FlatVariant) const { if (!Subtarget->hasFlatInstOffsets()) { // Flat instructions do not have offsets, and only have the register // address. @@ -1292,29 +1355,27 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { } return AM.Scale == 0 && - (AM.BaseOffs == 0 || - Subtarget->getInstrInfo()->isLegalFLATOffset( - AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, SIInstrFlags::FLAT)); + (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( + AM.BaseOffs, AddrSpace, FlatVariant)); } bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { if (Subtarget->hasFlatGlobalInsts()) - return AM.Scale == 0 && - (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( - AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS, - SIInstrFlags::FlatGlobal)); + return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS, + SIInstrFlags::FlatGlobal); if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { - // Assume the we will use FLAT for all global memory accesses - // on VI. - // FIXME: This assumption is currently wrong. On VI we still use - // MUBUF instructions for the r + i addressing mode. As currently - // implemented, the MUBUF instructions only work on buffer < 4GB. - // It may be possible to support > 4GB buffers with MUBUF instructions, - // by setting the stride value in the resource descriptor which would - // increase the size limit to (stride * 4GB). However, this is risky, - // because it has never been validated. - return isLegalFlatAddressingMode(AM); + // Assume the we will use FLAT for all global memory accesses + // on VI. + // FIXME: This assumption is currently wrong. On VI we still use + // MUBUF instructions for the r + i addressing mode. 
As currently + // implemented, the MUBUF instructions only work on buffer < 4GB. + // It may be possible to support > 4GB buffers with MUBUF instructions, + // by setting the stride value in the resource descriptor which would + // increase the size limit to (stride * 4GB). However, this is risky, + // because it has never been validated. + return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS, + SIInstrFlags::FLAT); } return isLegalMUBUFAddressingMode(AM); @@ -1330,7 +1391,8 @@ bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { // assume those use MUBUF instructions. Scratch loads / stores are currently // implemented as mubuf instructions with offen bit set, so slightly // different than the normal addr64. - if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs)) + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs)) return false; // FIXME: Since we can split immediate into soffset and immediate offset, @@ -1367,7 +1429,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, if (AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || - AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE) { + AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE || + AS == AMDGPUAS::BUFFER_STRIDED_POINTER) { // If the offset isn't a multiple of 4, it probably isn't going to be // correctly aligned. // FIXME: Can we get the real alignment here? @@ -1394,11 +1457,15 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // On VI, these use the SMEM format and the offset is 20-bit in bytes. if (!isUInt<20>(AM.BaseOffs)) return false; - } else { + } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) { // On GFX9 the offset is signed 21-bit in bytes (but must not be negative // for S_BUFFER_* instructions). if (!isInt<21>(AM.BaseOffs)) return false; + } else { + // On GFX12, all offsets are signed 24-bit in bytes. + if (!isInt<24>(AM.BaseOffs)) + return false; } if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. @@ -1411,9 +1478,13 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } if (AS == AMDGPUAS::PRIVATE_ADDRESS) - return isLegalMUBUFAddressingMode(AM); + return Subtarget->enableFlatScratch() + ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS, + SIInstrFlags::FlatScratch) + : isLegalMUBUFAddressingMode(AM); - if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { + if (AS == AMDGPUAS::LOCAL_ADDRESS || + (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) { // Basic, single offset DS instructions allow a 16-bit unsigned immediate // field. // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have @@ -1436,7 +1507,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // computation. We don't have instructions that compute pointers with any // addressing modes, so treat them as having no offset like flat // instructions. - return isLegalFlatAddressingMode(AM); + return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS, + SIInstrFlags::FLAT); } // Assume a user alias of global for unknown address spaces. @@ -1748,13 +1820,13 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, // We may not have the kernarg segment argument if we have no kernel // arguments. 
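// [Editor's note: illustrative sketch, not part of this patch.] The immediate
// offset ranges checked a few lines above for SMEM-style constant-address
// access, written out as plain predicates. Only the three generations visible
// in this hunk are modeled; the enum and helpers are illustrative, not LLVM
// API.
#include <cstdint>

static bool fitsUnsigned(int64_t V, unsigned Bits) {
  return V >= 0 && V < (int64_t(1) << Bits);
}
static bool fitsSigned(int64_t V, unsigned Bits) {
  int64_t Lim = int64_t(1) << (Bits - 1);
  return V >= -Lim && V < Lim;
}

enum class SMEMGen { VolcanicIslands, GFX9UpToGFX12, GFX12 };

static bool isLegalSMEMImmOffsetModel(SMEMGen G, int64_t ByteOffset) {
  switch (G) {
  case SMEMGen::VolcanicIslands: return fitsUnsigned(ByteOffset, 20); // uint20
  case SMEMGen::GFX9UpToGFX12:   return fitsSigned(ByteOffset, 21);   // int21
  case SMEMGen::GFX12:           return fitsSigned(ByteOffset, 24);   // int24
  }
  return false;
}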
if (!InputPtrReg) - return DAG.getConstant(0, SL, PtrVT); + return DAG.getConstant(Offset, SL, PtrVT); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); - return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset)); + return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset)); } SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, @@ -2133,13 +2205,14 @@ void SITargetLowering::allocateSpecialInputSGPRs( const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { auto &ArgInfo = Info.getArgInfo(); + const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); // TODO: Unify handling with private memory pointers. - if (Info.hasDispatchPtr()) + if (UserSGPRInfo.hasDispatchPtr()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); const Module *M = MF.getFunction().getParent(); - if (Info.hasQueuePtr() && + if (UserSGPRInfo.hasQueuePtr() && AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); @@ -2148,7 +2221,7 @@ void SITargetLowering::allocateSpecialInputSGPRs( if (Info.hasImplicitArgPtr()) allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); - if (Info.hasDispatchID()) + if (UserSGPRInfo.hasDispatchID()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchID); // flat_scratch_init is not applicable for non-kernel functions. @@ -2171,34 +2244,35 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { - if (Info.hasImplicitBufferPtr()) { + const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); + if (UserSGPRInfo.hasImplicitBufferPtr()) { Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(ImplicitBufferPtrReg); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
- if (Info.hasPrivateSegmentBuffer()) { + if (UserSGPRInfo.hasPrivateSegmentBuffer()) { Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass); CCInfo.AllocateReg(PrivateSegmentBufferReg); } - if (Info.hasDispatchPtr()) { + if (UserSGPRInfo.hasDispatchPtr()) { Register DispatchPtrReg = Info.addDispatchPtr(TRI); MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchPtrReg); } const Module *M = MF.getFunction().getParent(); - if (Info.hasQueuePtr() && + if (UserSGPRInfo.hasQueuePtr() && AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); } - if (Info.hasKernargSegmentPtr()) { + if (UserSGPRInfo.hasKernargSegmentPtr()) { MachineRegisterInfo &MRI = MF.getRegInfo(); Register InputPtrReg = Info.addKernargSegmentPtr(TRI); CCInfo.AllocateReg(InputPtrReg); @@ -2207,26 +2281,100 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); } - if (Info.hasDispatchID()) { + if (UserSGPRInfo.hasDispatchID()) { Register DispatchIDReg = Info.addDispatchID(TRI); MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchIDReg); } - if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) { + if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) { Register FlatScratchInitReg = Info.addFlatScratchInit(TRI); MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); } + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. +} + +// Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be +// sequential starting from the first argument. +void SITargetLowering::allocatePreloadKernArgSGPRs( + CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, + const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF, + const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { + Function &F = MF.getFunction(); + unsigned LastExplicitArgOffset = + MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset(); + GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo(); + bool InPreloadSequence = true; + unsigned InIdx = 0; + for (auto &Arg : F.args()) { + if (!InPreloadSequence || !Arg.hasInRegAttr()) + break; + + int ArgIdx = Arg.getArgNo(); + // Don't preload non-original args or parts not in the current preload + // sequence. + if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() || + (int)Ins[InIdx].getOrigArgIndex() != ArgIdx)) + break; + + for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() && + (int)Ins[InIdx].getOrigArgIndex() == ArgIdx; + InIdx++) { + assert(ArgLocs[ArgIdx].isMemLoc()); + auto &ArgLoc = ArgLocs[InIdx]; + const Align KernelArgBaseAlign = Align(16); + unsigned ArgOffset = ArgLoc.getLocMemOffset(); + Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset); + unsigned NumAllocSGPRs = + alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32; + + // Arg is preloaded into the previous SGPR. 
+ if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) { + Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back( + Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]); + continue; + } + + unsigned Padding = ArgOffset - LastExplicitArgOffset; + unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; + // Check for free user SGPRs for preloading. + if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ > + SGPRInfo.getNumFreeUserSGPRs()) { + InPreloadSequence = false; + break; + } + + // Preload this argument. + const TargetRegisterClass *RC = + TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32); + SmallVectorImpl<MCRegister> *PreloadRegs = + Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs); + + if (PreloadRegs->size() > 1) + RC = &AMDGPU::SGPR_32RegClass; + for (auto &Reg : *PreloadRegs) { + assert(Reg); + MF.addLiveIn(Reg, RC); + CCInfo.AllocateReg(Reg); + } + + LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset; + } + } +} + +void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { + // Always allocate this last since it is a synthetic preload. if (Info.hasLDSKernelId()) { Register Reg = Info.addLDSKernelId(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } - - // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read - // these from the dispatch pointer. } // Allocate special input registers that are initialized per-wave. @@ -2331,7 +2479,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // Everything live out of a block is spilled with fast regalloc, so it's // almost certain that spilling will be required. - if (TM.getOptLevel() == CodeGenOpt::None) + if (TM.getOptLevel() == CodeGenOptLevel::None) HasStackObjects = true; // For now assume stack access is needed in any callee functions, so we need @@ -2477,12 +2625,14 @@ SDValue SITargetLowering::LowerFormalArguments( bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv); if (IsGraphics) { - assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && - !Info->hasWorkGroupInfo() && !Info->hasLDSKernelId() && - !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && - !Info->hasWorkItemIDZ()); + const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo(); + assert(!UserSGPRInfo.hasDispatchPtr() && + !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() && + !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && + !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); + (void)UserSGPRInfo; if (!Subtarget->enableFlatScratch()) - assert(!Info->hasFlatScratchInit()); + assert(!UserSGPRInfo.hasFlatScratchInit()); if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs()) assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ()); @@ -2531,18 +2681,29 @@ SDValue SITargetLowering::LowerFormalArguments( Splits.append(Ins.begin(), Ins.end()); } + if (IsKernel) + analyzeFormalArgumentsCompute(CCInfo, Ins); + if (IsEntryFunc) { allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); + if (IsKernel && Subtarget->hasKernargPreload() && + !Subtarget->needsKernargPreloadBackwardsCompatibility()) + allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info); + + allocateLDSKernelId(CCInfo, MF, *TRI, *Info); } else if (!IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. 
allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } - if (IsKernel) { - analyzeFormalArgumentsCompute(CCInfo, Ins); - } else { + if (!IsKernel) { CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); + if (!IsGraphics && !Subtarget->enableFlatScratch()) { + CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1, + AMDGPU::SGPR2, AMDGPU::SGPR3}, + 4); + } CCInfo.AnalyzeFormalArguments(Splits, AssignFn); } @@ -2587,9 +2748,81 @@ SDValue SITargetLowering::LowerFormalArguments( continue; } - SDValue Arg = lowerKernargMemParameter( - DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]); - Chains.push_back(Arg.getValue(1)); + SDValue NewArg; + if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) { + if (MemVT.getStoreSize() < 4 && Alignment < 4) { + // In this case the argument is packed into the previous preload SGPR. + int64_t AlignDownOffset = alignDown(Offset, 4); + int64_t OffsetDiff = Offset - AlignDownOffset; + EVT IntVT = MemVT.changeTypeToInteger(); + + const SIMachineFunctionInfo *Info = + MF.getInfo<SIMachineFunctionInfo>(); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + Register Reg = + Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0]; + + assert(Reg); + Register VReg = MRI.getLiveInVirtReg(Reg); + SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); + + SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32); + SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt); + + SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract); + ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal); + NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal, + Ins[i].Flags.isSExt(), &Ins[i]); + + NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL); + } else { + const SIMachineFunctionInfo *Info = + MF.getInfo<SIMachineFunctionInfo>(); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + const SmallVectorImpl<MCRegister> &PreloadRegs = + Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs; + + SDValue Copy; + if (PreloadRegs.size() == 1) { + Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]); + const TargetRegisterClass *RC = MRI.getRegClass(VReg); + NewArg = DAG.getCopyFromReg( + Chain, DL, VReg, + EVT::getIntegerVT(*DAG.getContext(), + TRI->getRegSizeInBits(*RC))); + + } else { + // If the kernarg alignment does not match the alignment of the SGPR + // tuple RC that can accommodate this argument, it will be built up + // via copies from from the individual SGPRs that the argument was + // preloaded to. 
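// [Editor's note: illustrative sketch, not part of this patch.] Scalar model
// of the packed-argument path just above: a preloaded kernel argument narrower
// than a dword that shares its dword with the previous argument is recovered
// from the preloaded 32-bit SGPR by shifting right and truncating, mirroring
// the alignDown/SRL/TRUNCATE sequence in the patch. Plain C++, names
// illustrative only.
#include <cassert>
#include <cstdint>

static uint32_t extractPackedKernArg(uint32_t PreloadedDword,
                                     unsigned ByteOffset,       // kernarg offset
                                     unsigned ArgSizeInBytes) { // 1 or 2 here
  assert(ArgSizeInBytes < 4 && "only sub-dword arguments take this path");
  unsigned AlignDownOffset = ByteOffset & ~3u;          // alignDown(Offset, 4)
  unsigned OffsetDiff = ByteOffset - AlignDownOffset;   // byte position in dword
  uint32_t Shifted = PreloadedDword >> (OffsetDiff * 8);
  uint32_t Mask = (1u << (ArgSizeInBytes * 8)) - 1;     // truncate to MemVT width
  return Shifted & Mask;
}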
+ SmallVector<SDValue, 4> Elts; + for (auto Reg : PreloadRegs) { + Register VReg = MRI.getLiveInVirtReg(Reg); + Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); + Elts.push_back(Copy); + } + NewArg = + DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32, + PreloadRegs.size()), + DL, Elts); + } + + SDValue CMemVT; + if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType())) + CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg); + else + CMemVT = DAG.getBitcast(MemVT, NewArg); + NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT, + Ins[i].Flags.isSExt(), &Ins[i]); + NewArg = DAG.getMergeValues({NewArg, Chain}, DL); + } + } else { + NewArg = + lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, + Alignment, Ins[i].Flags.isSExt(), &Ins[i]); + } + Chains.push_back(NewArg.getValue(1)); auto *ParamTy = dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); @@ -2599,11 +2832,11 @@ SDValue SITargetLowering::LowerFormalArguments( // On SI local pointers are just offsets into LDS, so they are always // less than 16-bits. On CI and newer they could potentially be // real pointers, so we can't guarantee their size. - Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, - DAG.getValueType(MVT::i16)); + NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg, + DAG.getValueType(MVT::i16)); } - InVals.push_back(Arg); + InVals.push_back(NewArg); continue; } else if (!IsEntryFunc && VA.isMemLoc()) { SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); @@ -3084,6 +3317,9 @@ bool SITargetLowering::isEligibleForTailCallOptimization( const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { + if (AMDGPU::isChainCC(CalleeCC)) + return true; + if (!mayTailCallThisCC(CalleeCC)) return false; @@ -3168,7 +3404,36 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { // The wave scratch offset register is used as the global base pointer. SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + CallingConv::ID CallConv = CLI.CallConv; + bool IsChainCallConv = AMDGPU::isChainCC(CallConv); + SelectionDAG &DAG = CLI.DAG; + + TargetLowering::ArgListEntry RequestedExec; + if (IsChainCallConv) { + // The last argument should be the value that we need to put in EXEC. + // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we + // don't treat it like the rest of the arguments. 
+ RequestedExec = CLI.Args.back(); + assert(RequestedExec.Node && "No node for EXEC"); + + if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize())) + return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC"); + + assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg"); + CLI.Outs.pop_back(); + CLI.OutVals.pop_back(); + + if (RequestedExec.Ty->isIntegerTy(64)) { + assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up"); + CLI.Outs.pop_back(); + CLI.OutVals.pop_back(); + } + + assert(CLI.Outs.back().OrigArgIndex != 2 && + "Haven't popped all the pieces of the EXEC mask"); + } + const SDLoc &DL = CLI.DL; SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; SmallVector<SDValue, 32> &OutVals = CLI.OutVals; @@ -3176,7 +3441,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &IsTailCall = CLI.IsTailCall; - CallingConv::ID CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; bool IsSibCall = false; bool IsThisReturn = false; @@ -3207,9 +3471,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, if (IsTailCall) { IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); - if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) { + if (!IsTailCall && + ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) { report_fatal_error("failed to perform tail call elimination on a call " - "site marked musttail"); + "site marked musttail or on llvm.amdgcn.cs.chain"); } bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; @@ -3232,7 +3497,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); - if (CallConv != CallingConv::AMDGPU_Gfx) { + if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) { // With a fixed ABI, allocate fixed registers before user arguments. passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } @@ -3258,16 +3523,20 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - if (!IsSibCall) { + if (!IsSibCall) Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); + if (!IsSibCall || IsChainCallConv) { if (!Subtarget->enableFlatScratch()) { SmallVector<SDValue, 4> CopyFromChains; // In the HSA case, this should be an identity copy. SDValue ScratchRSrcReg = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); - RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); + RegsToPass.emplace_back(IsChainCallConv + ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51 + : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, + ScratchRSrcReg); CopyFromChains.push_back(ScratchRSrcReg.getValue(1)); Chain = DAG.getTokenFactor(DL, CopyFromChains); } @@ -3412,6 +3681,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); } + if (IsChainCallConv) + Ops.push_back(RequestedExec.Node); + // Add argument registers to the end of the list so that they are known live // into the call. for (auto &RegToPass : RegsToPass) { @@ -3420,8 +3692,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } // Add a register mask operand representing the call-preserved registers. 
- - auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); + auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3435,8 +3706,17 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // actual call instruction. if (IsTailCall) { MFI.setHasTailCall(); - unsigned OPC = CallConv == CallingConv::AMDGPU_Gfx ? - AMDGPUISD::TC_RETURN_GFX : AMDGPUISD::TC_RETURN; + unsigned OPC = AMDGPUISD::TC_RETURN; + switch (CallConv) { + case CallingConv::AMDGPU_Gfx: + OPC = AMDGPUISD::TC_RETURN_GFX; + break; + case CallingConv::AMDGPU_CS_Chain: + case CallingConv::AMDGPU_CS_ChainPreserve: + OPC = AMDGPUISD::TC_RETURN_CHAIN; + break; + } + return DAG.getNode(OPC, DL, NodeTys, Ops); } @@ -3481,22 +3761,21 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl( SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue(); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const TargetFrameLowering *TFL = ST.getFrameLowering(); + const TargetFrameLowering *TFL = Subtarget->getFrameLowering(); unsigned Opc = TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ? ISD::ADD : ISD::SUB; SDValue ScaledSize = DAG.getNode( ISD::SHL, dl, VT, Size, - DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32)); + DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); Align StackAlign = TFL->getStackAlign(); Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value if (Alignment && *Alignment > StackAlign) { Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, DAG.getConstant(-(uint64_t)Alignment->value() - << ST.getWavefrontSizeLog2(), + << Subtarget->getWavefrontSizeLog2(), dl, VT)); } @@ -3520,6 +3799,111 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG); } +SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType() != MVT::i32) + return Op; // Defer to cannot select error. + + Register SP = getStackPointerRegisterToSaveRestore(); + SDLoc SL(Op); + + SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32); + + // Convert from wave uniform to swizzled vector address. This should protect + // from any edge cases where the stacksave result isn't directly used with + // stackrestore. + SDValue VectorAddress = + DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP); + return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL); +} + +SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + assert(Op.getValueType() == MVT::i32); + + uint32_t BothRoundHwReg = + AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4); + SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32); + + SDValue IntrinID = + DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32); + SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(), + Op.getOperand(0), IntrinID, GetRoundBothImm); + + // There are two rounding modes, one for f32 and one for f64/f16. We only + // report in the standard value range if both are the same. + // + // The raw values also differ from the expected FLT_ROUNDS values. 
Nearest + // ties away from zero is not supported, and the other values are rotated by + // 1. + // + // If the two rounding modes are not the same, report a target defined value. + + // Mode register rounding mode fields: + // + // [1:0] Single-precision round mode. + // [3:2] Double/Half-precision round mode. + // + // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero. + // + // Hardware Spec + // Toward-0 3 0 + // Nearest Even 0 1 + // +Inf 1 2 + // -Inf 2 3 + // NearestAway0 N/A 4 + // + // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit + // table we can index by the raw hardware mode. + // + // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf + + SDValue BitTable = + DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64); + + SDValue Two = DAG.getConstant(2, SL, MVT::i32); + SDValue RoundModeTimesNumBits = + DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two); + + // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we + // knew only one mode was demanded. + SDValue TableValue = + DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits); + SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue); + + SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32); + SDValue TableEntry = + DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask); + + // There's a gap in the 4-bit encoded table and actual enum values, so offset + // if it's an extended value. + SDValue Four = DAG.getConstant(4, SL, MVT::i32); + SDValue IsStandardValue = + DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT); + SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four); + SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue, + TableEntry, EnumOffset); + + return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL); +} + +SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { + if (Op->isDivergent()) + return SDValue(); + + switch (cast<MemSDNode>(Op)->getAddressSpace()) { + case AMDGPUAS::FLAT_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: + break; + default: + return SDValue(); + } + + return Op; +} + Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = StringSwitch<Register>(RegName) @@ -4217,40 +4601,51 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( } case AMDGPU::S_ADD_U64_PSEUDO: case AMDGPU::S_SUB_U64_PSEUDO: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + // For targets older than GFX12, we emit a sequence of 32-bit operations. + // For GFX12, we emit s_add_u64 and s_sub_u64. 
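// Context for the expansion that follows (illustrative sketch, not part of
// this commit): without a scalar 64-bit add/sub the pseudo is split into a
// 32-bit low-half op plus a carry-consuming high-half op, i.e. the
// S_ADD_U32 / S_ADDC_U32 pair built below. A plain C++ model of that carry
// propagation:
#include <cstdint>

static uint64_t addU64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  uint32_t Lo = ALo + BLo;             // S_ADD_U32 writes the low half...
  uint32_t Carry = Lo < ALo ? 1u : 0u; // ...and sets SCC on unsigned overflow
  uint32_t Hi = AHi + BHi + Carry;     // S_ADDC_U32 folds SCC into the high half
  return (uint64_t(Hi) << 32) | Lo;    // REG_SEQUENCE reassembles the result
}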
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const TargetRegisterClass *BoolRC = TRI->getBoolRC(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); const DebugLoc &DL = MI.getDebugLoc(); - MachineOperand &Dest = MI.getOperand(0); MachineOperand &Src0 = MI.getOperand(1); MachineOperand &Src1 = MI.getOperand(2); - - Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - - MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( - MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); - MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( - MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); - - MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( - MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); - MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( - MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); - bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); - - unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; - unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0); - BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); + if (Subtarget->hasScalarAddSub64()) { + unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64; + BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg()) + .addReg(Src0.getReg()) + .addReg(Src1.getReg()); + } else { + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetRegisterClass *BoolRC = TRI->getBoolRC(); + + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); + MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); + + MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); + MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); + + unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; + unsigned HiOpc = IsAdd ? 
AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) + .add(Src0Sub0) + .add(Src1Sub0); + BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) + .add(Src0Sub1) + .add(Src1Sub1); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + } MI.eraseFromParent(); return BB; } @@ -4463,8 +4858,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( const SIRegisterInfo *TRI = ST.getRegisterInfo(); Register Dst = MI.getOperand(0).getReg(); - Register Src0 = MI.getOperand(1).getReg(); - Register Src1 = MI.getOperand(2).getReg(); + const MachineOperand &Src0 = MI.getOperand(1); + const MachineOperand &Src1 = MI.getOperand(2); const DebugLoc &DL = MI.getDebugLoc(); Register SrcCond = MI.getOperand(3).getReg(); @@ -4473,20 +4868,42 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); Register SrcCondCopy = MRI.createVirtualRegister(CondRC); + const TargetRegisterClass *Src0RC = Src0.isReg() + ? MRI.getRegClass(Src0.getReg()) + : &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *Src1RC = Src1.isReg() + ? MRI.getRegClass(Src1.getReg()) + : &AMDGPU::VReg_64RegClass; + + const TargetRegisterClass *Src0SubRC = + TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); + const TargetRegisterClass *Src1SubRC = + TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); + + MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); + MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); + + MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); + MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy) .addReg(SrcCond); BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo) - .addImm(0) - .addReg(Src0, 0, AMDGPU::sub0) - .addImm(0) - .addReg(Src1, 0, AMDGPU::sub0) - .addReg(SrcCondCopy); + .addImm(0) + .add(Src0Sub0) + .addImm(0) + .add(Src1Sub0) + .addReg(SrcCondCopy); BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) - .addImm(0) - .addReg(Src0, 0, AMDGPU::sub1) - .addImm(0) - .addReg(Src1, 0, AMDGPU::sub1) - .addReg(SrcCondCopy); + .addImm(0) + .add(Src0Sub1) + .addImm(0) + .add(Src1Sub1) + .addReg(SrcCondCopy); BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst) .addReg(DstLo) @@ -4843,7 +5260,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32); + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4866,7 +5283,7 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32); + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4926,10 +5343,14 @@ SDValue 
SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { "Load should return a value and a chain"); return Result; } - case ISD::FSQRT: - if (Op.getValueType() == MVT::f64) + case ISD::FSQRT: { + EVT VT = Op.getValueType(); + if (VT == MVT::f32) + return lowerFSQRTF32(Op, DAG); + if (VT == MVT::f64) return lowerFSQRTF64(Op, DAG); return SDValue(); + } case ISD::FSIN: case ISD::FCOS: return LowerTrig(Op, DAG); @@ -5027,6 +5448,12 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerXMUL_LOHI(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::STACKSAVE: + return LowerSTACKSAVE(Op, DAG); + case ISD::GET_ROUNDING: + return lowerGET_ROUNDING(Op, DAG); + case ISD::PREFETCH: + return lowerPREFETCH(Op, DAG); } return SDValue(); } @@ -5382,6 +5809,12 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); return; } + case ISD::FSQRT: { + if (N->getValueType(0) != MVT::f16) + break; + Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG)); + break; + } default: AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); break; @@ -5433,6 +5866,9 @@ bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { } bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { + if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) + return false; + // FIXME: Either avoid relying on address space here or change the default // address space for functions to avoid the explicit check. return (GV->getValueType()->isFunctionTy() || @@ -5616,7 +6052,8 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, if (IsIEEEMode) return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); - if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16) + if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 || + VT == MVT::v16f16) return splitBinaryVectorOp(Op, DAG); return Op; } @@ -5711,11 +6148,6 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) return lowerTrapEndpgm(Op, DAG); - const Module *M = DAG.getMachineFunction().getFunction().getParent(); - unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M); - if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3) - return lowerTrapHsaQueuePtr(Op, DAG); - return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG); } @@ -5873,7 +6305,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; SDValue Ptr = - DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::Fixed(StructOffset)); + DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset)); // TODO: Use custom target PseudoSourceValue. 
// TODO: We should use the value from the IR intrinsic call, but it might not @@ -6134,7 +6566,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) return Combined; - if (VecSize == 128 || VecSize == 256) { + if (VecSize == 128 || VecSize == 256 || VecSize == 512) { SDValue Lo, Hi; EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT); @@ -6147,9 +6579,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, DAG.getConstant(1, SL, MVT::i32))); - } else { - assert(VecSize == 256); - + } else if (VecSize == 256) { SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec); SDValue Parts[4]; for (unsigned P = 0; P < 4; ++P) { @@ -6161,6 +6591,22 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, Parts[0], Parts[1])); Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, Parts[2], Parts[3])); + } else { + assert(VecSize == 512); + + SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec); + SDValue Parts[8]; + for (unsigned P = 0; P < 8; ++P) { + Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, + DAG.getConstant(P, SL, MVT::i32)); + } + + Lo = DAG.getBitcast(LoVT, + DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, + Parts[0], Parts[1], Parts[2], Parts[3])); + Hi = DAG.getBitcast(HiVT, + DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, + Parts[4], Parts[5],Parts[6], Parts[7])); } EVT IdxVT = Idx.getValueType(); @@ -6326,6 +6772,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, return DAG.getNode(ISD::BITCAST, SL, VT, Blend); } + if (VT == MVT::v32i16 || VT == MVT::v32f16) { + EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), + VT.getVectorNumElements() / 8); + MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); + + SmallVector<SDValue, 8> Parts[8]; + for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) { + for (unsigned P = 0; P < 8; ++P) + Parts[P].push_back(Op.getOperand(I + P * E)); + } + SDValue Casts[8]; + for (unsigned P = 0; P < 8; ++P) { + SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]); + Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec); + } + + SDValue Blend = + DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts); + return DAG.getNode(ISD::BITCAST, SL, VT, Blend); + } + assert(VT == MVT::v2f16 || VT == MVT::v2i16); assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); @@ -6391,24 +6858,12 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, // which is a 64-bit pc-relative offset from the encoding of the $symbol // operand to the global variable. - // - // What we want here is an offset from the value returned by s_getpc - // (which is the address of the s_add_u32 instruction) to the global - // variable, but since the encoding of $symbol starts 4 bytes after the start - // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too - // small. This requires us to add 4 to the global variable offset in order to - // compute the correct address. Similarly for the s_addc_u32 instruction, the - // encoding of $symbol starts 12 bytes after the start of the s_add_u32 - // instruction. 
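// Illustrative sketch (not part of this commit): in contrast to the
// PC-relative path whose removed comment ends above, the PAL/Mesa path added
// just below materializes a global's address from two absolute 32-bit
// relocations (MO_ABS32_LO / MO_ABS32_HI), each loaded with S_MOV_B32 and then
// combined with BUILD_PAIR. Conceptually the pointer is assembled as:
#include <cstdint>

static uint64_t buildAbs64(uint32_t AbsLo, uint32_t AbsHi) {
  // AbsLo/AbsHi stand for the two relocated halves of the symbol's absolute
  // address; BUILD_PAIR forms the i64 from {lo, hi}.
  return (uint64_t(AbsHi) << 32) | AbsLo;
}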
- SDValue PtrLo = - DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags); + SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags); SDValue PtrHi; - if (GAFlags == SIInstrInfo::MO_NONE) { + if (GAFlags == SIInstrInfo::MO_NONE) PtrHi = DAG.getTargetConstant(0, DL, MVT::i32); - } else { - PtrHi = - DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 12, GAFlags + 1); - } + else + PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1); return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi); } @@ -6450,9 +6905,22 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA); } + if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) { + SDValue AddrLo = DAG.getTargetGlobalAddress( + GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO); + AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0}; + + SDValue AddrHi = DAG.getTargetGlobalAddress( + GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI); + AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0}; + + return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi); + } + if (shouldEmitFixup(GV)) return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); - else if (shouldEmitPCReloc(GV)) + + if (shouldEmitPCReloc(GV)) return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT, SIInstrInfo::MO_REL32); @@ -6699,6 +7167,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, unsigned IntrOpcode = Intr->BaseOpcode; bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); + bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); SmallVector<EVT, 3> ResultTypes(Op->values()); SmallVector<EVT, 3> OrigResultTypes(Op->values()); @@ -6718,7 +7187,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (BaseOpcode->Atomic) { VData = Op.getOperand(2); - bool Is64Bit = VData.getValueType() == MVT::i64; + bool Is64Bit = VData.getValueSizeInBits() == 64; if (BaseOpcode->AtomicX2) { SDValue VData2 = Op.getOperand(3); VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL, @@ -6878,9 +7347,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. // - // Partial NSA is allowed on GFX11 where the final register is a contiguous + // Partial NSA is allowed on GFX11+ where the final register is a contiguous // set of the remaining addresses. - const unsigned NSAMaxSize = ST->getNSAMaxSize(); + const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler); const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding(); const bool UseNSA = ST->hasNSAEncoding() && VAddrs.size() >= ST->getNSAThreshold(MF) && @@ -6957,7 +7426,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue(); if (BaseOpcode->Atomic) CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization - if (CPol & ~AMDGPU::CPol::ALL) + if (CPol & ~(IsGFX12Plus ? 
AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12)) return Op; SmallVector<SDValue, 26> Ops; @@ -6977,7 +7446,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op, Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32)); if (IsGFX10Plus) Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32)); - Ops.push_back(Unorm); + if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) + Ops.push_back(Unorm); Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32)); Ops.push_back(IsA16 && // r128, a16 for gfx9 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); @@ -6988,7 +7458,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) { report_fatal_error("TFE is not supported on this GPU"); } - Ops.push_back(LWE); // lwe + if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) + Ops.push_back(LWE); // lwe if (!IsGFX10Plus) Ops.push_back(DimInfo->DA ? True : False); if (BaseOpcode->HasD16) @@ -7000,7 +7471,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32; int Opcode = -1; - if (IsGFX11Plus) { + if (IsGFX12Plus) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12, + NumVDataDwords, NumVAddrDwords); + } else if (IsGFX11Plus) { Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, UseNSA ? AMDGPU::MIMGEncGfx11NSA : AMDGPU::MIMGEncGfx11Default, @@ -7071,7 +7545,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, }; // Widen vec3 load to vec4. - if (VT.isVector() && VT.getVectorNumElements() == 3) { + if (VT.isVector() && VT.getVectorNumElements() == 3 && + !Subtarget->hasScalarDwordx3Loads()) { EVT WidenedVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4); auto WidenedOp = DAG.getMemIntrinsicNode( @@ -7317,7 +7792,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc(Op), MVT::i32); case Intrinsic::amdgcn_s_buffer_load: { unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); - if (CPol & ~AMDGPU::CPol::ALL) + if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12) + ? AMDGPU::CPol::ALL + : AMDGPU::CPol::ALL_pregfx12)) return Op; return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), DAG); @@ -7341,9 +7818,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return emitRemovedIntrinsicError(DAG, DL, VT); } - case Intrinsic::amdgcn_ldexp: - return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(1), Op.getOperand(2)); - case Intrinsic::amdgcn_fract: return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); @@ -7490,6 +7964,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } } +// On targets not supporting constant in soffset field, turn zero to +// SGPR_NULL to avoid generating an extra s_mov with zero. 
+static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, + const GCNSubtarget *Subtarget) { + if (Subtarget->hasRestrictedSOffset()) + if (auto SOffsetConst = dyn_cast<ConstantSDNode>(SOffset)) { + if (SOffsetConst->isZero()) { + return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32); + } + } + return SOffset; +} + SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const { @@ -7498,13 +7985,14 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, SDValue VData = Op.getOperand(2); SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain VData, // vdata Rsrc, // rsrc DAG.getConstant(0, DL, MVT::i32), // vindex Offsets.first, // voffset - Op.getOperand(5), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(6), // cachepolicy DAG.getTargetConstant(0, DL, MVT::i1), // idxen @@ -7531,13 +8019,14 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, SDValue VData = Op.getOperand(2); SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); + auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain VData, // vdata Rsrc, // rsrc Op.getOperand(4), // vindex Offsets.first, // voffset - Op.getOperand(6), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(7), // cachepolicy DAG.getTargetConstant(1, DL, MVT::i1), // idxen @@ -7693,12 +8182,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); + auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain Rsrc, // rsrc DAG.getConstant(0, DL, MVT::i32), // vindex Offsets.first, // voffset - Op.getOperand(4), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(5), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idxen @@ -7717,12 +8207,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain Rsrc, // rsrc Op.getOperand(3), // vindex Offsets.first, // voffset - Op.getOperand(5), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(6), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idxen @@ -7734,21 +8225,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, MemSDNode *M = cast<MemSDNode>(Op); EVT LoadVT = Op.getValueType(); + auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue(); unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue(); unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue(); unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue(); unsigned IdxEn = getIdxEn(Op.getOperand(3)); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Op.getOperand(4), // voffset - Op.getOperand(5), // soffset - Op.getOperand(6), // offset - 
DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // voffset + SOffset, // soffset + Op.getOperand(6), // offset + DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -7764,13 +8256,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, EVT LoadVT = Op.getValueType(); SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); + auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain Rsrc, // rsrc DAG.getConstant(0, DL, MVT::i32), // vindex Offsets.first, // voffset - Op.getOperand(4), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(5), // format Op.getOperand(6), // cachepolicy, swizzled buffer @@ -7790,13 +8283,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, EVT LoadVT = Op.getValueType(); SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain Rsrc, // rsrc Op.getOperand(3), // vindex Offsets.first, // voffset - Op.getOperand(5), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(6), // format Op.getOperand(7), // cachepolicy, swizzled buffer @@ -8009,6 +8503,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: { SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); + auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // src @@ -8016,7 +8511,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Rsrc, // rsrc DAG.getConstant(0, DL, MVT::i32), // vindex Offsets.first, // voffset - Op.getOperand(6), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(7), // cachepolicy DAG.getTargetConstant(0, DL, MVT::i1), // idxen @@ -8031,6 +8526,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: { SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG); + auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // src @@ -8038,7 +8534,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Rsrc, // rsrc Op.getOperand(5), // vindex Offsets.first, // voffset - Op.getOperand(7), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(8), // cachepolicy DAG.getTargetConstant(1, DL, MVT::i1), // idxen @@ -8068,14 +8564,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return SDValue(); } + const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget); const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); + const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; const bool 
Is64 = NodePtr.getValueType() == MVT::i64; const unsigned NumVDataDwords = 4; const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; - const bool UseNSA = - Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize(); + const bool UseNSA = (Subtarget->hasNSAEncoding() && + NumVAddrs <= Subtarget->getNSAMaxSize()) || + IsGFX12Plus; const unsigned BaseOpcodes[2][2] = { {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, @@ -8083,15 +8582,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, int Opcode; if (UseNSA) { Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], - IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA + IsGFX12Plus ? AMDGPU::MIMGEncGfx12 + : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA : AMDGPU::MIMGEncGfx10NSA, NumVDataDwords, NumVAddrDwords); } else { - Opcode = - AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], - IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default - : AMDGPU::MIMGEncGfx10Default, - NumVDataDwords, NumVAddrDwords); + assert(!IsGFX12Plus); + Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], + IsGFX11 ? AMDGPU::MIMGEncGfx11Default + : AMDGPU::MIMGEncGfx10Default, + NumVDataDwords, NumVAddrDwords); } assert(Opcode != -1); @@ -8179,8 +8679,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_global_atomic_fmin_num: + case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin: - case Intrinsic::amdgcn_flat_atomic_fmax: { + case Intrinsic::amdgcn_flat_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fmin_num: + case Intrinsic::amdgcn_flat_atomic_fmax_num: { MemSDNode *M = cast<MemSDNode>(Op); SDValue Ops[] = { M->getOperand(0), // Chain @@ -8190,12 +8694,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, unsigned Opcode = 0; switch (IntrID) { case Intrinsic::amdgcn_global_atomic_fmin: - case Intrinsic::amdgcn_flat_atomic_fmin: { + case Intrinsic::amdgcn_global_atomic_fmin_num: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmin_num: { Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN; break; } case Intrinsic::amdgcn_global_atomic_fmax: - case Intrinsic::amdgcn_flat_atomic_fmax: { + case Intrinsic::amdgcn_global_atomic_fmax_num: + case Intrinsic::amdgcn_flat_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fmax_num: { Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX; break; } @@ -8206,6 +8714,31 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, M->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } + case Intrinsic::amdgcn_s_get_barrier_state: { + SDValue Chain = Op->getOperand(0); + SmallVector<SDValue, 2> Ops; + unsigned Opc; + bool IsInlinableBarID = false; + int64_t BarID; + + if (isa<ConstantSDNode>(Op->getOperand(2))) { + BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID); + } + + if (IsInlinableBarID) { + Opc = AMDGPU::S_GET_BARRIER_STATE_IMM; + SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); + Ops.push_back(K); + } else { + Opc = AMDGPU::S_GET_BARRIER_STATE_M0; + SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2)); + Ops.push_back(M0Val.getValue(0)); + } + + auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); + return SDValue(NewMI, 0); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = @@ 
-8383,13 +8916,29 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0); } case Intrinsic::amdgcn_s_barrier: { - if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) { unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; if (WGSize <= ST.getWavefrontSize()) return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other, Op.getOperand(0)), 0); } + + // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait + if (ST.hasSplitBarriers()) { + SDValue K = + DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32); + SDValue BarSignal = + SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL, + MVT::Other, K, Op.getOperand(0)), + 0); + SDValue BarWait = + SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K, + BarSignal.getValue(0)), + 0); + return BarWait; + } + return SDValue(); }; case Intrinsic::amdgcn_tbuffer_store: { @@ -8429,13 +8978,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, VData = handleD16VData(VData, DAG); SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); + auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); SDValue Ops[] = { Chain, VData, // vdata Rsrc, // rsrc Op.getOperand(4), // vindex Offsets.first, // voffset - Op.getOperand(6), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(7), // format Op.getOperand(8), // cachepolicy, swizzled buffer @@ -8456,13 +9006,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, VData = handleD16VData(VData, DAG); SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); SDValue Ops[] = { Chain, VData, // vdata Rsrc, // rsrc DAG.getConstant(0, DL, MVT::i32), // vindex Offsets.first, // voffset - Op.getOperand(5), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(6), // format Op.getOperand(7), // cachepolicy, swizzled buffer @@ -8536,13 +9087,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); SDValue Ops[] = { Chain, VData, Rsrc, DAG.getConstant(0, DL, MVT::i32), // vindex Offsets.first, // voffset - Op.getOperand(5), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(6), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idxen @@ -8586,13 +9138,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); + auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); SDValue Ops[] = { Chain, VData, Rsrc, Op.getOperand(4), // vindex Offsets.first, // voffset - Op.getOperand(6), // soffset + SOffset, // soffset Offsets.second, // offset Op.getOperand(7), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idxen @@ -8620,8 +9173,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds; unsigned 
OpOffset = HasVIndex ? 1 : 0; SDValue VOffset = Op.getOperand(5 + OpOffset); - auto CVOffset = dyn_cast<ConstantSDNode>(VOffset); - bool HasVOffset = !CVOffset || !CVOffset->isZero(); + bool HasVOffset = !isNullConstant(VOffset); unsigned Size = Op->getConstantOperandVal(4); switch (Size) { @@ -8684,12 +9236,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, auto F = LoadMMO->getFlags() & ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); - LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, - Size, LoadMMO->getBaseAlign()); + LoadMMO = + MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, + LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); - MachineMemOperand *StoreMMO = - MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, - sizeof(int32_t), LoadMMO->getBaseAlign()); + MachineMemOperand *StoreMMO = MF.getMachineMemOperand( + StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), + LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops); DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); @@ -8760,11 +9313,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; auto F = LoadMMO->getFlags() & ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); - LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, - Size, LoadMMO->getBaseAlign()); - MachineMemOperand *StoreMMO = - MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, - sizeof(int32_t), Align(4)); + LoadMMO = + MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, + LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); + MachineMemOperand *StoreMMO = MF.getMachineMemOperand( + StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4), + LoadMMO->getAAInfo()); auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); @@ -8774,7 +9328,76 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::amdgcn_end_cf: return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, Op->getOperand(2), Chain), 0); + case Intrinsic::amdgcn_s_barrier_init: + case Intrinsic::amdgcn_s_barrier_join: + case Intrinsic::amdgcn_s_wakeup_barrier: { + SDValue Chain = Op->getOperand(0); + SmallVector<SDValue, 2> Ops; + SDValue BarOp = Op->getOperand(2); + unsigned Opc; + bool IsInlinableBarID = false; + int64_t BarVal; + + if (isa<ConstantSDNode>(BarOp)) { + BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue(); + IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal); + } + + if (IsInlinableBarID) { + switch (IntrinsicID) { + default: + return SDValue(); + case Intrinsic::amdgcn_s_barrier_init: + Opc = AMDGPU::S_BARRIER_INIT_IMM; + break; + case Intrinsic::amdgcn_s_barrier_join: + Opc = AMDGPU::S_BARRIER_JOIN_IMM; + break; + case Intrinsic::amdgcn_s_wakeup_barrier: + Opc = AMDGPU::S_WAKEUP_BARRIER_IMM; + break; + } + + SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32); + Ops.push_back(K); + } else { + switch (IntrinsicID) { + default: + return SDValue(); + case Intrinsic::amdgcn_s_barrier_init: + Opc = AMDGPU::S_BARRIER_INIT_M0; + break; + case Intrinsic::amdgcn_s_barrier_join: + Opc = AMDGPU::S_BARRIER_JOIN_M0; + break; + case Intrinsic::amdgcn_s_wakeup_barrier: + Opc = AMDGPU::S_WAKEUP_BARRIER_M0; + break; + } + } + + if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) { + SDValue M0Val; + // Member count will be read from M0[16:22] + 
M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3), + DAG.getShiftAmountConstant(16, MVT::i32, DL)); + if (!IsInlinableBarID) { + // If reference to barrier id is not an inline constant then it must be + // referenced with M0[4:0]. Perform an OR with the member count to + // include it in M0. + M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, + Op.getOperand(2), M0Val), + 0); + } + Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); + } else if (!IsInlinableBarID) { + Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0)); + } + + auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); + return SDValue(NewMI, 0); + } default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -8794,7 +9417,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( SDValue Offset, SelectionDAG &DAG) const { SDLoc DL(Offset); - const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); + const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget); SDValue N0 = Offset; ConstantSDNode *C1 = nullptr; @@ -8870,8 +9493,13 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, return; } } + + SDValue SOffsetZero = Subtarget->hasRestrictedSOffset() + ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32) + : DAG.getConstant(0, DL, MVT::i32); + Offsets[0] = CombinedOffset; - Offsets[1] = DAG.getConstant(0, DL, MVT::i32); + Offsets[1] = SOffsetZero; Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32); } @@ -9051,7 +9679,7 @@ static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info) { // TODO: Should check if the address can definitely not access stack. if (Info.isEntryFunction()) - return Info.hasFlatScratchInit(); + return Info.getUserSGPRInfo().hasFlatScratchInit(); return true; } @@ -9129,7 +9757,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) { - if (MemVT.isPow2VectorType()) + if (MemVT.isPow2VectorType() || + (Subtarget->hasScalarDwordx3Loads() && NumElements == 3)) return SDValue(); return WidenOrSplitVectorLoad(Op, DAG); } @@ -9145,7 +9774,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) && Alignment >= Align(4) && NumElements < 32) { - if (MemVT.isPow2VectorType()) + if (MemVT.isPow2VectorType() || + (Subtarget->hasScalarDwordx3Loads() && NumElements == 3)) return SDValue(); return WidenOrSplitVectorLoad(Op, DAG); } @@ -9217,7 +9847,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) + if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 || + VT.getSizeInBits() == 512) return splitTernaryVectorOp(Op, DAG); assert(VT.getSizeInBits() == 64); @@ -9277,11 +9908,6 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP // error seems really high at 2^29 ULP. - - // XXX - do we need afn for this or is arcp sufficent? 
- if (RHS.getOpcode() == ISD::FSQRT) - return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); - // 1.0 / x -> rcp(x) return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); } @@ -9294,8 +9920,8 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, } } - // For f16 require arcp only. - // For f32 require afn+arcp. + // For f16 require afn or arcp. + // For f32 require afn. if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal())) return SDValue(); @@ -9480,28 +10106,44 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); const DenormalMode DenormMode = Info->getMode().FP32Denormals; - const bool HasFP32Denormals = DenormMode == DenormalMode::getIEEE(); + const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE(); + const bool HasDynamicDenormals = + (DenormMode.Input == DenormalMode::Dynamic) || + (DenormMode.Output == DenormalMode::Dynamic); + + SDValue SavedDenormMode; - if (!HasFP32Denormals) { + if (!PreservesDenormals) { // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV // lowering. The chain dependence is insufficient, and we need glue. We do // not need the glue variants in a strictfp function. SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Glue = DAG.getEntryNode(); + if (HasDynamicDenormals) { + SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL, + DAG.getVTList(MVT::i32, MVT::Glue), + {BitField, Glue}); + SavedDenormMode = SDValue(GetReg, 0); + + Glue = DAG.getMergeValues( + {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL); + } + SDNode *EnableDenorm; if (Subtarget->hasDenormModeInst()) { const SDValue EnableDenormValue = getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget); - EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, - DAG.getEntryNode(), EnableDenormValue).getNode(); + EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue, + EnableDenormValue) + .getNode(); } else { const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32); - EnableDenorm = - DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs, - {EnableDenormValue, BitField, DAG.getEntryNode()}); + EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs, + {EnableDenormValue, BitField, Glue}); } SDValue Ops[3] = { @@ -9531,12 +10173,9 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled, Fma3, Flags); - if (!HasFP32Denormals) { - // FIXME: This mishandles dynamic denormal mode. We need to query the - // current mode and restore the original. - + if (!PreservesDenormals) { SDNode *DisableDenorm; - if (Subtarget->hasDenormModeInst()) { + if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) { const SDValue DisableDenormValue = getSPDenormModeValue( FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget); @@ -9544,8 +10183,11 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2)).getNode(); } else { + assert(HasDynamicDenormals == (bool)SavedDenormMode); const SDValue DisableDenormValue = - DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); + HasDynamicDenormals + ? 
SavedDenormMode + : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); DisableDenorm = DAG.getMachineNode( AMDGPU::S_SETREG_B32, SL, MVT::Other, @@ -9754,6 +10396,111 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +// Avoid the full correct expansion for f32 sqrt when promoting from f16. +SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + assert(!Subtarget->has16BitInsts()); + SDNodeFlags Flags = Op->getFlags(); + SDValue Ext = + DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags); + + SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32); + SDValue Sqrt = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags); + + return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt, + DAG.getTargetConstant(0, SL, MVT::i32), Flags); +} + +SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + SDNodeFlags Flags = Op->getFlags(); + MVT VT = Op.getValueType().getSimpleVT(); + const SDValue X = Op.getOperand(0); + + if (allowApproxFunc(DAG, Flags)) { + // Instruction is 1ulp but ignores denormals. + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags); + } + + SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT); + SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT); + + SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT); + + SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags); + + SDValue SqrtX = + DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags); + + SDValue SqrtS; + if (needsDenormHandlingF32(DAG, X, Flags)) { + SDValue SqrtID = + DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32); + SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags); + + SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS); + SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, + DAG.getConstant(-1, DL, MVT::i32)); + SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt); + + SDValue NegSqrtSNextDown = + DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags); + + SDValue SqrtVP = + DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags); + + SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, + DAG.getConstant(1, DL, MVT::i32)); + SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt); + + SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags); + SDValue SqrtVS = + DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags); + + SDValue Zero = DAG.getConstantFP(0.0f, DL, VT); + SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE); + + SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS, + Flags); + + SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT); + SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS, + Flags); + } else { + SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags); + + SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags); + + SDValue Half = DAG.getConstantFP(0.5f, DL, VT); + SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags); + SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags); + + SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags); + SqrtH = 
DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags); + SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags); + + SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags); + SDValue SqrtD = + DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags); + SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags); + } + + SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT); + + SDValue ScaledDown = + DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags); + + SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags); + SDValue IsZeroOrInf = + DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX, + DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32)); + + return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags); +} + SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const { // For double type, the SQRT and RSQ instructions don't have required // precision, we apply Goldschmidt's algorithm to improve the result: @@ -10111,9 +10858,7 @@ SDValue SITargetLowering::splitBinaryBitConstantOp( return SDValue(); } -// Returns true if argument is a boolean value which is not serialized into -// memory or argument and does not require v_cndmask_b32 to be deserialized. -static bool isBoolSGPR(SDValue V) { +bool llvm::isBoolSGPR(SDValue V) { if (V.getValueType() != MVT::i1) return false; switch (V.getOpcode()) { @@ -10427,13 +11172,34 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0, if (Depth >= 6) return std::nullopt; + auto ValueSize = Op.getValueSizeInBits(); + if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32) + return std::nullopt; + switch (Op->getOpcode()) { case ISD::TRUNCATE: { - if (Op->getOperand(0).getScalarValueSizeInBits() != 32) + return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); + } + + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND_INREG: { + SDValue NarrowOp = Op->getOperand(0); + auto NarrowVT = NarrowOp.getValueType(); + if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) { + auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); + NarrowVT = VTSign->getVT(); + } + if (!NarrowVT.isByteSized()) + return std::nullopt; + uint64_t NarrowByteWidth = NarrowVT.getStoreSize(); + + if (SrcIndex >= NarrowByteWidth) return std::nullopt; return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); } + case ISD::SRA: case ISD::SRL: { auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); if (!ShiftOp) @@ -10450,9 +11216,6 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0, } default: { - if (Op.getScalarValueSizeInBits() != 32) - return std::nullopt; - return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex); } } @@ -10476,7 +11239,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, unsigned BitWidth = Op.getScalarValueSizeInBits(); if (BitWidth % 8 != 0) return std::nullopt; - assert(Index < BitWidth / 8 && "invalid index requested"); + if (Index > BitWidth / 8 - 1) + return std::nullopt; switch (Op.getOpcode()) { case ISD::OR: { @@ -10519,6 +11283,31 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, return calculateSrcByte(Op->getOperand(0), StartingIndex, Index); } + case ISD::FSHR: { + // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW)) + auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2)); + if (!ShiftOp || Op.getValueType().isVector()) + return std::nullopt; + + uint64_t BitsProvided = 
Op.getValueSizeInBits(); + if (BitsProvided % 8 != 0) + return std::nullopt; + + uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided); + if (BitShift % 8) + return std::nullopt; + + uint64_t ConcatSizeInBytes = BitsProvided / 4; + uint64_t ByteShift = BitShift / 8; + + uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes; + uint64_t BytesProvided = BitsProvided / 8; + SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1); + NewIndex %= BytesProvided; + return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex); + } + + case ISD::SRA: case ISD::SRL: { auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); if (!ShiftOp) @@ -10565,9 +11354,18 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, } case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: { + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND_INREG: + case ISD::AssertZext: + case ISD::AssertSext: { SDValue NarrowOp = Op->getOperand(0); - unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); + unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits(); + if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG || + Op->getOpcode() == ISD::AssertZext || + Op->getOpcode() == ISD::AssertSext) { + auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); + NarrowBitWidth = VTSign->getVT().getSizeInBits(); + } if (NarrowBitWidth % 8 != 0) return std::nullopt; uint64_t NarrowByteWidth = NarrowBitWidth / 8; @@ -10581,10 +11379,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, } case ISD::TRUNCATE: { - unsigned NarrowBitWidth = Op.getScalarValueSizeInBits(); - if (NarrowBitWidth % 8 != 0) - return std::nullopt; - uint64_t NarrowByteWidth = NarrowBitWidth / 8; + uint64_t NarrowByteWidth = BitWidth / 8; if (NarrowByteWidth >= Index) { return calculateByteProvider(Op.getOperand(0), Index, Depth + 1, @@ -10594,8 +11389,16 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, return std::nullopt; } + case ISD::CopyFromReg: { + if (BitWidth / 8 > Index) + return calculateSrcByte(Op, StartingIndex, Index); + + return std::nullopt; + } + case ISD::LOAD: { auto L = cast<LoadSDNode>(Op.getNode()); + unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); if (NarrowBitWidth % 8 != 0) return std::nullopt; @@ -10621,6 +11424,41 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, case ISD::BSWAP: return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1, Depth + 1, StartingIndex); + + case ISD::EXTRACT_VECTOR_ELT: { + auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!IdxOp) + return std::nullopt; + auto VecIdx = IdxOp->getZExtValue(); + auto ScalarSize = Op.getScalarValueSizeInBits(); + if (ScalarSize != 32) { + if ((VecIdx + 1) * ScalarSize > 32) + return std::nullopt; + Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index; + } + + return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0), + StartingIndex, Index); + } + + case AMDGPUISD::PERM: { + auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2)); + if (!PermMask) + return std::nullopt; + + auto IdxMask = + (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8); + if (IdxMask > 0x07 && IdxMask != 0x0c) + return std::nullopt; + + auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1); + auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask; + + return IdxMask != 0x0c ? 
calculateSrcByte(NextOp, StartingIndex, NextIndex) + : ByteProvider<SDValue>( + ByteProvider<SDValue>::getConstantZero()); + } + default: { return std::nullopt; } @@ -10630,7 +11468,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, } // Returns true if the Operand is a scalar and is 16 bits -static bool is16BitScalarOp(SDValue &Operand) { +static bool isExtendedFrom16Bits(SDValue &Operand) { + switch (Operand.getOpcode()) { case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: @@ -10646,7 +11485,7 @@ static bool is16BitScalarOp(SDValue &Operand) { auto MemVT = L->getMemoryVT(); return !MemVT.isVector() && MemVT.getSizeInBits() == 16; } - return false; + return L->getMemoryVT().getSizeInBits() == 16; } default: return false; @@ -10674,29 +11513,118 @@ static bool addresses16Bits(int Mask) { // Do not lower into v_perm if the operands are actually 16 bit // and the selected bits (based on PermMask) correspond with two // easily addressable 16 bit operands. -static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op, +static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp) { int Low16 = PermMask & 0xffff; int Hi16 = (PermMask & 0xffff0000) >> 16; - // ByteProvider only accepts 32 bit operands - assert(Op.getValueType().getSizeInBits() == 32); - assert(OtherOp.getValueType().getSizeInBits() == 32); + assert(Op.getValueType().isByteSized()); + assert(OtherOp.getValueType().isByteSized()); - auto OpIs16Bit = is16BitScalarOp(Op); - auto OtherOpIs16Bit = is16BitScalarOp(Op); + auto TempOp = peekThroughBitcasts(Op); + auto TempOtherOp = peekThroughBitcasts(OtherOp); - // If there is a size mismatch, then we must use masking on at least one - // operand - if (OpIs16Bit != OtherOpIs16Bit) + auto OpIs16Bit = + TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp); + if (!OpIs16Bit) return true; - // If both operands are 16 bit, return whether or not we cleanly address both - if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp)) - return !addresses16Bits(Low16) || !addresses16Bits(Hi16); + auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 || + isExtendedFrom16Bits(TempOtherOp); + if (!OtherOpIs16Bit) + return true; - // Both are 32 bit operands - return true; + // Do we cleanly address both + return !addresses16Bits(Low16) || !addresses16Bits(Hi16); +} + +static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (VT != MVT::i32) + return SDValue(); + + // VT is known to be MVT::i32, so we need to provide 4 bytes. 
+ SmallVector<ByteProvider<SDValue>, 8> PermNodes; + for (int i = 0; i < 4; i++) { + // Find the ByteProvider that provides the ith byte of the result of OR + std::optional<ByteProvider<SDValue>> P = + calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i); + // TODO support constantZero + if (!P || P->isConstantZero()) + return SDValue(); + + PermNodes.push_back(*P); + } + if (PermNodes.size() != 4) + return SDValue(); + + int FirstSrc = 0; + std::optional<int> SecondSrc; + uint64_t PermMask = 0x00000000; + for (size_t i = 0; i < PermNodes.size(); i++) { + auto PermOp = PermNodes[i]; + // Since the mask is applied to Src1:Src2, Src1 bytes must be offset + // by sizeof(Src2) = 4 + int SrcByteAdjust = 4; + + if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) { + if (SecondSrc.has_value()) + if (!PermOp.hasSameSrc(PermNodes[*SecondSrc])) + return SDValue(); + + // Set the index of the second distinct Src node + SecondSrc = i; + assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8)); + SrcByteAdjust = 0; + } + assert(PermOp.SrcOffset + SrcByteAdjust < 8); + assert(!DAG.getDataLayout().isBigEndian()); + PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8); + } + + SDValue Op = *PermNodes[FirstSrc].Src; + SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src + : *PermNodes[FirstSrc].Src; + + // Check that we haven't just recreated the same FSHR node. + if (N->getOpcode() == ISD::FSHR && + (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) && + (N->getOperand(1) == Op || N->getOperand(1) == OtherOp)) + return SDValue(); + + // Check that we are not just extracting the bytes in order from an op + if (Op == OtherOp && Op.getValueSizeInBits() == 32) { + int Low16 = PermMask & 0xffff; + int Hi16 = (PermMask & 0xffff0000) >> 16; + + bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100); + bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302); + + // The perm op would really just produce Op. So combine into Op + if (WellFormedLow && WellFormedHi) + return DAG.getBitcast(MVT::getIntegerVT(32), Op); + } + + if (hasNon16BitAccesses(PermMask, Op, OtherOp)) { + SDLoc DL(N); + assert(Op.getValueType().isByteSized() && + OtherOp.getValueType().isByteSized()); + + // If the ultimate src is less than 32 bits, then we will only be + // using bytes 0: Op.getValueSizeInBytes() - 1 in the or. + // CalculateByteProvider would not have returned Op as source if we + // used a byte that is outside its ValueType. Thus, we are free to + // ANY_EXTEND as the extended bits are dont-cares. + Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32); + OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32); + + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp, + DAG.getConstant(PermMask, DL, MVT::i32)); + } + + return SDValue(); } SDValue SITargetLowering::performOrCombine(SDNode *N, @@ -10812,69 +11740,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, } } if (LHSMask == ~0u || RHSMask == ~0u) { - SmallVector<ByteProvider<SDValue>, 8> PermNodes; - - // VT is known to be MVT::i32, so we need to provide 4 bytes. 
- assert(VT == MVT::i32); - for (int i = 0; i < 4; i++) { - // Find the ByteProvider that provides the ith byte of the result of OR - std::optional<ByteProvider<SDValue>> P = - calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i); - // TODO support constantZero - if (!P || P->isConstantZero()) - return SDValue(); - - PermNodes.push_back(*P); - } - if (PermNodes.size() != 4) - return SDValue(); - - int FirstSrc = 0; - std::optional<int> SecondSrc; - uint64_t permMask = 0x00000000; - for (size_t i = 0; i < PermNodes.size(); i++) { - auto PermOp = PermNodes[i]; - // Since the mask is applied to Src1:Src2, Src1 bytes must be offset - // by sizeof(Src2) = 4 - int SrcByteAdjust = 4; - - if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) { - if (SecondSrc.has_value()) - if (!PermOp.hasSameSrc(PermNodes[*SecondSrc])) - return SDValue(); - // Set the index of the second distinct Src node - SecondSrc = i; - assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() == - 32); - SrcByteAdjust = 0; - } - assert(PermOp.SrcOffset + SrcByteAdjust < 8); - assert(!DAG.getDataLayout().isBigEndian()); - permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8); - } - - SDValue Op = *PermNodes[FirstSrc].Src; - SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src - : *PermNodes[FirstSrc].Src; - - // Check that we are not just extracting the bytes in order from an op - if (Op == OtherOp) { - int Low16 = permMask & 0xffff; - int Hi16 = (permMask & 0xffff0000) >> 16; - - bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100); - bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302); - - // The perm op would really just produce Op. So combine into Op - if (WellFormedLow && WellFormedHi) - return Op; - } - - if (hasEightBitAccesses(permMask, Op, OtherOp)) { - SDLoc DL(N); - return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp, - DAG.getConstant(permMask, DL, MVT::i32)); - } + if (SDValue Perm = matchPERM(N, DCI)) + return Perm; } } @@ -11021,10 +11888,8 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, SDValue Mask = N->getOperand(1); // fp_class x, 0 -> false - if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { - if (CMask->isZero()) - return DAG.getConstant(0, SDLoc(N), MVT::i1); - } + if (isNullConstant(Mask)) + return DAG.getConstant(0, SDLoc(N), MVT::i1); if (N->getOperand(0).isUndef()) return DAG.getUNDEF(MVT::i1); @@ -11049,7 +11914,9 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N, N->getFlags()); } - if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) { + // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here. + if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) && + N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) { return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0), N->getFlags()); } @@ -11131,10 +11998,14 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, case ISD::FMAXNUM: case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: + case ISD::FMINIMUM: + case ISD::FMAXIMUM: case AMDGPUISD::CLAMP: case AMDGPUISD::FMED3: case AMDGPUISD::FMAX3: - case AMDGPUISD::FMIN3: { + case AMDGPUISD::FMIN3: + case AMDGPUISD::FMAXIMUM3: + case AMDGPUISD::FMINIMUM3: { // FIXME: Shouldn't treat the generic operations different based these. // However, we aren't really required to flush the result from // minnum/maxnum.. 
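A later hunk in this file routes divergent i32 ISD::FSHR nodes into the same matchPERM helper. That is sound because a funnel shift right by a byte multiple just re-reads bytes out of the hi:lo concatenation, which is exactly what a single byte-permute can express. A quick standalone check of that identity (illustrative only, arbitrary values, helper names invented here):

#include <cstdint>
#include <cstdio>

// fshr(hi, lo, amt) on i32: take the low 32 bits of (hi:lo) >> (amt mod 32).
static uint32_t fshr32(uint32_t Hi, uint32_t Lo, unsigned Amt) {
  uint64_t Concat = (uint64_t(Hi) << 32) | Lo;
  return uint32_t(Concat >> (Amt & 31));
}

// Byte Idx of the 64-bit concatenation hi:lo (bytes 0-3 are lo, 4-7 are hi).
static uint8_t concatByte(uint32_t Hi, uint32_t Lo, unsigned Idx) {
  uint64_t Concat = (uint64_t(Hi) << 32) | Lo;
  return uint8_t(Concat >> (8 * Idx));
}

int main() {
  uint32_t Hi = 0xAABBCCDD, Lo = 0x11223344;
  for (unsigned K = 0; K < 4; ++K) {
    uint32_t Ref = fshr32(Hi, Lo, 8 * K);
    // Rebuild the same value purely by byte selection across hi:lo.
    uint32_t ByBytes = 0;
    for (unsigned I = 0; I < 4; ++I)
      ByBytes |= uint32_t(concatByte(Hi, Lo, I + K)) << (8 * I);
    printf("k=%u  fshr=0x%08x  byte-select=0x%08x\n", K, Ref, ByBytes);
  }
  return 0;
}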
@@ -11288,7 +12159,9 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, case AMDGPU::G_FMINNUM: case AMDGPU::G_FMAXNUM: case AMDGPU::G_FMINNUM_IEEE: - case AMDGPU::G_FMAXNUM_IEEE: { + case AMDGPU::G_FMAXNUM_IEEE: + case AMDGPU::G_FMINIMUM: + case AMDGPU::G_FMAXIMUM: { if (Subtarget->supportsMinMaxDenormModes() || // FIXME: denormalsEnabledForType is broken for dynamic denormalsEnabledForType(MRI.getType(Reg), MF)) @@ -11302,7 +12175,8 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, return false; return true; case AMDGPU::G_INTRINSIC: - switch (MI->getIntrinsicID()) { + case AMDGPU::G_INTRINSIC_CONVERGENT: + switch (cast<GIntrinsic>(MI)->getIntrinsicID()) { case Intrinsic::amdgcn_fmul_legacy: case Intrinsic::amdgcn_fmad_ftz: case Intrinsic::amdgcn_sqrt: @@ -11321,7 +12195,6 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, case Intrinsic::amdgcn_div_fmas: case Intrinsic::amdgcn_div_fixup: case Intrinsic::amdgcn_fract: - case Intrinsic::amdgcn_ldexp: case Intrinsic::amdgcn_cvt_pkrtz: case Intrinsic::amdgcn_cubeid: case Intrinsic::amdgcn_cubema: @@ -11476,6 +12349,8 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { case ISD::FMAXNUM: case ISD::FMAXNUM_IEEE: return AMDGPUISD::FMAX3; + case ISD::FMAXIMUM: + return AMDGPUISD::FMAXIMUM3; case ISD::SMAX: return AMDGPUISD::SMAX3; case ISD::UMAX: @@ -11483,6 +12358,8 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { case ISD::FMINNUM: case ISD::FMINNUM_IEEE: return AMDGPUISD::FMIN3; + case ISD::FMINIMUM: + return AMDGPUISD::FMINIMUM3; case ISD::SMIN: return AMDGPUISD::SMIN3; case ISD::UMIN: @@ -11842,7 +12719,9 @@ SDValue SITargetLowering::performExtractVectorEltCombine( case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::FMAXNUM_IEEE: - case ISD::FMINNUM_IEEE: { + case ISD::FMINNUM_IEEE: + case ISD::FMAXIMUM: + case ISD::FMINIMUM: { SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx); SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, @@ -12203,6 +13082,256 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, return Accum; } +// Collect the ultimate src of each of the mul node's operands, and confirm +// each operand is 8 bytes. +static std::optional<ByteProvider<SDValue>> +handleMulOperand(const SDValue &MulOperand) { + auto Byte0 = calculateByteProvider(MulOperand, 0, 0); + if (!Byte0 || Byte0->isConstantZero()) { + return std::nullopt; + } + auto Byte1 = calculateByteProvider(MulOperand, 1, 0); + if (Byte1 && !Byte1->isConstantZero()) { + return std::nullopt; + } + return Byte0; +} + +static unsigned addPermMasks(unsigned First, unsigned Second) { + unsigned FirstCs = First & 0x0c0c0c0c; + unsigned SecondCs = Second & 0x0c0c0c0c; + unsigned FirstNoCs = First & ~0x0c0c0c0c; + unsigned SecondNoCs = Second & ~0x0c0c0c0c; + + assert((FirstCs & 0xFF) | (SecondCs & 0xFF)); + assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00)); + assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000)); + assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000)); + + return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs); +} + +static void placeSources(ByteProvider<SDValue> &Src0, + ByteProvider<SDValue> &Src1, + SmallVectorImpl<std::pair<SDValue, unsigned>> &Src0s, + SmallVectorImpl<std::pair<SDValue, unsigned>> &Src1s, + int Step) { + + assert(Src0.Src.has_value() && Src1.Src.has_value()); + // Src0s and Src1s are empty, just place arbitrarily. 
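The dot4 source tracking above and below uses 0x0c as an "unused byte" marker inside the per-source perm masks, and addPermMasks overlays two such masks so that 0x0c survives only where both inputs left the byte unused. A small standalone check of that behaviour, assuming each byte position carries a real selector (0-7) in at most one of the two masks (illustrative only, not taken from the patch):

#include <cstdio>

// Same formula as addPermMasks above: keep 0x0c only where both masks carry
// it, otherwise take the real selector bits contributed by either side.
static unsigned mergePermMasks(unsigned First, unsigned Second) {
  unsigned FirstCs = First & 0x0c0c0c0c;
  unsigned SecondCs = Second & 0x0c0c0c0c;
  unsigned FirstNoCs = First & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}

int main() {
  // One mask defines byte 3 (selector 0x05), the other byte 2 (selector 0x04);
  // the merge carries both and leaves bytes 1-0 as the 0x0c "unused" marker.
  printf("0x%08x\n", mergePermMasks(0x050c0c0c, 0x0c040c0c)); // 0x05040c0c
  return 0;
}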
+ if (Step == 0) { + Src0s.push_back({*Src0.Src, (Src0.SrcOffset << 24) + 0x0c0c0c}); + Src1s.push_back({*Src1.Src, (Src1.SrcOffset << 24) + 0x0c0c0c}); + return; + } + + for (int BPI = 0; BPI < 2; BPI++) { + std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1}; + if (BPI == 1) { + BPP = {Src1, Src0}; + } + unsigned ZeroMask = 0x0c0c0c0c; + unsigned FMask = 0xFF << (8 * (3 - Step)); + + unsigned FirstMask = + BPP.first.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask); + unsigned SecondMask = + BPP.second.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask); + // Attempt to find Src vector which contains our SDValue, if so, add our + // perm mask to the existing one. If we are unable to find a match for the + // first SDValue, attempt to find match for the second. + int FirstGroup = -1; + for (int I = 0; I < 2; I++) { + SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs = + I == 0 ? Src0s : Src1s; + auto MatchesFirst = [&BPP](std::pair<SDValue, unsigned> IterElt) { + return IterElt.first == *BPP.first.Src; + }; + + auto Match = llvm::find_if(Srcs, MatchesFirst); + if (Match != Srcs.end()) { + Match->second = addPermMasks(FirstMask, Match->second); + FirstGroup = I; + break; + } + } + if (FirstGroup != -1) { + SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs = + FirstGroup == 1 ? Src0s : Src1s; + auto MatchesSecond = [&BPP](std::pair<SDValue, unsigned> IterElt) { + return IterElt.first == *BPP.second.Src; + }; + auto Match = llvm::find_if(Srcs, MatchesSecond); + if (Match != Srcs.end()) { + Match->second = addPermMasks(SecondMask, Match->second); + } else + Srcs.push_back({*BPP.second.Src, SecondMask}); + return; + } + } + + // If we have made it here, then we could not find a match in Src0s or Src1s + // for either Src0 or Src1, so just place them arbitrarily. + + unsigned ZeroMask = 0x0c0c0c0c; + unsigned FMask = 0xFF << (8 * (3 - Step)); + + Src0s.push_back( + {*Src0.Src, (Src0.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))}); + Src1s.push_back( + {*Src1.Src, (Src1.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))}); + + return; +} + +static SDValue +resolveSources(SelectionDAG &DAG, SDLoc SL, + SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs, + bool IsSigned, bool IsAny) { + + // If we just have one source, just permute it accordingly. + if (Srcs.size() == 1) { + auto Elt = Srcs.begin(); + auto EltVal = DAG.getBitcastedAnyExtOrTrunc(Elt->first, SL, MVT::i32); + + // v_perm will produce the original value. + if (Elt->second == 0x3020100) + return EltVal; + + return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal, + DAG.getConstant(Elt->second, SL, MVT::i32)); + } + + auto FirstElt = Srcs.begin(); + auto SecondElt = std::next(FirstElt); + + SmallVector<SDValue, 2> Perms; + + // If we have multiple sources in the chain, combine them via perms (using + // calculated perm mask) and Ors. + while (true) { + auto FirstMask = FirstElt->second; + auto SecondMask = SecondElt->second; + + unsigned FirstCs = FirstMask & 0x0c0c0c0c; + unsigned FirstPlusFour = FirstMask | 0x04040404; + // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any + // original 0x0C. 
+ FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs; + + auto PermMask = addPermMasks(FirstMask, SecondMask); + auto FirstVal = + DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32); + auto SecondVal = + DAG.getBitcastedAnyExtOrTrunc(SecondElt->first, SL, MVT::i32); + + Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal, + SecondVal, + DAG.getConstant(PermMask, SL, MVT::i32))); + + FirstElt = std::next(SecondElt); + if (FirstElt == Srcs.end()) + break; + + SecondElt = std::next(FirstElt); + // If we only have a FirstElt, then just combine that into the cumulative + // source node. + if (SecondElt == Srcs.end()) { + auto EltVal = + DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32); + + Perms.push_back( + DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal, + DAG.getConstant(FirstElt->second, SL, MVT::i32))); + break; + } + } + + assert(Perms.size() == 1 || Perms.size() == 2); + return Perms.size() == 2 + ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1]) + : Perms[0]; +} + +static void fixMasks(SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs, + unsigned ChainLength) { + for (auto &[EntryVal, EntryMask] : Srcs) { + EntryMask = EntryMask >> ((4 - ChainLength) * 8); + auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000; + EntryMask += ZeroMask; + } +} + +static bool isMul(const SDValue Op) { + auto Opcode = Op.getOpcode(); + + return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 || + Opcode == AMDGPUISD::MUL_I24); +} + +static std::optional<bool> +checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0, + ByteProvider<SDValue> &Src1, const SDValue &S0Op, + const SDValue &S1Op, const SelectionDAG &DAG) { + // If we both ops are i8s (pre legalize-dag), then the signedness semantics + // of the dot4 is irrelevant. + if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8) + return false; + + auto Known0 = DAG.computeKnownBits(S0Op, 0); + bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0; + bool S0IsSigned = Known0.countMinLeadingOnes() > 0; + auto Known1 = DAG.computeKnownBits(S1Op, 0); + bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0; + bool S1IsSigned = Known1.countMinLeadingOnes() > 0; + + assert(!(S0IsUnsigned && S0IsSigned)); + assert(!(S1IsUnsigned && S1IsSigned)); + + // There are 9 possible permutations of + // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned} + + // In two permutations, the sign bits are known to be the same for both Ops, + // so simply return Signed / Unsigned corresponding to the MSB + + if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned)) + return S0IsSigned; + + // In another two permutations, the sign bits are known to be opposite. In + // this case return std::nullopt to indicate a bad match. + + if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned)) + return std::nullopt; + + // In the remaining five permutations, we don't know the value of the sign + // bit for at least one Op. Since we have a valid ByteProvider, we know that + // the upper bits must be extension bits. Thus, the only ways for the sign + // bit to be unknown is if it was sign extended from unknown value, or if it + // was any extended. In either case, it is correct to use the signed + // version of the signedness semantics of dot4 + + // In two of such permutations, we known the sign bit is set for + // one op, and the other is unknown. It is okay to used signed version of + // dot4. 
+ if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) || + ((S1IsSigned && !(S0IsSigned || S0IsUnsigned)))) + return true; + + // In one such permutation, we don't know either of the sign bits. It is okay + // to used the signed version of dot4. + if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned))) + return true; + + // In two of such permutations, we known the sign bit is unset for + // one op, and the other is unknown. Return std::nullopt to indicate a + // bad match. + if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) || + ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned)))) + return std::nullopt; + + llvm_unreachable("Fully covered condition"); +} + SDValue SITargetLowering::performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -12216,14 +13345,146 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, if (SDValue Folded = tryFoldToMad64_32(N, DCI)) return Folded; } - - return SDValue(); } if (SDValue V = reassociateScalarOps(N, DAG)) { return V; } + if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() && + (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) { + SDValue TempNode(N, 0); + std::optional<bool> IsSigned; + SmallVector<std::pair<SDValue, unsigned>, 4> Src0s; + SmallVector<std::pair<SDValue, unsigned>, 4> Src1s; + SmallVector<SDValue, 4> Src2s; + + // Match the v_dot4 tree, while collecting src nodes. + int ChainLength = 0; + for (int I = 0; I < 4; I++) { + auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1; + if (MulIdx == -1) + break; + auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0)); + if (!Src0) + break; + auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1)); + if (!Src1) + break; + + auto IterIsSigned = checkDot4MulSignedness( + TempNode->getOperand(MulIdx), *Src0, *Src1, + TempNode->getOperand(MulIdx)->getOperand(0), + TempNode->getOperand(MulIdx)->getOperand(1), DAG); + if (!IterIsSigned) + break; + if (!IsSigned) + IsSigned = *IterIsSigned; + if (*IterIsSigned != *IsSigned) + break; + placeSources(*Src0, *Src1, Src0s, Src1s, I); + auto AddIdx = 1 - MulIdx; + // Allow the special case where add (add (mul24, 0), mul24) became -> + // add (mul24, mul24). + if (I == 2 && isMul(TempNode->getOperand(AddIdx))) { + Src2s.push_back(TempNode->getOperand(AddIdx)); + auto Src0 = + handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0)); + if (!Src0) + break; + auto Src1 = + handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1)); + if (!Src1) + break; + auto IterIsSigned = checkDot4MulSignedness( + TempNode->getOperand(AddIdx), *Src0, *Src1, + TempNode->getOperand(AddIdx)->getOperand(0), + TempNode->getOperand(AddIdx)->getOperand(1), DAG); + if (!IterIsSigned) + break; + assert(IsSigned); + if (*IterIsSigned != *IsSigned) + break; + placeSources(*Src0, *Src1, Src0s, Src1s, I + 1); + Src2s.push_back(DAG.getConstant(0, SL, MVT::i32)); + ChainLength = I + 2; + break; + } + + TempNode = TempNode->getOperand(AddIdx); + Src2s.push_back(TempNode); + ChainLength = I + 1; + if (TempNode->getNumOperands() < 2) + break; + LHS = TempNode->getOperand(0); + RHS = TempNode->getOperand(1); + } + + if (ChainLength < 2) + return SDValue(); + + // Masks were constructed with assumption that we would find a chain of + // length 4. If not, then we need to 0 out the MSB bits (via perm mask of + // 0x0c) so they do not affect dot calculation. 
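For orientation, the add/mul chain being matched here is folded into amdgcn_sdot4 or amdgcn_udot4, which accumulate four byte-wise products plus an accumulator into a 32-bit result; the signedness chosen by checkDot4MulSignedness only changes how each byte lane is interpreted. A scalar reference for that operation, separate from the patch itself (the function name and sample inputs are made up):

#include <cstdint>
#include <cstdio>

// Reference model: multiply corresponding byte lanes of A and B, sum the four
// products, and add the accumulator. Signed vs unsigned only affects how the
// individual bytes are read.
static int32_t dot4(uint32_t A, uint32_t B, int32_t Acc, bool Signed) {
  int32_t Sum = Acc;
  for (unsigned I = 0; I < 4; ++I) {
    uint8_t ABy = uint8_t(A >> (8 * I));
    uint8_t BBy = uint8_t(B >> (8 * I));
    if (Signed)
      Sum += int32_t(int8_t(ABy)) * int32_t(int8_t(BBy));
    else
      Sum += int32_t(uint32_t(ABy) * uint32_t(BBy));
  }
  return Sum;
}

int main() {
  // 1*5 + 2*6 + 3*7 + 4*8 = 70, plus an accumulator of 100.
  printf("%d\n", dot4(0x04030201, 0x08070605, 100, /*Signed=*/false)); // 170
  // With a negative byte lane: (-1)*5 + 2*6 + 3*7 + 4*8 = 60, plus 100.
  printf("%d\n", dot4(0x040302FF, 0x08070605, 100, /*Signed=*/true));  // 160
  return 0;
}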
+ if (ChainLength < 4) { + fixMasks(Src0s, ChainLength); + fixMasks(Src1s, ChainLength); + } + + SDValue Src0, Src1; + + // If we are just using a single source for both, and have permuted the + // bytes consistently, we can just use the sources without permuting + // (commutation). + bool UseOriginalSrc = false; + if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 && + Src0s.begin()->second == Src1s.begin()->second && + Src0s.begin()->first.getValueSizeInBits() == 32 && + Src1s.begin()->first.getValueSizeInBits() == 32) { + SmallVector<unsigned, 4> SrcBytes; + auto Src0Mask = Src0s.begin()->second; + SrcBytes.push_back(Src0Mask & 0xFF000000); + bool UniqueEntries = true; + for (auto I = 1; I < 4; I++) { + auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8)); + + if (is_contained(SrcBytes, NextByte)) { + UniqueEntries = false; + break; + } + SrcBytes.push_back(NextByte); + } + + if (UniqueEntries) { + UseOriginalSrc = true; + // Must be 32 bits to enter above conditional. + assert(Src0s.begin()->first.getValueSizeInBits() == 32); + assert(Src1s.begin()->first.getValueSizeInBits() == 32); + Src0 = DAG.getBitcast(MVT::getIntegerVT(32), Src0s.begin()->first); + Src1 = DAG.getBitcast(MVT::getIntegerVT(32), Src1s.begin()->first); + } + } + + if (!UseOriginalSrc) { + Src0 = resolveSources(DAG, SL, Src0s, false, true); + Src1 = resolveSources(DAG, SL, Src1s, false, true); + } + + assert(IsSigned); + SDValue Src2 = + DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32); + + SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4 + : Intrinsic::amdgcn_udot4, + SL, MVT::i64); + + assert(!VT.isVector()); + auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0, + Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1)); + + return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT); + } + if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) return SDValue(); @@ -12295,8 +13556,7 @@ SDValue SITargetLowering::performSubCombine(SDNode *N, if (LHS.getOpcode() == ISD::USUBO_CARRY) { // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc - auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); - if (!C || !C->isZero()) + if (!isNullConstant(LHS.getOperand(1))) return SDValue(); SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) }; return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args); @@ -12417,6 +13677,41 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performFDivCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + EVT VT = N->getValueType(0); + if (VT != MVT::f16 || !Subtarget->has16BitInsts()) + return SDValue(); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + SDNodeFlags Flags = N->getFlags(); + SDNodeFlags RHSFlags = RHS->getFlags(); + if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() || + !RHS->hasOneUse()) + return SDValue(); + + if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { + bool IsNegative = false; + if (CLHS->isExactlyValue(1.0) || + (IsNegative = CLHS->isExactlyValue(-1.0))) { + // fdiv contract 1.0, (sqrt contract x) -> rsq for f16 + // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16 + if (RHS.getOpcode() == ISD::FSQRT) { + // TODO: Or in RHS flags, somehow missing from SDNodeFlags + SDValue Rsq = + DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags); + return IsNegative ? 
DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq; + } + } + } + + return SDValue(); +} + SDValue SITargetLowering::performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -12666,7 +13961,7 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { - if (getTargetMachine().getOptLevel() == CodeGenOpt::None) + if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None) return SDValue(); switch (N->getOpcode()) { case ISD::ADD: @@ -12680,12 +13975,16 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performFAddCombine(N, DCI); case ISD::FSUB: return performFSubCombine(N, DCI); + case ISD::FDIV: + return performFDivCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::FMAXNUM_IEEE: case ISD::FMINNUM_IEEE: + case ISD::FMAXIMUM: + case ISD::FMINIMUM: case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: @@ -12699,6 +13998,14 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performAndCombine(N, DCI); case ISD::OR: return performOrCombine(N, DCI); + case ISD::FSHR: { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + if (N->getValueType(0) == MVT::i32 && N->isDivergent() && + TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { + return matchPERM(N, DCI); + } + break; + } case ISD::XOR: return performXorCombine(N, DCI); case ISD::ZERO_EXTEND: @@ -12793,7 +14100,7 @@ static unsigned SubIdx2Lane(unsigned Idx) { } } -/// Adjust the writemask of MIMG instructions +/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, SelectionDAG &DAG) const { unsigned Opcode = Node->getMachineOpcode(); @@ -12811,7 +14118,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1; unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1; bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) || - Node->getConstantOperandVal(LWEIdx)) + (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx))) ? true : false; unsigned TFCLane = 0; @@ -12943,7 +14250,11 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, continue; } else { SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); - DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op); + SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op); + if (NewUser != User) { + DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0)); + DAG.RemoveDeadNode(User); + } } switch (Idx) { @@ -13019,7 +14330,7 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); unsigned Opcode = Node->getMachineOpcode(); - if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && + if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() && !TII->isGather4(Opcode) && AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) { return adjustWritemask(Node, DAG); @@ -13106,7 +14417,7 @@ void SITargetLowering::AddIMGInit(MachineInstr &MI) const { return; unsigned TFEVal = TFE ? TFE->getImm() : 0; - unsigned LWEVal = LWE->getImm(); + unsigned LWEVal = LWE ? LWE->getImm() : 0; unsigned D16Val = D16 ? 
D16->getImm() : 0; if (!TFEVal && !LWEVal) @@ -13183,7 +14494,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + MachineFunction *MF = MI.getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); if (TII->isVOP3(MI.getOpcode())) { // Make sure constant bus requirements are respected. @@ -13194,11 +14507,16 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, // use between vgpr and agpr as agpr tuples tend to be big. if (!MI.getDesc().operands().empty()) { unsigned Opc = MI.getOpcode(); + bool HasAGPRs = Info->mayNeedAGPRs(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); - for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) { + int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + for (auto I : + {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) { if (I == -1) break; + if ((I == Src2Idx) && (HasAGPRs)) + break; MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !Op.getReg().isVirtual()) continue; @@ -13216,6 +14534,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, MRI.setRegClass(Op.getReg(), NewRC); } + if (!HasAGPRs) + return; + // Resolve the rest of AV operands to AGPRs. if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) { if (Src2->isReg() && Src2->getReg().isVirtual()) { @@ -13233,7 +14554,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, return; } - if (TII->isMIMG(MI)) { + if (TII->isImage(MI)) { if (!MI.mayStore()) AddIMGInit(MI); TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr); @@ -13377,7 +14698,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, return std::pair(0U, RC); } - if (Constraint.startswith("{") && Constraint.endswith("}")) { + if (Constraint.starts_with("{") && Constraint.ends_with("}")) { StringRef RegName(Constraint.data() + 1, Constraint.size() - 2); if (RegName.consume_front("v")) { RC = &AMDGPU::VGPR_32RegClass; @@ -13467,7 +14788,7 @@ static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) { } void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op, - std::string &Constraint, + StringRef Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { if (isImmConstraint(Constraint)) { @@ -13516,8 +14837,7 @@ bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const { return false; } -bool SITargetLowering::checkAsmConstraintVal(SDValue Op, - const std::string &Constraint, +bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const { if (Constraint.size() == 1) { switch (Constraint[0]) { @@ -13735,8 +15055,9 @@ void SITargetLowering::computeKnownBitsForTargetInstr( const MachineRegisterInfo &MRI, unsigned Depth) const { const MachineInstr *MI = MRI.getVRegDef(R); switch (MI->getOpcode()) { - case AMDGPU::G_INTRINSIC: { - switch (MI->getIntrinsicID()) { + case AMDGPU::G_INTRINSIC: + case AMDGPU::G_INTRINSIC_CONVERGENT: { + switch (cast<GIntrinsic>(MI)->getIntrinsicID()) { case Intrinsic::amdgcn_workitem_id_x: knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0); break; @@ -13801,21 +15122,16 @@ Align 
SITargetLowering::computeKnownAlignForTargetInstr( GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI, unsigned Depth) const { const MachineInstr *MI = MRI.getVRegDef(R); - switch (MI->getOpcode()) { - case AMDGPU::G_INTRINSIC: - case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { + if (auto *GI = dyn_cast<GIntrinsic>(MI)) { // FIXME: Can this move to generic code? What about the case where the call // site specifies a lower alignment? - Intrinsic::ID IID = MI->getIntrinsicID(); + Intrinsic::ID IID = GI->getIntrinsicID(); LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext(); AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID); if (MaybeAlign RetAlign = Attrs.getRetAlignment()) return *RetAlign; - return Align(1); - } - default: - return Align(1); } + return Align(1); } Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h index 1745c0b9e88e..5bc091d6e84d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -109,6 +109,8 @@ private: SDValue LowerFFREXP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; @@ -212,13 +214,15 @@ private: SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFDivCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const; - bool isLegalFlatAddressingMode(const AddrMode &AM) const; + bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace, + uint64_t FlatVariant) const; bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; unsigned isCFIntrinsic(const SDNode *Intr) const; @@ -409,6 +413,10 @@ public: SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; + + SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const; Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; @@ -463,13 +471,11 @@ public: getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; ConstraintType getConstraintType(StringRef Constraint) const override; - void LowerAsmOperandForConstraint(SDValue Op, - std::string &Constraint, + void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const; - 
bool checkAsmConstraintVal(SDValue Op, - const std::string &Constraint, + bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const; bool checkAsmConstraintValA(SDValue Op, uint64_t Val, @@ -543,6 +549,17 @@ public: const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const; + void allocatePreloadKernArgSGPRs(CCState &CCInfo, + SmallVectorImpl<CCValAssign> &ArgLocs, + const SmallVectorImpl<ISD::InputArg> &Ins, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; + + void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; + void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, @@ -572,6 +589,10 @@ public: getTargetMMOFlags(const Instruction &I) const override; }; +// Returns true if argument is a boolean value which is not serialized into +// memory or argument and does not require v_cndmask_b32 to be deserialized. +bool isBoolSGPR(SDValue V); + } // End namespace llvm #endif diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp index 50f8ad4433c6..442ae4dd7b34 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp @@ -232,7 +232,10 @@ public: // scheduler it limits the size of the cluster to avoid increasing // register pressure too much, but this pass runs after register // allocation so there is no need for that kind of limit. - !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2, 2)))) { + // We also lie about the Offset and OffsetIsScalable parameters, + // as they aren't used in the SIInstrInfo implementation. + !SII->shouldClusterMemOps(CI.BaseOps, 0, false, BaseOps, 0, false, + 2, 2)))) { // Finish the current clause. Changed |= emitClause(CI, SII); CI = ClauseInfo(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index a74b917f82bf..8415a3d77d3b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -292,6 +292,11 @@ public: VgprVmemTypes[GprNo] = 0; } + void setNonKernelFunctionInitialState() { + setScoreUB(VS_CNT, getWaitCountMax(VS_CNT)); + PendingEvents |= WaitEventMaskForInst[VS_CNT]; + } + void print(raw_ostream &); void dump() { print(dbgs()); } @@ -364,7 +369,6 @@ private: const MachineRegisterInfo *MRI = nullptr; AMDGPU::IsaVersion IV; - DenseSet<MachineInstr *> TrackedWaitcntSet; DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; DenseMap<MachineBasicBlock *, bool> PreheadersToFlush; MachineLoopInfo *MLI; @@ -452,7 +456,9 @@ public: // FLAT instruction. WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst)); - if (!ST->hasVscnt()) + // LDS DMA loads are also stores, but on the LDS side. On the VMEM side + // these should use VM_CNT. + if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst)) return VMEM_ACCESS; if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) { // FLAT and SCRATCH instructions may access scratch. Other VMEM @@ -486,6 +492,9 @@ public: MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const; + + // Transform a soft waitcnt into a normal one. 
+ bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const; }; } // end anonymous namespace @@ -505,7 +514,8 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, RegInterval Result; - unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)); + unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) & + AMDGPU::HWEncoding::REG_IDX_MASK; if (TRI->isVectorRegister(*MRI, Op.getReg())) { assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL); @@ -543,14 +553,6 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI, } } -// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS written -// can be accessed. A load from LDS to VMEM does not need a wait. -static bool mayWriteLDSThroughDMA(const MachineInstr &MI) { - return SIInstrInfo::isVALU(MI) && - (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) && - MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD; -} - void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, @@ -590,12 +592,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, AMDGPU::OpName::data1), CurrScore); } - } else if (SIInstrInfo::isAtomicRet(Inst) && - Inst.getOpcode() != AMDGPU::DS_GWS_INIT && - Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V && - Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR && - Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P && - Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER && + } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) && Inst.getOpcode() != AMDGPU::DS_APPEND && Inst.getOpcode() != AMDGPU::DS_CONSUME && Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { @@ -683,7 +680,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore); } #endif - } else { + } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { // Match the score to the destination registers. for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { auto &Op = Inst.getOperand(I); @@ -694,6 +691,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, if (Interval.first >= NUM_ALL_VGPRS) continue; if (updateVMCntOnly(Inst)) { + // updateVMCntOnly should only leave us with VGPRs + // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR + // defs. That's required for a sane index into `VgprMemTypes` below + assert(TRI->isVectorRegister(*MRI, Op.getReg())); VmemType V = getVmemType(Inst); for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) VgprVmemTypes[RegNo] |= 1 << V; @@ -703,7 +704,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, setRegScore(RegNo, T, CurrScore); } } - if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) { + if (Inst.mayStore() && + (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) { + // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS + // written can be accessed. A load from LDS to VMEM does not need a wait. 
setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore); } } @@ -870,6 +874,15 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName, return true; } +bool SIInsertWaitcnts::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { + unsigned Opcode = Waitcnt->getOpcode(); + if (!SIInstrInfo::isSoftWaitcnt(Opcode)) + return false; + + Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode))); + return true; +} + /// Combine consecutive waitcnt instructions that precede \p It and follow /// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added /// by previous passes. Currently this pass conservatively assumes that these @@ -886,86 +899,77 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt( if (II.isMetaInstruction()) continue; - if (II.getOpcode() == AMDGPU::S_WAITCNT) { - // Conservatively update required wait if this waitcnt was added in an - // earlier pass. In this case it will not exist in the tracked waitcnt - // set. - if (!TrackedWaitcntSet.count(&II)) { - unsigned IEnc = II.getOperand(0).getImm(); - AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); - Wait = Wait.combined(OldWait); - } + unsigned Opcode = II.getOpcode(); + bool IsSoft = SIInstrInfo::isSoftWaitcnt(Opcode); + + if (SIInstrInfo::isWaitcnt(Opcode)) { + // Update required wait count. If this is a soft waitcnt (= it was added + // by an earlier pass), it may be entirely removed. + unsigned IEnc = II.getOperand(0).getImm(); + AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); + if (IsSoft) + ScoreBrackets.simplifyWaitcnt(OldWait); + Wait = Wait.combined(OldWait); // Merge consecutive waitcnt of the same type by erasing multiples. - if (!WaitcntInstr) { - WaitcntInstr = &II; - } else { + if (WaitcntInstr || (!Wait.hasWaitExceptVsCnt() && IsSoft)) { II.eraseFromParent(); Modified = true; - } + } else + WaitcntInstr = &II; } else { - assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT); + assert(SIInstrInfo::isWaitcntVsCnt(Opcode)); assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); - if (!TrackedWaitcntSet.count(&II)) { - unsigned OldVSCnt = - TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); - Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt); - } - if (!WaitcntVsCntInstr) { - WaitcntVsCntInstr = &II; - } else { + unsigned OldVSCnt = + TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); + if (IsSoft) + ScoreBrackets.simplifyWaitcnt(InstCounterType::VS_CNT, OldVSCnt); + Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt); + + if (WaitcntVsCntInstr || (!Wait.hasWaitVsCnt() && IsSoft)) { II.eraseFromParent(); Modified = true; - } + } else + WaitcntVsCntInstr = &II; } } // Updated encoding of merged waitcnt with the required wait. if (WaitcntInstr) { - if (Wait.hasWaitExceptVsCnt()) { - Modified |= - updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16, - AMDGPU::encodeWaitcnt(IV, Wait)); - ScoreBrackets.applyWaitcnt(Wait); - Wait.VmCnt = ~0u; - Wait.LgkmCnt = ~0u; - Wait.ExpCnt = ~0u; - - LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() - ? 
dbgs() << "applyPreexistingWaitcnt\n" - << "New Instr at block end: " << *WaitcntInstr - << '\n' - : dbgs() << "applyPreexistingWaitcnt\n" - << "Old Instr: " << *It - << "New Instr: " << *WaitcntInstr << '\n'); + Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16, + AMDGPU::encodeWaitcnt(IV, Wait)); + Modified |= promoteSoftWaitCnt(WaitcntInstr); - } else { - WaitcntInstr->eraseFromParent(); - Modified = true; - } + ScoreBrackets.applyWaitcnt(Wait); + Wait.VmCnt = ~0u; + Wait.LgkmCnt = ~0u; + Wait.ExpCnt = ~0u; + + LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + ? dbgs() + << "applyPreexistingWaitcnt\n" + << "New Instr at block end: " << *WaitcntInstr << '\n' + : dbgs() << "applyPreexistingWaitcnt\n" + << "Old Instr: " << *It + << "New Instr: " << *WaitcntInstr << '\n'); } if (WaitcntVsCntInstr) { - if (Wait.hasWaitVsCnt()) { - assert(ST->hasVscnt()); - Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr, - AMDGPU::OpName::simm16, Wait.VsCnt); - ScoreBrackets.applyWaitcnt(Wait); - Wait.VsCnt = ~0u; - - LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() - ? dbgs() << "applyPreexistingWaitcnt\n" - << "New Instr at block end: " - << *WaitcntVsCntInstr << '\n' - : dbgs() << "applyPreexistingWaitcnt\n" - << "Old Instr: " << *It - << "New Instr: " << *WaitcntVsCntInstr << '\n'); - } else { - WaitcntVsCntInstr->eraseFromParent(); - Modified = true; - } + Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr, + AMDGPU::OpName::simm16, Wait.VsCnt); + Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr); + ScoreBrackets.applyWaitcnt(Wait); + Wait.VsCnt = ~0u; + + LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + ? dbgs() << "applyPreexistingWaitcnt\n" + << "New Instr at block end: " << *WaitcntVsCntInstr + << '\n' + : dbgs() << "applyPreexistingWaitcnt\n" + << "Old Instr: " << *It + << "New Instr: " << *WaitcntVsCntInstr << '\n'); } return Modified; @@ -1178,7 +1182,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS) continue; // No need to wait before load from VMEM to LDS. - if (mayWriteLDSThroughDMA(MI)) + if (TII->mayWriteLDSThroughDMA(MI)) continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; // VM_CNT is only relevant to vgpr or LDS. @@ -1315,9 +1319,8 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, // instruction was modified to handle the required wait. 
if (Wait.hasWaitExceptVsCnt()) { unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - auto SWaitInst = + [[maybe_unused]] auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - TrackedWaitcntSet.insert(SWaitInst); Modified = true; LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; @@ -1328,10 +1331,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, if (Wait.hasWaitVsCnt()) { assert(ST->hasVscnt()); - auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) + [[maybe_unused]] auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) .addImm(Wait.VsCnt); - TrackedWaitcntSet.insert(SWaitInst); Modified = true; LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; @@ -1504,6 +1506,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, break; case AMDGPU::S_MEMTIME: case AMDGPU::S_MEMREALTIME: + case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0: + case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: + case AMDGPU::S_BARRIER_LEAVE: + case AMDGPU::S_GET_BARRIER_STATE_M0: + case AMDGPU::S_GET_BARRIER_STATE_IMM: ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); break; } @@ -1574,9 +1581,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { } static bool isWaitInstr(MachineInstr &Inst) { - return Inst.getOpcode() == AMDGPU::S_WAITCNT || - (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && - Inst.getOperand(0).isReg() && + auto Opcode = Inst.getOpcode(); + return SIInstrInfo::isWaitcnt(Opcode) || + (SIInstrInfo::isWaitcntVsCnt(Opcode) && Inst.getOperand(0).isReg() && Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL); } @@ -1721,26 +1728,25 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // which we want to flush the vmcnt counter, and false otherwise. 
bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB, WaitcntBrackets &ScoreBrackets) { - if (PreheadersToFlush.count(&MBB)) - return PreheadersToFlush[&MBB]; - - auto UpdateCache = [&](bool val) { - PreheadersToFlush[&MBB] = val; - return val; - }; + auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false); + if (!IsInserted) + return Iterator->second; MachineBasicBlock *Succ = MBB.getSingleSuccessor(); if (!Succ) - return UpdateCache(false); + return false; MachineLoop *Loop = MLI->getLoopFor(Succ); if (!Loop) - return UpdateCache(false); + return false; - if (Loop->getLoopPreheader() == &MBB && shouldFlushVmCnt(Loop, ScoreBrackets)) - return UpdateCache(true); + if (Loop->getLoopPreheader() == &MBB && + shouldFlushVmCnt(Loop, ScoreBrackets)) { + Iterator->second = true; + return true; + } - return UpdateCache(false); + return false; } bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { @@ -1825,7 +1831,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { ForceEmitWaitcnt[T] = false; OptNone = MF.getFunction().hasOptNone() || - MF.getTarget().getOptLevel() == CodeGenOpt::None; + MF.getTarget().getOptLevel() == CodeGenOptLevel::None; HardwareLimits Limits = {}; Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV); @@ -1839,12 +1845,13 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); RegisterEncoding Encoding = {}; - Encoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0); + Encoding.VGPR0 = + TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK; Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1; - Encoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0); + Encoding.SGPR0 = + TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK; Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1; - TrackedWaitcntSet.clear(); BlockInfos.clear(); bool Modified = false; @@ -1862,6 +1869,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { ; BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); + auto NonKernelInitialState = + std::make_unique<WaitcntBrackets>(ST, Limits, Encoding); + NonKernelInitialState->setNonKernelFunctionInitialState(); + BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState); + Modified = true; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td index f674777724eb..585a3eb78618 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -40,6 +40,8 @@ class InstSI <dag outs, dag ins, string asm = "", field bit MTBUF = 0; field bit SMRD = 0; field bit MIMG = 0; + field bit VIMAGE = 0; + field bit VSAMPLE = 0; field bit EXP = 0; field bit FLAT = 0; field bit DS = 0; @@ -156,6 +158,9 @@ class InstSI <dag outs, dag ins, string asm = "", // This bit indicates that the instruction is never-uniform/divergent field bit IsNeverUniform = 0; + // ds_gws_* instructions. + field bit GWS = 0; + // These need to be kept in sync with the enum in SIInstrFlags. 
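Back in SIInsertWaitcnts.cpp, the isPreheaderToFlush rewrite above swaps a count()-then-[] cache for a single try_emplace, so one hash lookup both inserts the default answer and reports whether a cached value already existed. The same idiom with std::unordered_map standing in for DenseMap (a sketch of the pattern only, not the pass's own code; names and the stand-in predicate are invented):

#include <cstdio>
#include <unordered_map>

static std::unordered_map<int, bool> Cache;

static bool expensiveCheck(int Key) { return Key % 2 == 0; } // stand-in work

static bool cachedCheck(int Key) {
  // One lookup: inserts a default 'false' and tells us if a value was cached.
  auto [It, Inserted] = Cache.try_emplace(Key, false);
  if (!Inserted)
    return It->second;
  It->second = expensiveCheck(Key);
  return It->second;
}

int main() {
  printf("%d %d %d\n", cachedCheck(4), cachedCheck(3), cachedCheck(4)); // 1 0 1
  return 0;
}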
let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -181,15 +186,17 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{18} = MTBUF; let TSFlags{19} = SMRD; let TSFlags{20} = MIMG; - let TSFlags{21} = EXP; - let TSFlags{22} = FLAT; - let TSFlags{23} = DS; + let TSFlags{21} = VIMAGE; + let TSFlags{22} = VSAMPLE; + let TSFlags{23} = EXP; + let TSFlags{24} = FLAT; + let TSFlags{25} = DS; - let TSFlags{24} = VGPRSpill; - let TSFlags{25} = SGPRSpill; + let TSFlags{26} = VGPRSpill; + let TSFlags{27} = SGPRSpill; - let TSFlags{26} = LDSDIR; - let TSFlags{27} = VINTERP; + let TSFlags{28} = LDSDIR; + let TSFlags{29} = VINTERP; let TSFlags{32} = VM_CNT; let TSFlags{33} = EXP_CNT; @@ -239,6 +246,8 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{61} = IsNeverUniform; + let TSFlags{62} = GWS; + let SchedRW = [Write32Bit]; let AsmVariantName = AMDGPUAsmVariants.Default; @@ -299,6 +308,16 @@ def CPolBit { class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; +def VOPDstOperand_t16 : VOPDstOperand <VGPR_16> { + let EncoderMethod = "getMachineOpValueT16"; + let DecoderMethod = "DecodeVGPR_16RegisterClass"; +} + +def VOPDstOperand_t16Lo128 : VOPDstOperand <VGPR_16_Lo128> { + let EncoderMethod = "getMachineOpValueT16Lo128"; + let DecoderMethod = "DecodeVGPR_16_Lo128RegisterClass"; +} + class VINTRPe <bits<2> op> : Enc32 { bits<8> vdst; bits<8> vsrc; @@ -414,6 +433,57 @@ class MIMGe_gfx11 <bits<8> op> : Enc64 { let Inst{62-58} = ssamp{6-2}; } +class VIMAGE_VSAMPLE_Common <bits<8> op> : Enc96 { + bits<3> dim; + bits<1> tfe; + bits<1> r128; + bit d16; + bits<1> a16; + bits<4> dmask; + bits<8> vdata; + bits<9> rsrc; + bits<6> cpol; + bits<8> vaddr0; + bits<8> vaddr1; + bits<8> vaddr2; + bits<8> vaddr3; + + let Inst{2-0} = dim; + let Inst{4} = r128; + let Inst{5} = d16; + let Inst{6} = a16; + let Inst{21-14} = op; + let Inst{25-22} = dmask; + let Inst{39-32} = vdata; + let Inst{49-41} = rsrc; + let Inst{51-50} = cpol{4-3}; // scope + let Inst{54-52} = cpol{2-0}; // th + let Inst{71-64} = vaddr0; + let Inst{79-72} = vaddr1; + let Inst{87-80} = vaddr2; + let Inst{95-88} = vaddr3; +} + +class VSAMPLEe <bits<8> op> : VIMAGE_VSAMPLE_Common<op> { + bits<1> unorm; + bits<1> lwe; + bits<9> samp; + + let Inst{3} = tfe; + let Inst{13} = unorm; + let Inst{31-26} = 0x39; + let Inst{40} = lwe; + let Inst{63-55} = samp; +} + +class VIMAGEe <bits<8> op> : VIMAGE_VSAMPLE_Common<op> { + bits<8> vaddr4; + + let Inst{31-26} = 0x34; + let Inst{55} = tfe; + let Inst{63-56} = vaddr4; +} + class EXPe : Enc64 { bits<4> en; bits<6> tgt; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 0f954732a5ee..70ef1fff274a 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -17,7 +17,9 @@ #include "GCNHazardRecognizer.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineDominators.h" @@ -105,9 +107,27 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); } +static bool canRemat(const MachineInstr &MI) { + + if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) || + 
SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) || + SIInstrInfo::isSALU(MI)) + return true; + + if (SIInstrInfo::isSMRD(MI)) { + return !MI.memoperands_empty() && + llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) { + return MMO->isLoad() && MMO->isInvariant(); + }); + } + + return false; +} + bool SIInstrInfo::isReallyTriviallyReMaterializable( const MachineInstr &MI) const { - if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) { + + if (canRemat(MI)) { // Normally VALU use of exec would block the rematerialization, but that // is OK in this case to have an implicit exec read as all VALU do. // We really want all of the generic logic for this except for this. @@ -119,12 +139,13 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable( // There is difference to generic method which does not allow // rematerialization if there are virtual register uses. We allow this, // therefore this method includes SOP instructions as well. - return !MI.hasImplicitDef() && - MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() && - !MI.mayRaiseFPException(); + if (!MI.hasImplicitDef() && + MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() && + !MI.mayRaiseFPException()) + return true; } - return false; + return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); } // Returns true if the scalar result of a VALU instruction depends on exec. @@ -169,6 +190,48 @@ bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const { isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent()); } +bool SIInstrInfo::isSafeToSink(MachineInstr &MI, + MachineBasicBlock *SuccToSinkTo, + MachineCycleInfo *CI) const { + // Allow sinking if MI edits lane mask (divergent i1 in sgpr). + if (MI.getOpcode() == AMDGPU::SI_IF_BREAK) + return true; + + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + // Check if sinking of MI would create temporal divergent use. + for (auto Op : MI.uses()) { + if (Op.isReg() && Op.getReg().isVirtual() && + RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) { + MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg()); + + // SgprDef defined inside cycle + MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent()); + if (FromCycle == nullptr) + continue; + + MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo); + // Check if there is a FromCycle that contains SgprDef's basic block but + // does not contain SuccToSinkTo and also has divergent exit condition. + while (FromCycle && !FromCycle->contains(ToCycle)) { + // After structurize-cfg, there should be exactly one cycle exit. + SmallVector<MachineBasicBlock *, 1> ExitBlocks; + FromCycle->getExitBlocks(ExitBlocks); + assert(ExitBlocks.size() == 1); + assert(ExitBlocks[0]->getSinglePredecessor()); + + // FromCycle has divergent exit condition. + if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) { + return false; + } + + FromCycle = FromCycle->getParentCycle(); + } + } + } + + return true; +} + bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const { @@ -421,6 +484,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth( Offset = OffsetOp ? OffsetOp->getImm() : 0; // Get appropriate operand, and compute width accordingly. 
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst); + if (DataOpIdx == -1) + return false; Width = getOpSize(LdSt, DataOpIdx); return true; } @@ -479,8 +544,10 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, } bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, + int64_t Offset1, bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2, - unsigned NumLoads, + int64_t Offset2, bool OffsetIsScalable2, + unsigned ClusterSize, unsigned NumBytes) const { // If the mem ops (to be clustered) do not have the same base ptr, then they // should not be clustered @@ -506,8 +573,8 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops // (5) LoadSize >= 17: do not cluster - const unsigned LoadSize = NumBytes / NumLoads; - const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads; + const unsigned LoadSize = NumBytes / ClusterSize; + const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize; return NumDWORDs <= 8; } @@ -619,7 +686,7 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, } RS.enterBasicBlockEnd(MBB); - RS.backward(MI); + RS.backward(std::next(MI)); // Ideally we want to have three registers for a long reg_sequence copy // to hide 2 waitstates between v_mov_b32 and accvgpr_write. @@ -680,23 +747,27 @@ static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) { int16_t SubIdx = BaseIndices[Idx]; - Register Reg = RI.getSubReg(DestReg, SubIdx); + Register DestSubReg = RI.getSubReg(DestReg, SubIdx); + Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); + assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); unsigned Opcode = AMDGPU::S_MOV_B32; // Is SGPR aligned? If so try to combine with next. - Register Src = RI.getSubReg(SrcReg, SubIdx); - bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0; - bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0; + bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0; + bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0; if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) { // Can use SGPR64 copy unsigned Channel = RI.getChannelFromSubReg(SubIdx); SubIdx = RI.getSubRegFromChannel(Channel, 2); + DestSubReg = RI.getSubReg(DestReg, SubIdx); + SrcSubReg = RI.getSubReg(SrcReg, SubIdx); + assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); Opcode = AMDGPU::S_MOV_B64; Idx++; } - LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx)) - .addReg(RI.getSubReg(SrcReg, SubIdx)) + LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg) + .addReg(SrcSubReg) .addReg(SrcReg, RegState::Implicit); if (!FirstMI) @@ -722,24 +793,32 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const { const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg); - - // FIXME: This is hack to resolve copies between 16 bit and 32 bit - // registers until all patterns are fixed. - if (Fix16BitCopies && - ((RI.getRegSizeInBits(*RC) == 16) ^ - (RI.getRegSizeInBits(*RI.getPhysRegBaseClass(SrcReg)) == 16))) { - MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? 
DestReg : SrcReg; - MCRegister Super = RI.get32BitRegister(RegToFix); - assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix); - RegToFix = Super; - - if (DestReg == SrcReg) { - // Insert empty bundle since ExpandPostRA expects an instruction here. - BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); - return; + unsigned Size = RI.getRegSizeInBits(*RC); + const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg); + unsigned SrcSize = RI.getRegSizeInBits(*SrcRC); + + // The rest of copyPhysReg assumes Src and Dst size are the same size. + // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can + // we remove Fix16BitCopies and this code block? + if (Fix16BitCopies) { + if (((Size == 16) != (SrcSize == 16))) { + // Non-VGPR Src and Dst will later be expanded back to 32 bits. + assert(ST.hasTrue16BitInsts()); + MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg; + MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16); + RegToFix = SubReg; + + if (DestReg == SrcReg) { + // Identity copy. Insert empty bundle since ExpandPostRA expects an + // instruction here. + BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); + return; + } + RC = RI.getPhysRegBaseClass(DestReg); + Size = RI.getRegSizeInBits(*RC); + SrcRC = RI.getPhysRegBaseClass(SrcReg); + SrcSize = RI.getRegSizeInBits(*SrcRC); } - - RC = RI.getPhysRegBaseClass(DestReg); } if (RC == &AMDGPU::VGPR_32RegClass) { @@ -863,10 +942,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - const unsigned Size = RI.getRegSizeInBits(*RC); if (Size == 16) { - assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || - AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || + assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) || AMDGPU::SReg_LO16RegClass.contains(SrcReg) || AMDGPU::AGPR_LO16RegClass.contains(SrcReg)); @@ -904,6 +981,25 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + if (ST.hasTrue16BitInsts()) { + if (IsSGPRSrc) { + assert(SrcLow); + SrcReg = NewSrcReg; + } + // Use the smaller instruction encoding if possible. 
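    // Note: the VOP1 form of v_mov_b16 can presumably only encode the 16-bit
    // halves of the first 128 VGPRs, so copies touching higher registers fall
    // back to the VOP3 (_e64) form, which also needs the explicit
    // src0_modifiers and op_sel operands added below.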
+ if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) && + (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg) + .addReg(SrcReg); + } else { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg) + .addImm(0) // src0_modifiers + .addReg(SrcReg) + .addImm(0); // op_sel + } + return; + } + if (IsSGPRSrc && !ST.hasSDWAScalar()) { if (!DstLow || !SrcLow) { reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, @@ -930,14 +1026,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg); if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { if (ST.hasMovB64()) { BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; } - if (ST.hasPackedFP32Ops()) { + if (ST.hasPkMovB32()) { BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) .addImm(SISrcMods::OP_SEL_1) .addReg(SrcReg) @@ -984,7 +1079,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (ST.hasMovB64()) { Opcode = AMDGPU::V_MOV_B64_e32; EltSize = 8; - } else if (ST.hasPackedFP32Ops()) { + } else if (ST.hasPkMovB32()) { Opcode = AMDGPU::V_PK_MOV_B32; EltSize = 8; } @@ -1012,6 +1107,9 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, SubIdx = SubIndices[Idx]; else SubIdx = SubIndices[SubIndices.size() - Idx - 1]; + Register DestSubReg = RI.getSubReg(DestReg, SubIdx); + Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); + assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); bool IsFirstSubreg = Idx == 0; bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; @@ -1019,30 +1117,26 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { Register ImpDefSuper = IsFirstSubreg ? 
Register(DestReg) : Register(); Register ImpUseSuper = SrcReg; - indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), - RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, Overlap, - ImpDefSuper, ImpUseSuper); + indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill, + *RS, Overlap, ImpDefSuper, ImpUseSuper); } else if (Opcode == AMDGPU::V_PK_MOV_B32) { - Register DstSubReg = RI.getSubReg(DestReg, SubIdx); - Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); MachineInstrBuilder MIB = - BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg) - .addImm(SISrcMods::OP_SEL_1) - .addReg(SrcSubReg) - .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) - .addReg(SrcSubReg) - .addImm(0) // op_sel_lo - .addImm(0) // op_sel_hi - .addImm(0) // neg_lo - .addImm(0) // neg_hi - .addImm(0) // clamp - .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg) + .addImm(SISrcMods::OP_SEL_1) + .addReg(SrcSubReg) + .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) + .addReg(SrcSubReg) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0) // clamp + .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); if (IsFirstSubreg) MIB.addReg(DestReg, RegState::Define | RegState::Implicit); } else { MachineInstrBuilder Builder = - BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) - .addReg(RI.getSubReg(SrcReg, SubIdx)); + BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg); if (IsFirstSubreg) Builder.addReg(DestReg, RegState::Define | RegState::Implicit); @@ -1286,7 +1380,11 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { if (RI.isAGPRClass(DstRC)) return AMDGPU::COPY; - if (RI.getRegSizeInBits(*DstRC) == 32) { + if (RI.getRegSizeInBits(*DstRC) == 16) { + // Assume hi bits are unneeded. Only _e64 true16 instructions are legal + // before RA. + return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64; + } else if (RI.getRegSizeInBits(*DstRC) == 32) { return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { return AMDGPU::S_MOV_B64; @@ -1587,11 +1685,15 @@ static unsigned getAVSpillSaveOpcode(unsigned Size) { } } -static unsigned getWWMRegSpillSaveOpcode(unsigned Size) { +static unsigned getWWMRegSpillSaveOpcode(unsigned Size, + bool IsVectorSuperClass) { // Currently, there is only 32-bit WWM register spills needed. if (Size != 4) llvm_unreachable("unknown wwm register spill size"); + if (IsVectorSuperClass) + return AMDGPU::SI_SPILL_WWM_AV32_SAVE; + return AMDGPU::SI_SPILL_WWM_V32_SAVE; } @@ -1600,11 +1702,13 @@ static unsigned getVectorRegSpillSaveOpcode(Register Reg, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI) { + bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); + // Choose the right opcode if spilling a WWM register. if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) - return getWWMRegSpillSaveOpcode(Size); + return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass); - if (TRI.isVectorSuperClass(RC)) + if (IsVectorSuperClass) return getAVSpillSaveOpcode(Size); return TRI.isAGPRClass(RC) ? 
getAGPRSpillSaveOpcode(Size) @@ -1807,11 +1911,15 @@ static unsigned getAVSpillRestoreOpcode(unsigned Size) { } } -static unsigned getWWMRegSpillRestoreOpcode(unsigned Size) { +static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, + bool IsVectorSuperClass) { // Currently, there is only 32-bit WWM register spills needed. if (Size != 4) llvm_unreachable("unknown wwm register spill size"); + if (IsVectorSuperClass) + return AMDGPU::SI_SPILL_WWM_AV32_RESTORE; + return AMDGPU::SI_SPILL_WWM_V32_RESTORE; } @@ -1819,11 +1927,13 @@ static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI) { + bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); + // Choose the right opcode if restoring a WWM register. if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) - return getWWMRegSpillRestoreOpcode(Size); + return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass); - if (TRI.isVectorSuperClass(RC)) + if (IsVectorSuperClass) return getAVSpillRestoreOpcode(Size); return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size) @@ -2006,6 +2116,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32)); break; + case AMDGPU::SI_SPILL_S32_TO_VGPR: + MI.setDesc(get(AMDGPU::V_WRITELANE_B32)); + break; + + case AMDGPU::SI_RESTORE_S32_FROM_VGPR: + MI.setDesc(get(AMDGPU::V_READLANE_B32)); + break; + case AMDGPU::V_MOV_B64_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -2024,7 +2142,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { APInt Imm(64, SrcOp.getImm()); APInt Lo(32, Imm.getLoBits(32).getZExtValue()); APInt Hi(32, Imm.getHiBits(32).getZExtValue()); - if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) { + if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) { BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) .addImm(SISrcMods::OP_SEL_1) .addImm(Lo.getSExtValue()) @@ -2045,7 +2163,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } } else { assert(SrcOp.isReg()); - if (ST.hasPackedFP32Ops() && + if (ST.hasPkMovB32() && !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) { BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) .addImm(SISrcMods::OP_SEL_1) // src0_mod @@ -2275,23 +2393,34 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { Register Reg = MI.getOperand(0).getReg(); Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0); Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1); + MachineOperand OpLo = MI.getOperand(1); + MachineOperand OpHi = MI.getOperand(2); // Create a bundle so these instructions won't be re-ordered by the // post-RA scheduler. MIBundleBuilder Bundler(MBB, MI); Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); - // Add 32-bit offset from this instruction to the start of the - // constant data. - Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) - .addReg(RegLo) - .add(MI.getOperand(1))); + // What we want here is an offset from the value returned by s_getpc (which + // is the address of the s_add_u32 instruction) to the global variable, but + // since the encoding of $symbol starts 4 bytes after the start of the + // s_add_u32 instruction, we end up with an offset that is 4 bytes too + // small. This requires us to add 4 to the global variable offset in order + // to compute the correct address. 
Similarly for the s_addc_u32 instruction, + // the encoding of $symbol starts 12 bytes after the start of the s_add_u32 + // instruction. + + if (OpLo.isGlobal()) + OpLo.setOffset(OpLo.getOffset() + 4); + Bundler.append( + BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo)); + + if (OpHi.isGlobal()) + OpHi.setOffset(OpHi.getOffset() + 12); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .add(OpHi)); - MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) - .addReg(RegHi); - MIB.add(MI.getOperand(2)); - - Bundler.append(MIB); finalizeBundle(MBB, Bundler.begin()); MI.eraseFromParent(); @@ -2350,12 +2479,98 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } +void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, Register DestReg, + unsigned SubIdx, const MachineInstr &Orig, + const TargetRegisterInfo &RI) const { + + // Try shrinking the instruction to remat only the part needed for current + // context. + // TODO: Handle more cases. + unsigned Opcode = Orig.getOpcode(); + switch (Opcode) { + case AMDGPU::S_LOAD_DWORDX16_IMM: + case AMDGPU::S_LOAD_DWORDX8_IMM: { + if (SubIdx != 0) + break; + + if (I == MBB.end()) + break; + + if (I->isBundled()) + break; + + // Look for a single use of the register that is also a subreg. + Register RegToFind = Orig.getOperand(0).getReg(); + MachineOperand *UseMO = nullptr; + for (auto &CandMO : I->operands()) { + if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef()) + continue; + if (UseMO) { + UseMO = nullptr; + break; + } + UseMO = &CandMO; + } + if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister) + break; + + unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg()); + unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg()); + + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet."); + + unsigned NewOpcode = -1; + if (SubregSize == 256) + NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM; + else if (SubregSize == 128) + NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM; + else + break; + + const MCInstrDesc &TID = get(NewOpcode); + const TargetRegisterClass *NewRC = + RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF)); + MRI.setRegClass(DestReg, NewRC); + + UseMO->setReg(DestReg); + UseMO->setSubReg(AMDGPU::NoSubRegister); + + // Use a smaller load with the desired size, possibly with updated offset. 
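    // Note: Offset comes from getSubRegIdxOffset() and is measured in bits,
    // while the SMEM immediate offset is in bytes, hence the division by 8
    // below (e.g. rematerializing the sub4_sub5_sub6_sub7 quarter of an
    // S_LOAD_DWORDX16 would add 128 / 8 = 16 bytes to the offset).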
+ MachineInstr *MI = MF->CloneMachineInstr(&Orig); + MI->setDesc(TID); + MI->getOperand(0).setReg(DestReg); + MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister); + if (Offset) { + MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset); + int64_t FinalOffset = OffsetMO->getImm() + Offset / 8; + OffsetMO->setImm(FinalOffset); + } + SmallVector<MachineMemOperand *> NewMMOs; + for (const MachineMemOperand *MemOp : Orig.memoperands()) + NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(), + SubregSize / 8)); + MI->setMemRefs(*MF, NewMMOs); + + MBB.insert(I, MI); + return; + } + + default: + break; + } + + TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI); +} + std::pair<MachineInstr*, MachineInstr*> SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); if (ST.hasMovB64() && - AMDGPU::isLegal64BitDPPControl( + AMDGPU::isLegalDPALU_DPPControl( getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { MI.setDesc(get(AMDGPU::V_MOV_B64_dpp)); return std::pair(&MI, nullptr); @@ -2414,6 +2629,14 @@ SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { return std::pair(Split[0], Split[1]); } +std::optional<DestSourcePair> +SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { + if (MI.getOpcode() == AMDGPU::WWM_COPY) + return DestSourcePair{MI.getOperand(0), MI.getOperand(1)}; + + return std::nullopt; +} + bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, @@ -2474,6 +2697,9 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, if (CommutedOpcode == -1) return nullptr; + if (Src0Idx > Src1Idx) + std::swap(Src0Idx, Src1Idx); + assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == static_cast<int>(Src0Idx) && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == @@ -2556,14 +2782,8 @@ bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, return isIntN(BranchOffsetBits, BrOffset); } -MachineBasicBlock *SIInstrInfo::getBranchDestBlock( - const MachineInstr &MI) const { - if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { - // This would be a difficult analysis to perform, but can always be legal so - // there's no need to analyze it. - return nullptr; - } - +MachineBasicBlock * +SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const { return MI.getOperand(0).getMBB(); } @@ -2874,7 +3094,6 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); if (!FBB) { - Cond[1].isUndef(); MachineInstr *CondBr = BuildMI(&MBB, DL, get(Opcode)) .addMBB(TBB); @@ -3079,7 +3298,9 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { case AMDGPU::V_MOV_B64_e64: case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B64: + case AMDGPU::S_MOV_B64_IMM_PSEUDO: case AMDGPU::COPY: + case AMDGPU::WWM_COPY: case AMDGPU::V_ACCVGPR_WRITE_B32_e64: case AMDGPU::V_ACCVGPR_READ_B32_e64: case AMDGPU::V_ACCVGPR_MOV_B32: @@ -3111,11 +3332,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, switch (DefMI.getOpcode()) { default: return false; + case AMDGPU::V_MOV_B64_e32: case AMDGPU::S_MOV_B64: - // TODO: We could fold 64-bit immediates, but this get complicated - // when there are sub-registers. 
- return false; - + case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::S_MOV_B64_IMM_PSEUDO: case AMDGPU::V_MOV_B32_e32: case AMDGPU::S_MOV_B32: case AMDGPU::V_ACCVGPR_WRITE_B32_e64: @@ -3128,19 +3348,45 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (!ImmOp->isImm()) return false; + auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t { + int64_t Imm = ImmOp->getImm(); + switch (UseOp.getSubReg()) { + default: + return Imm; + case AMDGPU::sub0: + return Lo_32(Imm); + case AMDGPU::sub1: + return Hi_32(Imm); + case AMDGPU::lo16: + return APInt(16, Imm).getSExtValue(); + case AMDGPU::hi16: + return APInt(32, Imm).ashr(16).getSExtValue(); + case AMDGPU::sub1_lo16: + return APInt(16, Hi_32(Imm)).getSExtValue(); + case AMDGPU::sub1_hi16: + return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue(); + } + }; + + assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form"); + unsigned Opc = UseMI.getOpcode(); if (Opc == AMDGPU::COPY) { + assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form"); + Register DstReg = UseMI.getOperand(0).getReg(); - bool Is16Bit = getOpSize(UseMI, 0) == 2; + unsigned OpSize = getOpSize(UseMI, 0); + bool Is16Bit = OpSize == 2; + bool Is64Bit = OpSize == 8; bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); - unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; - APInt Imm(32, ImmOp->getImm()); - - if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) - Imm = Imm.ashr(16); + unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO + : AMDGPU::V_MOV_B32_e32 + : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO + : AMDGPU::S_MOV_B32; + APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1))); if (RI.isAGPR(*MRI, DstReg)) { - if (!isInlineConstant(Imm)) + if (Is64Bit || !isInlineConstant(Imm)) return false; NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64; } @@ -3200,14 +3446,32 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); // Multiplied part is the constant: Use v_madmk_{f16, f32}. - // We should only expect these to be on src0 due to canonicalization. - if (Src0->isReg() && Src0->getReg() == Reg) { - if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) + if ((Src0->isReg() && Src0->getReg() == Reg) || + (Src1->isReg() && Src1->getReg() == Reg)) { + MachineOperand *RegSrc = + Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1; + if (!RegSrc->isReg()) + return false; + if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) && + ST.getConstantBusLimit(Opc) < 2) return false; if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) return false; + // If src2 is also a literal constant then we have to choose which one to + // fold. In general it is better to choose madak so that the other literal + // can be materialized in an sgpr instead of a vgpr: + // s_mov_b32 s0, literal + // v_madak_f32 v0, s0, v0, literal + // Instead of: + // v_mov_b32 v1, literal + // v_madmk_f32 v0, v0, literal, v1 + MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg()); + if (Def && Def->isMoveImmediate() && + !isInlineConstant(Def->getOperand(1))) + return false; + unsigned NewOpc = IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16 @@ -3216,18 +3480,22 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (pseudoToMCOpcode(NewOpc) == -1) return false; - // We need to swap operands 0 and 1 since madmk constant is at operand 1. 
+ // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite + // would also require restricting their register classes. For now + // just bail out. + if (NewOpc == AMDGPU::V_FMAMK_F16_t16) + return false; - const int64_t Imm = ImmOp->getImm(); + const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1); // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. - Register Src1Reg = Src1->getReg(); - unsigned Src1SubReg = Src1->getSubReg(); - Src0->setReg(Src1Reg); - Src0->setSubReg(Src1SubReg); - Src0->setIsKill(Src1->isKill()); + Register SrcReg = RegSrc->getReg(); + unsigned SrcSubReg = RegSrc->getSubReg(); + Src0->setReg(SrcReg); + Src0->setSubReg(SrcSubReg); + Src0->setIsKill(RegSrc->isKill()); if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || @@ -3249,43 +3517,38 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // Added part is the constant: Use v_madak_{f16, f32}. if (Src2->isReg() && Src2->getReg() == Reg) { - // Not allowed to use constant bus for another operand. - // We can however allow an inline immediate as src0. - bool Src0Inlined = false; - if (Src0->isReg()) { - // Try to inline constant if possible. - // If the Def moves immediate and the use is single - // We are saving VGPR here. - MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); - if (Def && Def->isMoveImmediate() && - isInlineConstant(Def->getOperand(1)) && - MRI->hasOneUse(Src0->getReg())) { - Src0->ChangeToImmediate(Def->getOperand(1).getImm()); - Src0Inlined = true; - } else if ((Src0->getReg().isPhysical() && - (ST.getConstantBusLimit(Opc) <= 1 && - RI.isSGPRClass(RI.getPhysRegBaseClass(Src0->getReg())))) || - (Src0->getReg().isVirtual() && - (ST.getConstantBusLimit(Opc) <= 1 && - RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) - return false; + if (ST.getConstantBusLimit(Opc) < 2) { + // Not allowed to use constant bus for another operand. + // We can however allow an inline immediate as src0. + bool Src0Inlined = false; + if (Src0->isReg()) { + // Try to inline constant if possible. + // If the Def moves immediate and the use is single + // We are saving VGPR here. 
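        // Note: with a single use, folding the inline immediate makes the
        // defining move dead, so its register (often a VGPR) is freed.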
+ MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); + if (Def && Def->isMoveImmediate() && + isInlineConstant(Def->getOperand(1)) && + MRI->hasOneUse(Src0->getReg())) { + Src0->ChangeToImmediate(Def->getOperand(1).getImm()); + Src0Inlined = true; + } else if (ST.getConstantBusLimit(Opc) <= 1 && + RI.isSGPRReg(*MRI, Src0->getReg())) { + return false; + } // VGPR is okay as Src0 - fallthrough - } + } - if (Src1->isReg() && !Src0Inlined ) { - // We have one slot for inlinable constant so far - try to fill it - MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); - if (Def && Def->isMoveImmediate() && - isInlineConstant(Def->getOperand(1)) && - MRI->hasOneUse(Src1->getReg()) && - commuteInstruction(UseMI)) { + if (Src1->isReg() && !Src0Inlined) { + // We have one slot for inlinable constant so far - try to fill it + MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); + if (Def && Def->isMoveImmediate() && + isInlineConstant(Def->getOperand(1)) && + MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI)) Src0->ChangeToImmediate(Def->getOperand(1).getImm()); - } else if ((Src1->getReg().isPhysical() && - RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) || - (Src1->getReg().isVirtual() && - RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) - return false; + else if (RI.isSGPRReg(*MRI, Src1->getReg())) + return false; // VGPR is okay as Src1 - fallthrough + } } unsigned NewOpc = @@ -3296,7 +3559,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (pseudoToMCOpcode(NewOpc) == -1) return false; - const int64_t Imm = ImmOp->getImm(); + // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite + // would also require restricting their register classes. For now + // just bail out. + if (NewOpc == AMDGPU::V_FMAAK_F16_t16) + return false; // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. @@ -3308,7 +3575,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); // ChangingToImmediate adds Src2 back to the instruction. - Src2->ChangeToImmediate(Imm); + Src2->ChangeToImmediate(getImmFor(*Src2)); // These come before src2. 
removeModOperands(UseMI); @@ -3403,19 +3670,30 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, if (isMUBUF(MIb) || isMTBUF(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(MIb) && !isSMRD(MIb); + if (isFLAT(MIb)) + return isFLATScratch(MIb); + + return !isSMRD(MIb); } if (isSMRD(MIa)) { if (isSMRD(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); + if (isFLAT(MIb)) + return isFLATScratch(MIb); + + return !isMUBUF(MIb) && !isMTBUF(MIb); } if (isFLAT(MIa)) { - if (isFLAT(MIb)) + if (isFLAT(MIb)) { + if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) || + (isFLATGlobal(MIa) && isFLATScratch(MIb))) + return true; + return checkInstOffsetsDoNotOverlap(MIa, MIb); + } return false; } @@ -3722,13 +4000,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, } bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { - return Opcode == AMDGPU::DS_ORDERED_COUNT || - Opcode == AMDGPU::DS_GWS_INIT || - Opcode == AMDGPU::DS_GWS_SEMA_V || - Opcode == AMDGPU::DS_GWS_SEMA_BR || - Opcode == AMDGPU::DS_GWS_SEMA_P || - Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || - Opcode == AMDGPU::DS_GWS_BARRIER; + return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode); } bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { @@ -3773,7 +4045,9 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const // However, executing them with EXEC = 0 causes them to operate on undefined // data, which we avoid by returning true here. if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || - Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32) + Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 || + Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR || + Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR) return true; return false; @@ -3827,9 +4101,7 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const { assert(!MO.isReg() && "isInlineConstant called on register operand!"); - if (!MO.isImm() || - OperandType < AMDGPU::OPERAND_SRC_FIRST || - OperandType > AMDGPU::OPERAND_SRC_LAST) + if (!MO.isImm()) return false; // MachineOperand provides no way to tell the true operand size, since it only @@ -3849,7 +4121,8 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: case AMDGPU::OPERAND_REG_INLINE_AC_INT32: - case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: { int32_t Trunc = static_cast<int32_t>(Imm); return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); } @@ -3877,12 +4150,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - // This suffers the same problem as the scalar 16-bit cases. 
- return AMDGPU::isInlinableIntLiteralV216(Imm); + return (isInt<16>(Imm) || isUInt<16>(Imm)) && + AMDGPU::isInlinableIntLiteral((int16_t)Imm); case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { // A few special case instructions have 16-bit operands on subtargets // where 16-bit instructions are not legal. @@ -3895,17 +4171,26 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, return false; } - case AMDGPU::OPERAND_REG_IMM_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { - uint32_t Trunc = static_cast<uint32_t>(Imm); - return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); - } case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: return false; + case AMDGPU::OPERAND_INPUT_MODS: + case MCOI::OPERAND_IMMEDIATE: + // Always embedded in the instruction for free. + return true; + case MCOI::OPERAND_UNKNOWN: + case MCOI::OPERAND_REGISTER: + case MCOI::OPERAND_PCREL: + case MCOI::OPERAND_GENERIC_0: + case MCOI::OPERAND_GENERIC_1: + case MCOI::OPERAND_GENERIC_2: + case MCOI::OPERAND_GENERIC_3: + case MCOI::OPERAND_GENERIC_4: + case MCOI::OPERAND_GENERIC_5: + // Just ignore anything else. + return true; default: - llvm_unreachable("invalid bitwidth"); + llvm_unreachable("invalid operand type"); } } @@ -4154,7 +4439,9 @@ static bool shouldReadExec(const MachineInstr &MI) { if (SIInstrInfo::isVALU(MI)) { switch (MI.getOpcode()) { case AMDGPU::V_READLANE_B32: + case AMDGPU::SI_RESTORE_S32_FROM_VGPR: case AMDGPU::V_WRITELANE_B32: + case AMDGPU::SI_SPILL_S32_TO_VGPR: return false; } @@ -4231,8 +4518,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return true; } - if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { - ErrInfo = "missing memory operand from MIMG instruction."; + if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { + ErrInfo = "missing memory operand from image instruction."; return false; } @@ -4276,6 +4563,12 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } break; } + case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: + if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) { + ErrInfo = "Expected inline constant for operand."; + return false; + } + break; case MCOI::OPERAND_IMMEDIATE: case AMDGPU::OPERAND_KIMM32: // Check if this operand is an immediate. @@ -4418,8 +4711,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } - // Verify MIMG - if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { + // Verify MIMG / VIMAGE / VSAMPLE + if (isImage(MI.getOpcode()) && !MI.mayStore()) { // Ensure that the return type used is large enough for all the options // being used TFE/LWE require an extra result register. const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); @@ -4683,12 +4976,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } - if (isMIMG(MI)) { + if (isImage(MI)) { const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); if (DimOp) { int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vaddr0); - int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); + int RSrcOpName = + isMIMG(MI) ? 
AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc; + int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName); const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); @@ -4709,16 +5004,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, IsA16 = A16->getImm() != 0; } - bool IsNSA = SRsrcIdx - VAddr0Idx > 1; + bool IsNSA = RsrcIdx - VAddr0Idx > 1; unsigned AddrWords = AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16()); unsigned VAddrWords; if (IsNSA) { - VAddrWords = SRsrcIdx - VAddr0Idx; - if (ST.hasPartialNSAEncoding() && AddrWords > ST.getNSAMaxSize()) { - unsigned LastVAddrIdx = SRsrcIdx - 1; + VAddrWords = RsrcIdx - VAddr0Idx; + if (ST.hasPartialNSAEncoding() && + AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) { + unsigned LastVAddrIdx = RsrcIdx - 1; VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1; } } else { @@ -4779,20 +5075,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } - int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); - if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && - ((DstIdx >= 0 && - (Desc.operands()[DstIdx].RegClass == AMDGPU::VReg_64RegClassID || - Desc.operands()[DstIdx].RegClass == - AMDGPU::VReg_64_Align2RegClassID)) || - ((Src0Idx >= 0 && - (Desc.operands()[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID || - Desc.operands()[Src0Idx].RegClass == - AMDGPU::VReg_64_Align2RegClassID)))) && - !AMDGPU::isLegal64BitDPPControl(DC)) { + !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) { ErrInfo = "Invalid dpp_ctrl value: " - "64 bit dpp only support row_newbcast"; + "DP ALU dpp only support row_newbcast"; return false; } } @@ -4884,6 +5170,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return true; } +// It is more readable to list mapped opcodes on the same line. 
+// clang-format off + unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: return AMDGPU::INSTRUCTION_LIST_END; @@ -4960,16 +5249,91 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; + case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64; + case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64; + case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64; + case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64; + case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64; + case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64; + case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64; + case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64; + case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64; + case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64; + case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64; + case AMDGPU::S_CEIL_F16: return AMDGPU::V_CEIL_F16_t16_e64; + case AMDGPU::S_FLOOR_F16: return AMDGPU::V_FLOOR_F16_t16_e64; + case AMDGPU::S_TRUNC_F16: return AMDGPU::V_TRUNC_F16_t16_e64; + case AMDGPU::S_RNDNE_F16: return AMDGPU::V_RNDNE_F16_t16_e64; + case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64; + case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64; + case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64; + case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64; + case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64; + case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64; + case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64; + case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64; + case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64; + case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64; + case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64; + case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64; + case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64; + case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64; + case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64; + case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64; + case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64; + case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32; + case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32; + case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64; + case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64; + case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64; + case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64; + case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64; + case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64; + case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64; + case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64; + case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64; + case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64; + case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64; + case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64; + case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64; + case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64; + case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64; + case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64; + case 
AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64; + case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64; + case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64; + case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64; + case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64; + case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64; + case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64; + case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64; + case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64; + case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64; + case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64; + case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64; + case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64; + case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_t16_e64; + case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64; + case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_t16_e64; + case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64; + case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_t16_e64; + case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64; + case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_t16_e64; + case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64; + case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_t16_e64; } llvm_unreachable( "Unexpected scalar opcode without corresponding vector one!"); } +// clang-format on + void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, - bool IsSCCLive) const { + bool IsSCCLive, + SlotIndexes *Indexes) const { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); bool IsWave32 = ST.isWave32(); @@ -4979,23 +5343,34 @@ void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF, // the single instruction S_OR_SAVEEXEC that clobbers SCC. unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill); - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); + auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg) + .addReg(Exec, RegState::Kill); + auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); + if (Indexes) { + Indexes->insertMachineInstrInMaps(*StoreExecMI); + Indexes->insertMachineInstrInMaps(*FlipExecMI); + } } else { const unsigned OrSaveExec = IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; auto SaveExec = BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1); SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. + if (Indexes) + Indexes->insertMachineInstrInMaps(*SaveExec); } } void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, Register Reg) const { + const DebugLoc &DL, Register Reg, + SlotIndexes *Indexes) const { unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; MCRegister Exec = isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill); + auto ExecRestoreMI = + BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill); + if (Indexes) + Indexes->insertMachineInstrInMaps(*ExecRestoreMI); } static const TargetRegisterClass * @@ -5102,13 +5477,10 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { MO.ChangeToRegister(Reg, false); } -unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - MachineOperand &SuperReg, - const TargetRegisterClass *SuperRC, - unsigned SubIdx, - const TargetRegisterClass *SubRC) - const { +unsigned SIInstrInfo::buildExtractSubReg( + MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, + const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, + unsigned SubIdx, const TargetRegisterClass *SubRC) const { MachineBasicBlock *MBB = MI->getParent(); DebugLoc DL = MI->getDebugLoc(); Register SubReg = MRI.createVirtualRegister(SubRC); @@ -5135,12 +5507,9 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, } MachineOperand SIInstrInfo::buildExtractSubRegOrImm( - MachineBasicBlock::iterator MII, - MachineRegisterInfo &MRI, - MachineOperand &Op, - const TargetRegisterClass *SuperRC, - unsigned SubIdx, - const TargetRegisterClass *SubRC) const { + MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI, + const MachineOperand &Op, const TargetRegisterClass *SuperRC, + unsigned SubIdx, const TargetRegisterClass *SubRC) const { if (Op.isImm()) { if (SubIdx == AMDGPU::sub0) return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); @@ -5235,9 +5604,8 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, return false; SGPRsUsed.insert(SGPR); } - } else if (InstDesc.operands()[i].OperandType == AMDGPU::OPERAND_KIMM32 || - (AMDGPU::isSISrcOperand(InstDesc, i) && - !isInlineConstant(Op, InstDesc.operands()[i]))) { + } else if (AMDGPU::isSISrcOperand(InstDesc, i) && + !isInlineConstant(Op, InstDesc.operands()[i])) { if (!LiteralLimit--) return false; if (--ConstantBusLimit <= 0) @@ -5285,6 +5653,27 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, return true; } + if (MO->isImm()) { + uint64_t Imm = MO->getImm(); + bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64; + bool Is64BitOp = Is64BitFPOp || + OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || + OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 || + OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; + if (Is64BitOp && + !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) { + if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp)) + return false; + + // FIXME: We can use sign extended 64-bit literals, but only for signed + // operands. At the moment we do not know if an operand is signed. + // Such operand will be encoded as its low 32 bits and then either + // correctly sign extended or incorrectly zero extended by HW. + if (!Is64BitFPOp && (int32_t)Imm < 0) + return false; + } + } + // Handle non-register types that are treated like immediates. assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()); @@ -5342,6 +5731,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) legalizeOpWithMove(MI, Src1Idx); + // Special case: V_FMAC_F32 and V_FMAC_F16 have src2. 
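  // Note: src2 of these VOP2 forms is tied to the destination, so it has to
  // live in a VGPR; anything else is copied into one via legalizeOpWithMove().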
+ if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) { + int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg())) + legalizeOpWithMove(MI, Src2Idx); + } + // VOP2 src0 instructions support all operand types, so we don't need to check // their legality. If src1 is already legal, we don't need to do anything. if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1)) @@ -5491,6 +5887,11 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, // legalize it. legalizeOpWithMove(MI, Idx); } + + // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst. + if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) && + !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg())) + legalizeOpWithMove(MI, VOP3Idx[2]); } Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, @@ -5862,6 +6263,17 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + // Save SCC. Waterfall Loop may overwrite SCC. + Register SaveSCCReg; + bool SCCNotDead = (MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI, 30) != + MachineBasicBlock::LQR_Dead); + if (SCCNotDead) { + SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg) + .addImm(1) + .addImm(0); + } + Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); // Save the EXEC mask @@ -5917,8 +6329,15 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps); - // Restore the EXEC mask MachineBasicBlock::iterator First = RemainderBB->begin(); + // Restore SCC + if (SCCNotDead) { + BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32)) + .addReg(SaveSCCReg, RegState::Kill) + .addImm(0); + } + + // Restore the EXEC mask BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); return BodyBB; } @@ -6103,18 +6522,33 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, return CreatedBB; } - // Legalize MIMG and MUBUF/MTBUF for shaders. + // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM + if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 || + MI.getOpcode() == AMDGPU::S_QUADMASK_B32 || + MI.getOpcode() == AMDGPU::S_QUADMASK_B64 || + MI.getOpcode() == AMDGPU::S_WQM_B32 || + MI.getOpcode() == AMDGPU::S_WQM_B64) { + MachineOperand &Src = MI.getOperand(1); + if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) + Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); + return CreatedBB; + } + + // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders. // // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via // scratch memory access. In both cases, the legalization never involves // conversion to the addr64 form. - if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) && - (isMUBUF(MI) || isMTBUF(MI)))) { - MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); + if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) && + (isMUBUF(MI) || isMTBUF(MI)))) { + int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? 
AMDGPU::OpName::rsrc + : AMDGPU::OpName::srsrc; + MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName); if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT); - MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); + int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp; + MachineOperand *SSamp = getNamedOperand(MI, SampOpName); if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT); @@ -6149,13 +6583,26 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, } } + // Legalize s_sleep_var. + if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) { + const DebugLoc &DL = MI.getDebugLoc(); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + int Src0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + MachineOperand &Src0 = MI.getOperand(Src0Idx); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src0); + Src0.ChangeToRegister(Reg, false); + return nullptr; + } + // Legalize MUBUF instructions. bool isSoffsetLegal = true; int SoffsetIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset); if (SoffsetIdx != -1) { MachineOperand *Soffset = &MI.getOperand(SoffsetIdx); - if (Soffset->isReg() && + if (Soffset->isReg() && Soffset->getReg().isVirtual() && !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) { isSoffsetLegal = false; } @@ -6370,10 +6817,11 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, default: break; case AMDGPU::S_ADD_U64_PSEUDO: + NewOpcode = AMDGPU::V_ADD_U64_PSEUDO; + break; case AMDGPU::S_SUB_U64_PSEUDO: - splitScalar64BitAddSub(Worklist, Inst, MDT); - Inst.eraseFromParent(); - return; + NewOpcode = AMDGPU::V_SUB_U64_PSEUDO; + break; case AMDGPU::S_ADD_I32: case AMDGPU::S_SUB_I32: { // FIXME: The u32 versions currently selected use the carry. @@ -6469,7 +6917,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, break; case AMDGPU::S_LSHL_B64: if (ST.hasOnlyRevVALUShifts()) { - NewOpcode = AMDGPU::V_LSHLREV_B64_e64; + NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12 + ? 
AMDGPU::V_LSHLREV_B64_pseudo_e64 + : AMDGPU::V_LSHLREV_B64_e64; swapOperands(Inst); } break; @@ -6623,21 +7073,98 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, case AMDGPU::S_CMP_LT_U32: case AMDGPU::S_CMP_LE_U32: case AMDGPU::S_CMP_EQ_U64: - case AMDGPU::S_CMP_LG_U64: { - const MCInstrDesc &NewDesc = get(NewOpcode); + case AMDGPU::S_CMP_LG_U64: + case AMDGPU::S_CMP_LT_F32: + case AMDGPU::S_CMP_EQ_F32: + case AMDGPU::S_CMP_LE_F32: + case AMDGPU::S_CMP_GT_F32: + case AMDGPU::S_CMP_LG_F32: + case AMDGPU::S_CMP_GE_F32: + case AMDGPU::S_CMP_O_F32: + case AMDGPU::S_CMP_U_F32: + case AMDGPU::S_CMP_NGE_F32: + case AMDGPU::S_CMP_NLG_F32: + case AMDGPU::S_CMP_NGT_F32: + case AMDGPU::S_CMP_NLE_F32: + case AMDGPU::S_CMP_NEQ_F32: + case AMDGPU::S_CMP_NLT_F32: + case AMDGPU::S_CMP_LT_F16: + case AMDGPU::S_CMP_EQ_F16: + case AMDGPU::S_CMP_LE_F16: + case AMDGPU::S_CMP_GT_F16: + case AMDGPU::S_CMP_LG_F16: + case AMDGPU::S_CMP_GE_F16: + case AMDGPU::S_CMP_O_F16: + case AMDGPU::S_CMP_U_F16: + case AMDGPU::S_CMP_NGE_F16: + case AMDGPU::S_CMP_NLG_F16: + case AMDGPU::S_CMP_NGT_F16: + case AMDGPU::S_CMP_NLE_F16: + case AMDGPU::S_CMP_NEQ_F16: + case AMDGPU::S_CMP_NLT_F16: { Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass()); - MachineInstr *NewInstr = - BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg) - .add(Inst.getOperand(0)) - .add(Inst.getOperand(1)); + auto NewInstr = + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg) + .setMIFlags(Inst.getFlags()); + if (AMDGPU::getNamedOperandIdx(NewOpcode, + AMDGPU::OpName::src0_modifiers) >= 0) { + NewInstr + .addImm(0) // src0_modifiers + .add(Inst.getOperand(0)) // src0 + .addImm(0) // src1_modifiers + .add(Inst.getOperand(1)) // src1 + .addImm(0); // clamp + } else { + NewInstr + .add(Inst.getOperand(0)) + .add(Inst.getOperand(1)); + } legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC); MachineOperand SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); + return; + } + case AMDGPU::S_CVT_HI_F32_F16: { + const DebugLoc &DL = Inst.getDebugLoc(); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) + .addImm(16) + .add(Inst.getOperand(1)); + BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) + .addImm(0) // src0_modifiers + .addReg(TmpReg) + .addImm(0) // clamp + .addImm(0); // omod + + MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst); + addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist); + Inst.eraseFromParent(); + return; } + case AMDGPU::S_MINIMUM_F32: + case AMDGPU::S_MAXIMUM_F32: + case AMDGPU::S_MINIMUM_F16: + case AMDGPU::S_MAXIMUM_F16: { + const DebugLoc &DL = Inst.getDebugLoc(); + Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) + .addImm(0) // src0_modifiers + .add(Inst.getOperand(1)) + .addImm(0) // src1_modifiers + .add(Inst.getOperand(2)) + .addImm(0) // clamp + .addImm(0); // omod + MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst); + + legalizeOperands(*NewInstr, MDT); + addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist); + Inst.eraseFromParent(); return; } + } if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { // We cannot move this instruction to the VALU, so we should try to @@ -6681,8 +7208,61 @@ void 
SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, // Use the new VALU Opcode. auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode)) .setMIFlags(Inst.getFlags()); - for (const MachineOperand &Op : Inst.explicit_operands()) - NewInstr->addOperand(Op); + if (isVOP3(NewOpcode) && !isVOP3(Opcode)) { + // Intersperse VOP3 modifiers among the SALU operands. + NewInstr->addOperand(Inst.getOperand(0)); + if (AMDGPU::getNamedOperandIdx(NewOpcode, + AMDGPU::OpName::src0_modifiers) >= 0) + NewInstr.addImm(0); + if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0) >= 0) + NewInstr->addOperand(Inst.getOperand(1)); + + if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { + // We are converting these to a BFE, so we need to add the missing + // operands for the size and offset. + unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; + NewInstr.addImm(0); + NewInstr.addImm(Size); + } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { + // The VALU version adds the second operand to the result, so insert an + // extra 0 operand. + NewInstr.addImm(0); + } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { + const MachineOperand &OffsetWidthOp = Inst.getOperand(2); + // If we need to move this to VGPRs, we need to unpack the second + // operand back into the 2 separate ones for bit offset and width. + assert(OffsetWidthOp.isImm() && + "Scalar BFE is only implemented for constant width and offset"); + uint32_t Imm = OffsetWidthOp.getImm(); + + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + NewInstr.addImm(Offset); + NewInstr.addImm(BitWidth); + } else { + if (AMDGPU::getNamedOperandIdx(NewOpcode, + AMDGPU::OpName::src1_modifiers) >= 0) + NewInstr.addImm(0); + if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0) + NewInstr->addOperand(Inst.getOperand(2)); + if (AMDGPU::getNamedOperandIdx(NewOpcode, + AMDGPU::OpName::src2_modifiers) >= 0) + NewInstr.addImm(0); + if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0) + NewInstr->addOperand(Inst.getOperand(3)); + if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0) + NewInstr.addImm(0); + if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0) + NewInstr.addImm(0); + if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0) + NewInstr.addImm(0); + } + } else { + // Just copy the SALU operands. + for (const MachineOperand &Op : Inst.explicit_operands()) + NewInstr->addOperand(Op); + } + // Remove any references to SCC. Vector instructions can't read from it, and // We're just about to add the implicit use / defs of VCC, and we don't want // both. @@ -6706,30 +7286,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); } - if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { - // We are converting these to a BFE, so we need to add the missing - // operands for the size and offset. - unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; - NewInstr.addImm(0); - NewInstr.addImm(Size); - } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { - // The VALU version adds the second operand to the result, so insert an - // extra 0 operand. 
- NewInstr.addImm(0); - } - if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { - const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2); - // If we need to move this to VGPRs, we need to unpack the second operand - // back into the 2 separate ones for bit offset and width. - assert(OffsetWidthOp.isImm() && - "Scalar BFE is only implemented for constant width and offset"); - uint32_t Imm = OffsetWidthOp.getImm(); - uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. - uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - NewInstr->removeOperand(2); - NewInstr.addImm(Offset); - NewInstr.addImm(BitWidth); - } fixImplicitOperands(*NewInstr); // Legalize the operands legalizeOperands(*NewInstr, MDT); @@ -6787,27 +7343,27 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineOperand &Src1 = Inst.getOperand(2); MachineOperand &Cond = Inst.getOperand(3); - Register SCCSource = Cond.getReg(); - bool IsSCC = (SCCSource == AMDGPU::SCC); + Register CondReg = Cond.getReg(); + bool IsSCC = (CondReg == AMDGPU::SCC); // If this is a trivial select where the condition is effectively not SCC - // (SCCSource is a source of copy to SCC), then the select is semantically - // equivalent to copying SCCSource. Hence, there is no need to create + // (CondReg is a source of copy to SCC), then the select is semantically + // equivalent to copying CondReg. Hence, there is no need to create // V_CNDMASK, we can just use that and bail out. if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() && (Src1.getImm() == 0)) { - MRI.replaceRegWith(Dest.getReg(), SCCSource); + MRI.replaceRegWith(Dest.getReg(), CondReg); return; } - const TargetRegisterClass *TC = - RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - - Register CopySCC = MRI.createVirtualRegister(TC); - + Register NewCondReg = CondReg; if (IsSCC) { + const TargetRegisterClass *TC = + RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + NewCondReg = MRI.createVirtualRegister(TC); + // Now look for the closest SCC def if it is a copy - // replacing the SCCSource with the COPY source register + // replacing the CondReg with the COPY source register bool CopyFound = false; for (MachineInstr &CandI : make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)), @@ -6815,7 +7371,7 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) { if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) { - BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC) + BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg) .addReg(CandI.getOperand(1).getReg()); CopyFound = true; } @@ -6830,24 +7386,31 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; auto NewSelect = - BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0); + BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0); NewSelect->getOperand(3).setIsUndef(Cond.isUndef()); } } - Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - auto UpdatedInst = - BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg) - .addImm(0) - .add(Src1) // False - .addImm(0) - .add(Src0) // True - .addReg(IsSCC ? 
CopySCC : SCCSource); - - MRI.replaceRegWith(Dest.getReg(), ResultReg); - legalizeOperands(*UpdatedInst, MDT); - addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); + Register NewDestReg = MRI.createVirtualRegister( + RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()))); + MachineInstr *NewInst; + if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) { + NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg) + .addImm(0) + .add(Src1) // False + .addImm(0) + .add(Src0) // True + .addReg(NewCondReg); + } else { + NewInst = + BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg) + .add(Src1) // False + .add(Src0) // True + .addReg(NewCondReg); + } + MRI.replaceRegWith(Dest.getReg(), NewDestReg); + legalizeOperands(*NewInst, MDT); + addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist); } void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist, @@ -7059,80 +7622,6 @@ void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitAddSub(SIInstrWorklist &Worklist, - MachineInstr &Inst, - MachineDominatorTree *MDT) const { - bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); - - MachineBasicBlock &MBB = *Inst.getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - - Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - Register CarryReg = MRI.createVirtualRegister(CarryRC); - Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); - - MachineOperand &Dest = Inst.getOperand(0); - MachineOperand &Src0 = Inst.getOperand(1); - MachineOperand &Src1 = Inst.getOperand(2); - const DebugLoc &DL = Inst.getDebugLoc(); - MachineBasicBlock::iterator MII = Inst; - - const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); - const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); - const TargetRegisterClass *Src0SubRC = - RI.getSubRegisterClass(Src0RC, AMDGPU::sub0); - const TargetRegisterClass *Src1SubRC = - RI.getSubRegisterClass(Src1RC, AMDGPU::sub0); - - MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub0, Src0SubRC); - MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, - AMDGPU::sub0, Src1SubRC); - - - MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub1, Src0SubRC); - MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, - AMDGPU::sub1, Src1SubRC); - - unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; - MachineInstr *LoHalf = - BuildMI(MBB, MII, DL, get(LoOpc), DestSub0) - .addReg(CarryReg, RegState::Define) - .add(SrcReg0Sub0) - .add(SrcReg1Sub0) - .addImm(0); // clamp bit - - unsigned HiOpc = IsAdd ? 
AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; - MachineInstr *HiHalf = - BuildMI(MBB, MII, DL, get(HiOpc), DestSub1) - .addReg(DeadCarryReg, RegState::Define | RegState::Dead) - .add(SrcReg0Sub1) - .add(SrcReg1Sub1) - .addReg(CarryReg, RegState::Kill) - .addImm(0); // clamp bit - - BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); - - MRI.replaceRegWith(Dest.getReg(), FullDestReg); - - // Try to legalize the operands in case we need to swap the order to keep it - // valid. - legalizeOperands(*LoHalf, MDT); - legalizeOperands(*HiHalf, MDT); - - // Move all users of this moved value. - addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); -} - void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode, MachineDominatorTree *MDT) const { @@ -7980,9 +8469,36 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const { return ArrayRef(TargetFlags); } -bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { - return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && - MI.modifiesRegister(AMDGPU::EXEC, &RI); +unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg, + const MachineFunction &MF) const { + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + assert(SrcReg.isVirtual()); + if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG)) + return AMDGPU::WWM_COPY; + + return AMDGPU::COPY; +} + +bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI, + Register Reg) const { + // We need to handle instructions which may be inserted during register + // allocation to handle the prolog. The initial prolog instruction may have + // been separated from the start of the block by spills and copies inserted + // needed by the prolog. However, the insertions for scalar registers can + // always be placed at the BB top as they are independent of the exec mask + // value. + bool IsNullOrVectorRegister = true; + if (Reg) { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)); + } + + uint16_t Opc = MI.getOpcode(); + // FIXME: Copies inserted in the block prolog for live-range split should also + // be included. + return IsNullOrVectorRegister && + (isSpillOpcode(Opc) || (!MI.isTerminator() && Opc != AMDGPU::COPY && + MI.modifiesRegister(AMDGPU::EXEC, &RI))); } MachineInstrBuilder @@ -8045,7 +8561,16 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con } } -unsigned SIInstrInfo::getMaxMUBUFImmOffset() { return (1 << 12) - 1; } +bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const { + return Imm <= getMaxMUBUFImmOffset(ST); +} + +unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) { + // GFX12 field is non-negative 24-bit signed byte offset. + const unsigned OffsetBits = + ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12; + return (1 << OffsetBits) - 1; +} void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { if (!ST.isWave32()) @@ -8082,7 +8607,7 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { // offsets within the given alignment can be added to the resulting ImmOffset. 
bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment) const { - const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(); + const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST); const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value()); uint32_t Overflow = 0; @@ -8108,11 +8633,17 @@ bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, } } - // There is a hardware bug in SI and CI which prevents address clamping in - // MUBUF instructions from working correctly with SOffsets. The immediate - // offset is unaffected. - if (Overflow > 0 && ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) - return false; + if (Overflow > 0) { + // There is a hardware bug in SI and CI which prevents address clamping in + // MUBUF instructions from working correctly with SOffsets. The immediate + // offset is unaffected. + if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) + return false; + + // It is not possible to set immediate in SOffset field on some targets. + if (ST.hasRestrictedSOffset()) + return false; + } ImmOffset = Imm; SOffset = Overflow; @@ -8160,16 +8691,13 @@ bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, AddrSpace == AMDGPUAS::GLOBAL_ADDRESS)) return false; - bool AllowNegative = FlatVariant != SIInstrFlags::FLAT; - if (ST.hasNegativeScratchOffsetBug() && - FlatVariant == SIInstrFlags::FlatScratch) - AllowNegative = false; if (ST.hasNegativeUnalignedScratchOffsetBug() && FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 && (Offset % 4) != 0) { return false; } + bool AllowNegative = allowNegativeFlatOffset(FlatVariant); unsigned N = AMDGPU::getNumFlatOffsetBits(ST); return isIntN(N, Offset) && (AllowNegative || Offset >= 0); } @@ -8180,12 +8708,10 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const { int64_t RemainderOffset = COffsetVal; int64_t ImmField = 0; - bool AllowNegative = FlatVariant != SIInstrFlags::FLAT; - if (ST.hasNegativeScratchOffsetBug() && - FlatVariant == SIInstrFlags::FlatScratch) - AllowNegative = false; + bool AllowNegative = allowNegativeFlatOffset(FlatVariant); const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1; + if (AllowNegative) { // Use signed division by a power of two to truncate towards 0. 
int64_t D = 1LL << NumBits; @@ -8209,6 +8735,14 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, return {ImmField, RemainderOffset}; } +bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const { + if (ST.hasNegativeScratchOffsetBug() && + FlatVariant == SIInstrFlags::FlatScratch) + return false; + + return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST); +} + static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) { switch (ST.getGeneration()) { default: @@ -8223,6 +8757,8 @@ static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) { return SIEncodingFamily::GFX10; case AMDGPUSubtarget::GFX11: return SIEncodingFamily::GFX11; + case AMDGPUSubtarget::GFX12: + return SIEncodingFamily::GFX12; } llvm_unreachable("Unknown subtarget generation!"); } @@ -8248,6 +8784,9 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const { } int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { + if (SIInstrInfo::isSoftWaitcnt(Opcode)) + Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode); + unsigned Gen = subtargetEncodingFamily(ST); if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && @@ -8282,6 +8821,12 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); + // TODO-GFX12: Remove this. + // Hack to allow some GFX12 codegen tests to run before all the encodings are + // implemented. + if (MCOp == (uint16_t)-1 && Gen == SIEncodingFamily::GFX12) + MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX11); + // -1 means that Opcode is already a native instruction. if (MCOp == -1) return Opcode; @@ -8531,7 +9076,7 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl( // A similar issue also exists with spilling and reloading $exec registers. // // To prevent that, constrain the %0 register class here. 
- if (MI.isFullCopy()) { + if (isFullCopyInstr(MI)) { Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); if ((DstReg.isVirtual() || SrcReg.isVirtual()) && @@ -8572,9 +9117,8 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, InstructionUniformity SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { unsigned opcode = MI.getOpcode(); - if (opcode == AMDGPU::G_INTRINSIC || - opcode == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS) { - auto IID = static_cast<Intrinsic::ID>(MI.getIntrinsicID()); + if (auto *GI = dyn_cast<GIntrinsic>(&MI)) { + auto IID = GI->getIntrinsicID(); if (AMDGPU::isIntrinsicSourceOfDivergence(IID)) return InstructionUniformity::NeverUniform; if (AMDGPU::isIntrinsicAlwaysUniform(IID)) @@ -8612,7 +9156,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) || opcode == AMDGPU::G_ATOMIC_CMPXCHG || - opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS) { + opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS || + AMDGPU::isGenericAtomic(opcode)) { return InstructionUniformity::NeverUniform; } return InstructionUniformity::Default; @@ -8625,10 +9170,12 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const { return InstructionUniformity::NeverUniform; unsigned opcode = MI.getOpcode(); - if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32) + if (opcode == AMDGPU::V_READLANE_B32 || + opcode == AMDGPU::V_READFIRSTLANE_B32 || + opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR) return InstructionUniformity::AlwaysUniform; - if (MI.isCopy()) { + if (isCopyInstr(MI)) { const MachineOperand &srcOp = MI.getOperand(1); if (srcOp.isReg() && srcOp.getReg().isPhysical()) { const TargetRegisterClass *regClass = diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 66f93e5640d6..affe52046752 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -43,7 +43,7 @@ static const MachineMemOperand::Flags MONoClobber = /// Utility to store machine instructions worklist. 
struct SIInstrWorklist { - SIInstrWorklist() : InstrList() {} + SIInstrWorklist() = default; void insert(MachineInstr *MI); @@ -102,16 +102,15 @@ private: public: unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, - MachineOperand &SuperReg, + const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const; - MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - MachineOperand &SuperReg, - const TargetRegisterClass *SuperRC, - unsigned SubIdx, - const TargetRegisterClass *SubRC) const; + MachineOperand buildExtractSubRegOrImm( + MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, + const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, + unsigned SubIdx, const TargetRegisterClass *SubRC) const; + private: void swapOperands(MachineInstr &Inst) const; @@ -135,9 +134,6 @@ private: void splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode, bool Swap = false) const; - void splitScalar64BitAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, - MachineDominatorTree *MDT = nullptr) const; - void splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode, MachineDominatorTree *MDT = nullptr) const; @@ -170,6 +166,12 @@ private: Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const; protected: + /// If the specific machine instruction is a instruction that moves/copies + /// value from one register to another register return destination and source + /// registers as machine operands. + std::optional<DestSourcePair> + isCopyInstrImpl(const MachineInstr &MI) const override; + bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const; @@ -216,6 +218,9 @@ public: bool isIgnorableUse(const MachineOperand &MO) const override; + bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, + MachineCycleInfo *CI) const override; + bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override; @@ -226,8 +231,11 @@ public: const TargetRegisterInfo *TRI) const final; bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, + int64_t Offset1, bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2, - unsigned NumLoads, unsigned NumBytes) const override; + int64_t Offset2, bool OffsetIsScalable2, + unsigned ClusterSize, + unsigned NumBytes) const override; bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override; @@ -266,6 +274,11 @@ public: bool expandPostRAPseudo(MachineInstr &MI) const override; + void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + Register DestReg, unsigned SubIdx, + const MachineInstr &Orig, + const TargetRegisterInfo &TRI) const override; + // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp // instructions. Returns a pair of generated instructions. 
// Can split either post-RA with physical registers or pre-RA with @@ -395,12 +408,20 @@ public: return get(Opcode).TSFlags & SIInstrFlags::VALU; } + static bool isImage(const MachineInstr &MI) { + return isMIMG(MI) || isVSAMPLE(MI) || isVIMAGE(MI); + } + + bool isImage(uint16_t Opcode) const { + return isMIMG(Opcode) || isVSAMPLE(Opcode) || isVIMAGE(Opcode); + } + static bool isVMEM(const MachineInstr &MI) { - return isMUBUF(MI) || isMTBUF(MI) || isMIMG(MI); + return isMUBUF(MI) || isMTBUF(MI) || isImage(MI); } bool isVMEM(uint16_t Opcode) const { - return isMUBUF(Opcode) || isMTBUF(Opcode) || isMIMG(Opcode); + return isMUBUF(Opcode) || isMTBUF(Opcode) || isImage(Opcode); } static bool isSOP1(const MachineInstr &MI) { @@ -525,6 +546,22 @@ public: return get(Opcode).TSFlags & SIInstrFlags::DS; } + static bool isLDSDMA(const MachineInstr &MI) { + return isVALU(MI) && (isMUBUF(MI) || isFLAT(MI)); + } + + bool isLDSDMA(uint16_t Opcode) { + return isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode)); + } + + static bool isGWS(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::GWS; + } + + bool isGWS(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::GWS; + } + bool isAlwaysGDS(uint16_t Opcode) const; static bool isMIMG(const MachineInstr &MI) { @@ -535,6 +572,22 @@ public: return get(Opcode).TSFlags & SIInstrFlags::MIMG; } + static bool isVIMAGE(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VIMAGE; + } + + bool isVIMAGE(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VIMAGE; + } + + static bool isVSAMPLE(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VSAMPLE; + } + + bool isVSAMPLE(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VSAMPLE; + } + static bool isGather4(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::Gather4; } @@ -622,6 +675,10 @@ public: SIInstrFlags::IsAtomicNoRet); } + static bool mayWriteLDSThroughDMA(const MachineInstr &MI) { + return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD; + } + static bool isWQM(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::WQM; } @@ -654,9 +711,21 @@ public: return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill; } + bool isSpillOpcode(uint16_t Opcode) const { + return get(Opcode).TSFlags & + (SIInstrFlags::SGPRSpill | SIInstrFlags::VGPRSpill); + } + static bool isWWMRegSpillOpcode(uint16_t Opcode) { return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE || - Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE; + Opcode == AMDGPU::SI_SPILL_WWM_AV32_SAVE || + Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE || + Opcode == AMDGPU::SI_SPILL_WWM_AV32_RESTORE; + } + + static bool isChainCallOpcode(uint64_t Opcode) { + return Opcode == AMDGPU::SI_CS_CHAIN_TC_W32 || + Opcode == AMDGPU::SI_CS_CHAIN_TC_W64; } static bool isDPP(const MachineInstr &MI) { @@ -826,8 +895,34 @@ public: return get(Opcode).TSFlags & SIInstrFlags::TiedSourceNotRead; } + static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) { + if (isWaitcnt(Opcode)) + return AMDGPU::S_WAITCNT; + + if (isWaitcntVsCnt(Opcode)) + return AMDGPU::S_WAITCNT_VSCNT; + + llvm_unreachable("Expected opcode S_WAITCNT/S_WAITCNT_VSCNT"); + } + + static bool isWaitcnt(unsigned Opcode) { + return Opcode == AMDGPU::S_WAITCNT || Opcode == AMDGPU::S_WAITCNT_soft; + } + + static bool isWaitcntVsCnt(unsigned Opcode) { + return Opcode == AMDGPU::S_WAITCNT_VSCNT || + Opcode == AMDGPU::S_WAITCNT_VSCNT_soft; + } + + // "Soft" waitcnt instructions 
can be relaxed/optimized out by + // SIInsertWaitcnts. + static bool isSoftWaitcnt(unsigned Opcode) { + return Opcode == AMDGPU::S_WAITCNT_soft || + Opcode == AMDGPU::S_WAITCNT_VSCNT_soft; + } + bool isVGPRCopy(const MachineInstr &MI) const { - assert(MI.isCopy()); + assert(isCopyInstr(MI)); Register Dest = MI.getOperand(0).getReg(); const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -897,7 +992,7 @@ public: if (OpIdx >= MI.getDesc().NumOperands) return false; - if (MI.isCopy()) { + if (isCopyInstr(MI)) { unsigned Size = getOpSize(MI, OpIdx); assert(Size == 8 || Size == 4); @@ -946,12 +1041,12 @@ public: void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, Register Reg, - bool IsSCCLive) const; + const DebugLoc &DL, Register Reg, bool IsSCCLive, + SlotIndexes *Indexes = nullptr) const; void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - Register Reg) const; + Register Reg, SlotIndexes *Indexes = nullptr) const; /// Return the correct register class for \p OpNo. For target-specific /// instructions, this will return the register class that has been defined @@ -1143,7 +1238,11 @@ public: CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override; - bool isBasicBlockPrologue(const MachineInstr &MI) const override; + unsigned getLiveRangeSplitOpcode(Register Reg, + const MachineFunction &MF) const override; + + bool isBasicBlockPrologue(const MachineInstr &MI, + Register Reg = Register()) const override; MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, @@ -1176,11 +1275,9 @@ public: static bool isKillTerminator(unsigned Opcode); const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const; - static bool isLegalMUBUFImmOffset(unsigned Imm) { - return isUInt<12>(Imm); - } + bool isLegalMUBUFImmOffset(unsigned Imm) const; - static unsigned getMaxMUBUFImmOffset(); + static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST); bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment = Align(4)) const; @@ -1197,6 +1294,9 @@ public: unsigned AddrSpace, uint64_t FlatVariant) const; + /// Returns true if negative offsets are allowed for the given \p FlatVariant. + bool allowNegativeFlatOffset(uint64_t FlatVariant) const; + /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. /// Return -1 if the target-specific opcode for the pseudo instruction does /// not exist. If Opcode is not a pseudo instruction, this is identity. 
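
The MUBUF offset helpers changed above make the legal immediate range generation-dependent: pre-GFX12 targets keep the 12-bit field, while GFX12 exposes a 24-bit signed byte offset of which only the non-negative half is used here. Below is a minimal standalone C++ sketch of that bound check; it restates only what the new getMaxMUBUFImmOffset and isLegalMUBUFImmOffset bodies in this patch show, and the Gen enum is a stand-in for the real GCNSubtarget query.

    #include <cassert>
    #include <cstdint>

    enum class Gen { PreGFX12, GFX12 }; // stand-in for GCNSubtarget::getGeneration()

    constexpr uint32_t maxMUBUFImmOffset(Gen G) {
      // GFX12: 24-bit signed byte offset, of which only non-negative values are
      // produced here, hence 23 usable magnitude bits; older targets keep 12 bits.
      const unsigned OffsetBits = (G == Gen::GFX12) ? 23 : 12;
      return (1u << OffsetBits) - 1;
    }

    constexpr bool isLegalMUBUFImmOffset(uint32_t Imm, Gen G) {
      return Imm <= maxMUBUFImmOffset(G);
    }

    int main() {
      static_assert(maxMUBUFImmOffset(Gen::PreGFX12) == 4095, "old 12-bit field");
      static_assert(maxMUBUFImmOffset(Gen::GFX12) == 8388607, "new 23-bit range");
      assert(isLegalMUBUFImmOffset(4096, Gen::GFX12) &&
             !isLegalMUBUFImmOffset(4096, Gen::PreGFX12));
      return 0;
    }

The same subtarget-dependent maximum feeds splitMUBUFOffset in the SIInstrInfo.cpp hunk earlier in this diff, which now also refuses to move overflow into SOffset when the subtarget has a restricted SOffset field.
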
@@ -1378,6 +1478,13 @@ namespace AMDGPU { } // end namespace AMDGPU +namespace AMDGPU { +enum AsmComments { + // For sgpr to vgpr spill instructions + SGPR_SPILL = MachineInstr::TAsmComments +}; +} // namespace AMDGPU + namespace SI { namespace KernelInputOffsets { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 044bc4507d3a..173c877b8d29 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -32,6 +32,7 @@ def SIEncodingFamily { int GFX90A = 8; int GFX940 = 9; int GFX11 = 10; + int GFX12 = 11; } //===----------------------------------------------------------------------===// @@ -158,36 +159,18 @@ def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16", SDTBufferStore, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; -class SDBufferAtomic<string opcode> : SDNode <opcode, - SDTypeProfile<1, 8, - [SDTCisVT<2, v4i32>, // rsrc - SDTCisVT<3, i32>, // vindex(VGPR) - SDTCisVT<4, i32>, // voffset(VGPR) - SDTCisVT<5, i32>, // soffset(SGPR) - SDTCisVT<6, i32>, // offset(imm) - SDTCisVT<7, i32>, // cachepolicy(imm) - SDTCisVT<8, i1>]>, // idxen(imm) - [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] ->; - -def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">; -def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">; -def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">; -def SIbuffer_atomic_smin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SMIN">; -def SIbuffer_atomic_umin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMIN">; -def SIbuffer_atomic_smax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SMAX">; -def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">; -def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">; -def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">; -def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">; -def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">; -def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; -def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">; -def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; -def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; -def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; - -multiclass SDBufferAtomicNoRet { +multiclass SDBufferAtomic<string opcode> { + def "" : SDNode <opcode, + SDTypeProfile<1, 8, + [SDTCisVT<2, v4i32>, // rsrc + SDTCisVT<3, i32>, // vindex(VGPR) + SDTCisVT<4, i32>, // voffset(VGPR) + SDTCisVT<5, i32>, // soffset(SGPR) + SDTCisVT<6, i32>, // offset(imm) + SDTCisVT<7, i32>, // cachepolicy(imm) + SDTCisVT<8, i1>]>, // idxen(imm) + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] + >; def "_noret" : PatFrag< (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, node:$idxen), @@ -198,28 +181,26 @@ multiclass SDBufferAtomicNoRet { } } -defm SIbuffer_atomic_swap : SDBufferAtomicNoRet; -defm SIbuffer_atomic_add : SDBufferAtomicNoRet; -defm SIbuffer_atomic_sub : SDBufferAtomicNoRet; -defm SIbuffer_atomic_smin : SDBufferAtomicNoRet; -defm SIbuffer_atomic_umin : SDBufferAtomicNoRet; -defm SIbuffer_atomic_smax : SDBufferAtomicNoRet; -defm SIbuffer_atomic_umax : SDBufferAtomicNoRet; -defm 
SIbuffer_atomic_and : SDBufferAtomicNoRet; -defm SIbuffer_atomic_or : SDBufferAtomicNoRet; -defm SIbuffer_atomic_xor : SDBufferAtomicNoRet; -defm SIbuffer_atomic_inc : SDBufferAtomicNoRet; -defm SIbuffer_atomic_dec : SDBufferAtomicNoRet; -defm SIbuffer_atomic_fadd : SDBufferAtomicNoRet; -defm SIbuffer_atomic_fmin : SDBufferAtomicNoRet; -defm SIbuffer_atomic_fmax : SDBufferAtomicNoRet; +defm SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">; +defm SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">; +defm SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">; +defm SIbuffer_atomic_smin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SMIN">; +defm SIbuffer_atomic_umin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMIN">; +defm SIbuffer_atomic_smax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SMAX">; +defm SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">; +defm SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">; +defm SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">; +defm SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">; +defm SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">; +defm SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; +defm SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">; +defm SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; +defm SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; +defm SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, - [SDTCisVT<0, i32>, // dst - SDTCisVT<1, i32>, // src - SDTCisVT<2, i32>, // cmp - SDTCisVT<3, v4i32>, // rsrc + [SDTCisVT<3, v4i32>, // rsrc SDTCisVT<4, i32>, // vindex(VGPR) SDTCisVT<5, i32>, // voffset(VGPR) SDTCisVT<6, i32>, // soffset(SGPR) @@ -604,14 +585,14 @@ def atomic_store_64_glue : PatFrag < } let IsAtomic = 1, AddressSpaces = StoreAddress_local.AddrSpaces in { -def atomic_store_8_local_m0 : PatFrag<(ops node:$ptr, node:$val), - (atomic_store_8_glue node:$ptr, node:$val)>; -def atomic_store_16_local_m0 : PatFrag<(ops node:$ptr, node:$val), - (atomic_store_16_glue node:$ptr, node:$val)>; -def atomic_store_32_local_m0 : PatFrag<(ops node:$ptr, node:$val), - (atomic_store_32_glue node:$ptr, node:$val)>; -def atomic_store_64_local_m0 : PatFrag<(ops node:$ptr, node:$val), - (atomic_store_64_glue node:$ptr, node:$val)>; +def atomic_store_8_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (atomic_store_8_glue node:$val, node:$ptr)>; +def atomic_store_16_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (atomic_store_16_glue node:$val, node:$ptr)>; +def atomic_store_32_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (atomic_store_32_glue node:$val, node:$ptr)>; +def atomic_store_64_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (atomic_store_64_glue node:$val, node:$ptr)>; } // End let IsAtomic = 1, AddressSpaces = StoreAddress_local.AddrSpaces @@ -906,11 +887,19 @@ def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{ //===----------------------------------------------------------------------===// def extract_cpol : SDNodeXForm<timm, [{ - return CurDAG->getTargetConstant(N->getZExtValue() & AMDGPU::CPol::ALL, SDLoc(N), MVT::i8); + return CurDAG->getTargetConstant( + N->getZExtValue() & (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12 + ? 
AMDGPU::CPol::ALL + : AMDGPU::CPol::ALL_pregfx12), + SDLoc(N), MVT::i8); }]>; def extract_swz : SDNodeXForm<timm, [{ - return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8); + const bool Swizzle = + N->getZExtValue() & (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12 + ? AMDGPU::CPol::SWZ + : AMDGPU::CPol::SWZ_pregfx12); + return CurDAG->getTargetConstant(Swizzle, SDLoc(N), MVT::i8); }]>; def set_glc : SDNodeXForm<timm, [{ @@ -938,6 +927,13 @@ def InterpAttr : CustomOperand<i32>; def InterpAttrChan : ImmOperand<i32>; +def SplitBarrier : ImmOperand<i32> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_INLINE_SPLIT_BARRIER_INT32"; + let DecoderMethod = "decodeSplitBarrier"; + let PrintMethod = "printOperand"; +} + def VReg32OrOffClass : AsmOperandClass { let Name = "VReg32OrOff"; let ParserMethod = "parseVReg32OrOff"; @@ -1044,6 +1040,7 @@ class NamedIntOperand<ValueType Type, string Prefix, string Name = NAME, class NamedBitOperand<string Id, string Name = NAME> : CustomOperand<i1, 1, Name> { + let PredicateMethod = "isImmTy<AMDGPUOperand::"#ImmTy#">"; let ParserMethod = "[this](OperandVector &Operands) -> ParseStatus { "# "return parseNamedBit(\""#Id#"\", Operands, AMDGPUOperand::"#ImmTy#"); }"; @@ -1054,8 +1051,8 @@ class NamedBitOperand<string Id, string Name = NAME> class DefaultOperand<CustomOperand Op, int Value> : OperandWithDefaultOps<Op.Type, (ops (Op.Type Value))>, - CustomOperandProps<1, Op.ParserMatchClass.Name> { - let ParserMethod = Op.ParserMatchClass.ParserMethod; + CustomOperandProps<1> { + let ParserMatchClass = Op.ParserMatchClass; let PrintMethod = Op.PrintMethod; } @@ -1096,6 +1093,10 @@ def highmod : NamedBitOperand<"high", "High">; def CPol : CustomOperand<i32, 1>; def CPol_0 : DefaultOperand<CPol, 0>; def CPol_GLC1 : DefaultOperand<CPol, 1>; +def CPol_GLC : ValuePredicatedOperand<CPol, "Op.getImm() & CPol::GLC">; +def CPol_NonGLC : ValuePredicatedOperand<CPol, "!(Op.getImm() & CPol::GLC)", 1>; +def CPol_GLC_WithDefault : DefaultOperand<CPol_GLC, !shl(1, CPolBit.GLC)>; +def CPol_NonGLC_WithDefault : DefaultOperand<CPol_NonGLC, 0>; def TFE : NamedBitOperand<"tfe">; def UNorm : NamedBitOperand<"unorm">; @@ -1170,6 +1171,10 @@ class FPVCSrcInputModsMatchClass <int opSize> : FPInputModsMatchClass <opSize> { } def FP16InputModsMatchClass : FPInputModsMatchClass<16>; +def FPT16InputModsMatchClass : FPInputModsMatchClass<16> { + let Name = "RegOrImmWithFPT16InputMods"; + let PredicateMethod = "isRegOrImmWithFPT16InputMods"; +} def FP32InputModsMatchClass : FPInputModsMatchClass<32>; def FP64InputModsMatchClass : FPInputModsMatchClass<64>; @@ -1187,6 +1192,7 @@ class FPInputMods <FPInputModsMatchClass matchClass> : InputMods <matchClass> { } def FP16InputMods : FPInputMods<FP16InputModsMatchClass>; +def FPT16InputMods : FPInputMods<FPT16InputModsMatchClass>; def FP32InputMods : FPInputMods<FP32InputModsMatchClass>; def FP64InputMods : FPInputMods<FP64InputModsMatchClass>; @@ -1202,6 +1208,10 @@ class IntVCSrcInputModsMatchClass <int opSize> : IntInputModsMatchClass <opSize> let Name = "RegOrInlineImmWithInt"#opSize#"InputMods"; let PredicateMethod = "isRegOrInlineImmWithInt"#opSize#"InputMods"; } +def IntT16InputModsMatchClass : IntInputModsMatchClass<16> { + let Name = "RegOrImmWithIntT16InputMods"; + let PredicateMethod = "isRegOrImmWithIntT16InputMods"; +} def Int32InputModsMatchClass : IntInputModsMatchClass<32>; def Int64InputModsMatchClass : IntInputModsMatchClass<64>; def Int32VCSrcInputModsMatchClass : 
IntVCSrcInputModsMatchClass<32>; @@ -1209,6 +1219,7 @@ def Int32VCSrcInputModsMatchClass : IntVCSrcInputModsMatchClass<32>; class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> { let PrintMethod = "printOperandAndIntInputMods"; } +def IntT16InputMods : IntInputMods<IntT16InputModsMatchClass>; def Int32InputMods : IntInputMods<Int32InputModsMatchClass>; def Int64InputMods : IntInputMods<Int64InputModsMatchClass>; def Int32VCSrcInputMods : IntInputMods<Int32VCSrcInputModsMatchClass>; @@ -1463,15 +1474,18 @@ class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> { // Returns the register class to use for the destination of VOP[123C] // instructions for the given VT. -class getVALUDstForVT<ValueType VT> { +class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> { + defvar op16 = !if(IsTrue16, !if (IsVOP3Encoding, VOPDstOperand_t16, + VOPDstOperand_t16Lo128), + VOPDstOperand<VGPR_32>); RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>, !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, - !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, + !if(!eq(VT.Size, 16), op16, VOPDstS64orS32)))); // else VT == i1 } -class getVALUDstForVT_t16<ValueType VT> { +class getVALUDstForVT_fake16<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>, !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, @@ -1489,7 +1503,7 @@ class getSDWADstForVT<ValueType VT> { // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. -class getVOPSrc0ForVT<ValueType VT, bit IsTrue16> { +class getVOPSrc0ForVT<ValueType VT, bit IsTrue16, bit IsFake16 = 1> { bit isFP = isFloatType<VT>.ret; RegisterOperand ret = @@ -1498,7 +1512,7 @@ class getVOPSrc0ForVT<ValueType VT, bit IsTrue16> { VSrc_f64, !if(!eq(VT.Value, f16.Value), !if(IsTrue16, - VSrcT_f16_Lo128, + !if(IsFake16, VSrcFake16_f16_Lo128, VSrcT_f16_Lo128), VSrc_f16 ), !if(!eq(VT.Value, v2f16.Value), @@ -1514,7 +1528,7 @@ class getVOPSrc0ForVT<ValueType VT, bit IsTrue16> { VSrc_b64, !if(!eq(VT.Value, i16.Value), !if(IsTrue16, - VSrcT_b16_Lo128, + !if(IsFake16, VSrcFake16_b16_Lo128, VSrcT_b16_Lo128), VSrc_b16 ), !if(!eq(VT.Value, v2i16.Value), @@ -1539,13 +1553,17 @@ class getVregSrcForVT<ValueType VT> { VGPR_32)))); } -class getVregSrcForVT_t16<ValueType VT> { +class getVregSrcForVT_t16<ValueType VT, bit IsFake16 = 1> { RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128, !if(!eq(VT.Size, 96), VReg_96, !if(!eq(VT.Size, 64), VReg_64, !if(!eq(VT.Size, 48), VReg_64, - !if(!eq(VT.Size, 16), VGPR_32_Lo128, + !if(!eq(VT.Size, 16), + !if(IsFake16, VGPR_32_Lo128, VGPR_16_Lo128), VGPR_32))))); + + RegisterOperand op = !if (!and(!eq(VT.Size, 16), !not(IsFake16)), + VGPRSrc_16_Lo128, RegisterOperand<ret>); } class getSDWASrcForVT <ValueType VT> { @@ -1557,7 +1575,7 @@ class getSDWASrcForVT <ValueType VT> { // Returns the register class to use for sources of VOP3 instructions for the // given VT. 
-class getVOP3SrcForVT<ValueType VT> { +class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> { bit isFP = isFloatType<VT>.ret; RegisterOperand ret = !if(!eq(VT.Size, 128), @@ -1574,7 +1592,7 @@ class getVOP3SrcForVT<ValueType VT> { SSrc_i1, !if(isFP, !if(!eq(VT.Value, f16.Value), - VSrc_f16, + !if(IsTrue16, VSrcT_f16, VSrc_f16), !if(!eq(VT.Value, v2f16.Value), VSrc_v2f16, !if(!eq(VT.Value, v4f16.Value), @@ -1584,7 +1602,7 @@ class getVOP3SrcForVT<ValueType VT> { ) ), !if(!eq(VT.Value, i16.Value), - VSrc_b16, + !if(IsTrue16, VSrcT_b16, VSrc_b16), !if(!eq(VT.Value, v2i16.Value), VSrc_v2b16, VSrc_b32 @@ -1631,18 +1649,15 @@ class isModifierType<ValueType SrcVT> { } // Return type of input modifiers operand for specified input operand -class getSrcMod <ValueType VT> { +class getSrcMod <ValueType VT, bit IsTrue16 = 0> { bit isFP = isFloatType<VT>.ret; bit isPacked = isPackedType<VT>.ret; Operand ret = !if(!eq(VT.Size, 64), !if(isFP, FP64InputMods, Int64InputMods), - !if(isFP, - !if(!eq(VT.Value, f16.Value), - FP16InputMods, - FP32InputMods - ), - Int32InputMods) - ); + !if(!eq(VT.Size, 16), + !if(isFP, !if(IsTrue16, FPT16InputMods, FP16InputMods), + !if(IsTrue16, IntT16InputMods, IntOpSelMods)), + !if(isFP, FP32InputMods, Int32InputMods))); } class getOpSelMod <ValueType VT> { @@ -2262,6 +2277,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { field list<ValueType> ArgVT = _ArgVT; field bit EnableClamp = _EnableClamp; field bit IsTrue16 = 0; + field bit IsRealTrue16 = 0; field ValueType DstVT = ArgVT[0]; field ValueType Src0VT = ArgVT[1]; @@ -2281,7 +2297,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret; field RegisterClass Src2DPP = getVregSrcForVT<Src2VT>.ret; field RegisterOperand Src0VOP3DPP = VGPRSrc_32; - field RegisterOperand Src1VOP3DPP = VRegSrc_32; + field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret; field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret; field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret; field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret; @@ -2454,8 +2470,32 @@ class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.NoPattern> : VOPProfile <p. // class, so copy changes to this class in those profiles class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> { let IsTrue16 = 1; + let IsRealTrue16 = 1; + // Most DstVT are 16-bit, but not all. 
+ let DstRC = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 0 /*IsVOP3Encoding*/>.ret; + let DstRC64 = getVALUDstForVT<DstVT>.ret; + let Src0RC32 = getVOPSrc0ForVT<Src0VT, 1 /*IsTrue16*/, 0 /*IsFake16*/>.ret; + let Src1RC32 = getVregSrcForVT_t16<Src1VT, 0 /*IsFake16*/>.op; + let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret; + let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret; + let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret; + let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret; + let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret; + let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret; + + let DstRC64 = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret; + let Src0RC64 = getVOP3SrcForVT<Src0VT, 1 /*IsTrue16*/>.ret; + let Src1RC64 = getVOP3SrcForVT<Src1VT, 1 /*IsTrue16*/>.ret; + let Src2RC64 = getVOP3SrcForVT<Src2VT, 1 /*IsTrue16*/>.ret; + let Src0Mod = getSrcMod<Src0VT, 1 /*IsTrue16*/>.ret; + let Src1Mod = getSrcMod<Src1VT, 1 /*IsTrue16*/>.ret; + let Src2Mod = getSrcMod<Src2VT, 1 /*IsTrue16*/>.ret; +} + +class VOPProfile_Fake16<VOPProfile P> : VOPProfile<P.ArgVT> { + let IsTrue16 = 1; // Most DstVT are 16-bit, but not all - let DstRC = getVALUDstForVT_t16<DstVT>.ret; + let DstRC = getVALUDstForVT_fake16<DstVT>.ret; let DstRC64 = getVALUDstForVT<DstVT>.ret; let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>; let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret; @@ -2733,7 +2773,8 @@ def getMCOpcodeGen : InstrMapping { [!cast<string>(SIEncodingFamily.SDWA10)], [!cast<string>(SIEncodingFamily.GFX90A)], [!cast<string>(SIEncodingFamily.GFX940)], - [!cast<string>(SIEncodingFamily.GFX11)]]; + [!cast<string>(SIEncodingFamily.GFX11)], + [!cast<string>(SIEncodingFamily.GFX12)]]; } // Get equivalent SOPK instruction. @@ -2872,14 +2913,14 @@ def getVOPDBaseFromComponent : SearchIndex { def VOPDPairs : GenericTable { let FilterClass = "VOPD_Base"; let CppTypeName = "VOPDInfo"; - let Fields = ["Opcode", "OpX", "OpY"]; + let Fields = ["Opcode", "OpX", "OpY", "SubTgt"]; let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getVOPDOpcodeHelper"; } def getVOPDInfoFromComponentOpcodes : SearchIndex { let Table = VOPDPairs; - let Key = ["OpX", "OpY"]; + let Key = ["OpX", "OpY", "SubTgt"]; } include "SIInstructions.td" diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td index 2edebccef7d8..f9bc623abcd0 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -132,7 +132,7 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), let isAsCheapAsAMove = 1; let isMoveImm = 1; let SchedRW = [Write64Bit]; - let Size = 16; // Needs maximum 2 v_mov_b32 instructions 8 byte long each. + let Size = 4; let UseNamedOperandTable = 1; } @@ -149,8 +149,9 @@ def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst), let isAsCheapAsAMove = 1; let isMoveImm = 1; let SchedRW = [WriteSALU, Write64Bit]; - let Size = 16; // Needs maximum 2 s_mov_b32 instructions 8 byte long each. + let Size = 4; let Uses = []; + let UseNamedOperandTable = 1; } // Pseudoinstruction for @llvm.amdgcn.wqm. 
It is turned into a copy after the @@ -172,6 +173,13 @@ def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] +def WWM_COPY : SPseudoInstSI < + (outs unknown:$dst), (ins unknown:$src)> { + let hasSideEffects = 0; + let isAsCheapAsAMove = 1; + let isConvergent = 1; +} + def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { let Uses = [EXEC]; let Defs = [EXEC, SCC]; @@ -251,6 +259,12 @@ def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), } } // End Defs = [SCC] +def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)), + (V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>; + +def : GCNPat<(i64 (int_amdgcn_set_inactive_chain_arg i64:$src, i64:$inactive)), + (V_SET_INACTIVE_B64 VReg_64:$src, VReg_64:$inactive)>; + let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), (ins VSrc_b32: $src, VSrc_b32:$strategy), @@ -263,7 +277,7 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses } } -let usesCustomInserter = 1, Defs = [VCC, EXEC] in { +let usesCustomInserter = 1, Defs = [VCC] in { def V_ADD_U64_PSEUDO : VPseudoInstSI < (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), [(set VReg_64:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))] @@ -273,7 +287,7 @@ def V_SUB_U64_PSEUDO : VPseudoInstSI < (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), [(set VReg_64:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))] >; -} // End usesCustomInserter = 1, Defs = [VCC, EXEC] +} // End usesCustomInserter = 1, Defs = [VCC] let usesCustomInserter = 1, Defs = [SCC] in { def S_ADD_U64_PSEUDO : SPseudoInstSI < @@ -657,6 +671,50 @@ def : GCNPat< (SI_TCRETURN_GFX Gfx_CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff) >; +// Pseudo for the llvm.amdgcn.cs.chain intrinsic. +// This is essentially a tail call, but it also takes a mask to put in EXEC +// right before jumping to the callee. 
+class SI_CS_CHAIN_TC< + ValueType execvt, Predicate wavesizepred, + RegisterOperand execrc = getSOPSrcForVT<execvt>.ret> + : SPseudoInstSI <(outs), + (ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)> { + let FixedSize = 0; + let isCall = 1; + let isTerminator = 1; + let isBarrier = 1; + let isReturn = 1; + let UseNamedOperandTable = 1; + let SchedRW = [WriteBranch]; + let isConvergent = 1; + + let WaveSizePredicate = wavesizepred; +} + +def SI_CS_CHAIN_TC_W32 : SI_CS_CHAIN_TC<i32, isWave32>; +def SI_CS_CHAIN_TC_W64 : SI_CS_CHAIN_TC<i64, isWave64>; + +// Handle selecting direct & indirect calls via SI_CS_CHAIN_TC_W32/64 +multiclass si_cs_chain_tc_pattern< + dag callee, ValueType execvt, RegisterOperand execrc, Instruction tc> { +def : GCNPat< + (AMDGPUtc_return_chain i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec), + (tc CCR_SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec) +>; +} + +multiclass si_cs_chain_tc_patterns< + ValueType execvt, + RegisterOperand execrc = getSOPSrcForVT<execvt>.ret, + Instruction tc = !if(!eq(execvt, i32), SI_CS_CHAIN_TC_W32, SI_CS_CHAIN_TC_W64) + > { + defm direct: si_cs_chain_tc_pattern<(tglobaladdr:$callee), execvt, execrc, tc>; + defm indirect: si_cs_chain_tc_pattern<(i64 0), execvt, execrc, tc>; +} + +defm : si_cs_chain_tc_patterns<i32>; +defm : si_cs_chain_tc_patterns<i64>; + def ADJCALLSTACKUP : SPseudoInstSI< (outs), (ins i32imm:$amt0, i32imm:$amt1), [(callseq_start timm:$amt0, timm:$amt1)], @@ -867,6 +925,28 @@ defm SI_SPILL_S384 : SI_SPILL_SGPR <SReg_384>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>; +let SGPRSpill = 1, VALU = 1, isConvergent = 1 in { +def SI_SPILL_S32_TO_VGPR : PseudoInstSI <(outs VGPR_32:$vdst), + (ins SReg_32:$src0, i32imm:$src1, VGPR_32:$vdst_in)> { + let Size = 4; + let FixedSize = 1; + let IsNeverUniform = 1; + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Constraints = "$vdst = $vdst_in"; +} + +def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst), + (ins VGPR_32:$src0, i32imm:$src1)> { + let Size = 4; + let FixedSize = 1; + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} +} // End SGPRSpill = 1, VALU = 1, isConvergent = 1 + // VGPR or AGPR spill instructions. In case of AGPR spilling a temp register // needs to be used and an extra instruction to move between VGPR and AGPR. // UsesTmp adds to the total size of an expanded spill in this case. 
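
The SI_SPILL_S32_TO_VGPR / SI_RESTORE_S32_FROM_VGPR pseudos added in the hunk above park a 32-bit scalar value in a single lane of a VGPR and read it back later. The sketch below is a conceptual model only, written against a plain C++ array: the lane index stands in for the i32imm operand, and the idea that the pseudos ultimately map to per-lane write/read operations (v_writelane/v_readlane style) is an assumption of this sketch, not something spelled out in this hunk.

    #include <array>
    #include <cassert>
    #include <cstdint>

    constexpr unsigned WaveSize = 32;            // wave32 assumed for the illustration
    using Vgpr = std::array<uint32_t, WaveSize>; // one 32-bit value per lane

    // SI_SPILL_S32_TO_VGPR: place the scalar $src0 into lane $src1 of $vdst_in,
    // leaving every other lane untouched (hence the tied "$vdst = $vdst_in").
    uint32_t spillS32ToVgpr(Vgpr &V, unsigned Lane, uint32_t SVal) {
      V.at(Lane) = SVal;
      return SVal;
    }

    // SI_RESTORE_S32_FROM_VGPR: read lane $src1 of $src0 back into a scalar.
    uint32_t restoreS32FromVgpr(const Vgpr &V, unsigned Lane) {
      return V.at(Lane);
    }

    int main() {
      Vgpr V{};
      spillS32ToVgpr(V, 5, 0xDEADBEEFu);
      assert(restoreS32FromVgpr(V, 5) == 0xDEADBEEFu);
      return 0;
    }

Consistent with reading a single lane back into a scalar register, the SIInstrInfo.cpp hunk earlier in this diff marks SI_RESTORE_S32_FROM_VGPR as AlwaysUniform.
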
@@ -945,8 +1025,10 @@ defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>; defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>; defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>; -let isConvergent = 1 in -defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>; +let isConvergent = 1 in { + defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>; + defm SI_SPILL_WWM_AV32 : SI_SPILL_VGPR <AV_32, 1>; +} def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < (outs SReg_64:$dst), @@ -1587,6 +1669,16 @@ def : BitConvert <v12i32, v12f32, VReg_384>; def : BitConvert <v12f32, v12i32, VReg_384>; // 512-bit bitcast +def : BitConvert <v32f16, v32i16, VReg_512>; +def : BitConvert <v32i16, v32f16, VReg_512>; +def : BitConvert <v32f16, v16i32, VReg_512>; +def : BitConvert <v32f16, v16f32, VReg_512>; +def : BitConvert <v16f32, v32f16, VReg_512>; +def : BitConvert <v16i32, v32f16, VReg_512>; +def : BitConvert <v32i16, v16i32, VReg_512>; +def : BitConvert <v32i16, v16f32, VReg_512>; +def : BitConvert <v16f32, v32i16, VReg_512>; +def : BitConvert <v16i32, v32i16, VReg_512>; def : BitConvert <v16i32, v16f32, VReg_512>; def : BitConvert <v16f32, v16i32, VReg_512>; def : BitConvert <v8i64, v8f64, VReg_512>; @@ -1632,8 +1724,10 @@ def : ClampPat<V_MAX_F32_e64, f32>; def : ClampPat<V_MAX_F64_e64, f64>; let SubtargetPredicate = NotHasTrue16BitInsts in def : ClampPat<V_MAX_F16_e64, f16>; -let SubtargetPredicate = HasTrue16BitInsts in +let SubtargetPredicate = UseRealTrue16Insts in def : ClampPat<V_MAX_F16_t16_e64, f16>; +let SubtargetPredicate = UseFakeTrue16Insts in +def : ClampPat<V_MAX_F16_fake16_e64, f16>; let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < @@ -1922,6 +2016,29 @@ def : GCNPat < (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm))) >; +// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit +// immediate and wil be expanded as needed, but we will only use these patterns +// for values which can be encoded. 
+def : GCNPat < + (VGPRImm<(i64 imm)>:$imm), + (V_MOV_B64_PSEUDO imm:$imm) +>; + +def : GCNPat < + (VGPRImm<(f64 fpimm)>:$imm), + (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm))) +>; + +def : GCNPat < + (i64 imm:$imm), + (S_MOV_B64_IMM_PSEUDO imm:$imm) +>; + +def : GCNPat < + (f64 fpimm:$imm), + (S_MOV_B64_IMM_PSEUDO (i64 (bitcast_fpimm_to_i64 fpimm:$imm))) +>; + def : GCNPat < (f32 fpimm:$imm), (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) @@ -2306,8 +2423,16 @@ class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, S (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) >; -def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; -def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; +let OtherPredicates = [NotHasTrue16BitInsts] in { + def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; + def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; +} // end OtherPredicates = [NotHasTrue16BitInsts] + +let OtherPredicates = [HasTrue16BitInsts] in { + def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; + def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; +} // end OtherPredicates = [HasTrue16BitInsts] + def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>; def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>; def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>; @@ -2679,12 +2804,12 @@ def : GCNPat< let OtherPredicates = [HasTrue16BitInsts] in { def : GCNPat< (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), - (V_MUL_F16_t16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) + (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) >; def : GCNPat< (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), - (V_MUL_F16_t16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) + (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) >; } // End OtherPredicates @@ -2703,6 +2828,13 @@ def : GCNPat< (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src) >; +let SubtargetPredicate = HasPackedFP32Ops in { +def : GCNPat< + (fcanonicalize (v2f32 (VOP3PMods v2f32:$src, i32:$src_mods))), + (V_PK_MUL_F32 0, CONST.FP32_ONE, $src_mods, $src) +>; +} + // TODO: Handle fneg like other types. def : GCNPat< (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), @@ -2734,7 +2866,7 @@ multiclass SelectCanonicalizeAsMax< def : GCNPat< (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), - (V_MAX_F16_t16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> { + (V_MAX_F16_fake16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> { let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, HasTrue16BitInsts]); } @@ -3309,6 +3441,81 @@ defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax>; defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax>; } // End Predicates = [isGFX9Plus] +let OtherPredicates = [isGFX12Plus] in { +def : FPMinMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>; +def : FPMinMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>; +def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>; +def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>; +} + +// Convert a floating-point power of 2 to the integer exponent. 
+def FPPow2ToExponentXForm : SDNodeXForm<fpimm, [{ + const auto &APF = N->getValueAPF(); + int Log2 = APF.getExactLog2Abs(); + assert(Log2 != INT_MIN); + return CurDAG->getTargetConstant(Log2, SDLoc(N), MVT::i32); +}]>; + +// Check if a floating point value is a power of 2 floating-point +// immediate where it's preferable to emit a multiply by as an +// ldexp. We skip over 0.5 to 4.0 as those are inline immediates +// anyway. +def fpimm_pos_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{ + if (Imm.isNegative()) + return false; + + int Exp = Imm.getExactLog2Abs(); + // Prefer leaving the FP inline immediates as they are. + // 0.5, 1.0, 2.0, 4.0 + + // For f64 ldexp is always better than materializing a 64-bit + // constant. + return Exp != INT_MIN && (Exp < -1 || Exp > 2); + }], FPPow2ToExponentXForm +>; + +def fpimm_neg_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{ + if (!Imm.isNegative()) + return false; + int Exp = Imm.getExactLog2Abs(); + // Prefer leaving the FP inline immediates as they are. + // 0.5, 1.0, 2.0, 4.0 + + // For f64 ldexp is always better than materializing a 64-bit + // constant. + return Exp != INT_MIN && (Exp < -1 || Exp > 2); + }], FPPow2ToExponentXForm +>; + +// f64 is different because we also want to handle cases that may +// require materialization of the exponent. +// TODO: If we know f64 ops are fast, prefer add (ldexp x, N), y over fma +// TODO: For f32/f16, it's not a clear win on code size to use ldexp +// in place of mul since we have to use the vop3 form. Are there power +// savings or some other reason to prefer ldexp over mul? +def : GCNPat< + (any_fmul (f64 (VOP3Mods f64:$src0, i32:$src0_mods)), + fpimm_pos_pow2_prefer_ldexp_f64:$src1), + (V_LDEXP_F64_e64 i32:$src0_mods, VSrc_b64:$src0, + 0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1)))) +>; + +def : GCNPat< + (any_fmul f64:$src0, fpimm_neg_pow2_prefer_ldexp_f64:$src1), + (V_LDEXP_F64_e64 SRCMODS.NEG, VSrc_b64:$src0, + 0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1)))) +>; + +// We want to avoid using VOP3Mods which could pull in another fneg +// which we would need to be re-negated (which should never happen in +// practice). I don't see a way to apply an SDNodeXForm that accounts +// for a second operand. 
+def : GCNPat< + (any_fmul (fabs f64:$src0), fpimm_neg_pow2_prefer_ldexp_f64:$src1), + (V_LDEXP_F64_e64 SRCMODS.NEG_ABS, VSrc_b64:$src0, + 0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1)))) +>; + class AMDGPUGenericInstruction : GenericInstruction { let Namespace = "AMDGPU"; } @@ -3477,8 +3684,8 @@ def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP; def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP; } -class BufferAtomicGenericInstruction<bit NoRtn = 0> : AMDGPUGenericInstruction { - let OutOperandList = !if(NoRtn, (outs), (outs type0:$dst)); +class BufferAtomicGenericInstruction : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index df522a9099c0..abb72e8e63c3 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -30,6 +30,7 @@ private: const SIInstrInfo *TII = nullptr; MachineDominatorTree *MDT = nullptr; + void expandChainCall(MachineInstr &MI); void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock); public: @@ -116,6 +117,18 @@ static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, MDT->getBase().applyUpdates(DTUpdates); } +void SILateBranchLowering::expandChainCall(MachineInstr &MI) { + // This is a tail call that needs to be expanded into at least + // 2 instructions, one for setting EXEC and one for the actual tail call. + constexpr unsigned ExecIdx = 3; + + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(MovOpc), ExecReg) + ->addOperand(MI.getOperand(ExecIdx)); + MI.removeOperand(ExecIdx); + + MI.setDesc(TII->get(AMDGPU::SI_TCRETURN)); +} + void SILateBranchLowering::earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock) { MachineBasicBlock &MBB = *MI.getParent(); @@ -158,6 +171,12 @@ bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) { } break; + case AMDGPU::SI_CS_CHAIN_TC_W32: + case AMDGPU::SI_CS_CHAIN_TC_W64: + expandChainCall(MI); + MadeChange = true; + break; + case AMDGPU::SI_EARLY_TERMINATE_SCC0: EarlyTermInstrs.push_back(&MI); break; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index c252d30e250e..9c85ff3c43e2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -161,8 +161,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass { if (!AddrOp->isReg()) return false; - // TODO: We should be able to merge physical reg addresses. - if (AddrOp->getReg().isPhysical()) + // TODO: We should be able to merge instructions with other physical reg + // addresses too. 
+ if (AddrOp->getReg().isPhysical() && + AddrOp->getReg() != AMDGPU::SGPR_NULL) return false; // If an address has only one use then there will be no other @@ -320,7 +322,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { // FIXME: Handle d16 correctly return AMDGPU::getMUBUFElements(Opc); } - if (TII.isMIMG(MI)) { + if (TII.isImage(MI)) { uint64_t DMaskImm = TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); return llvm::popcount(DMaskImm); @@ -350,6 +352,9 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::FLAT_LOAD_DWORDX2: case AMDGPU::FLAT_STORE_DWORDX2: return 2; + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: + case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::GLOBAL_LOAD_DWORDX3: case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX3: @@ -398,15 +403,23 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: + case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN: + case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact: + case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET: + case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact: return BUFFER_LOAD; case AMDGPU::BUFFER_STORE_DWORD_OFFEN: case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: case AMDGPU::BUFFER_STORE_DWORD_OFFSET: case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: + case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN: + case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact: + case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET: + case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact: return BUFFER_STORE; } } - if (TII.isMIMG(Opc)) { + if (TII.isImage(Opc)) { // Ignore instructions encoded without vaddr. 
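
//===----------------------------------------------------------------------===//
// Illustration (not from the patch): getOpcodeWidth above now handles all
// image instructions (TII.isImage) by popcounting the dmask operand, since
// each set dmask bit enables one channel and each enabled channel occupies
// one dword of data. A standalone sketch of that computation; the helper
// name and the 4-bit mask width are assumptions for the example:

#include <bit>
#include <cassert>
#include <cstdint>

constexpr unsigned imageDataWidthInDwords(uint64_t DMask) {
  return std::popcount(DMask & 0xFull); // one dword per enabled channel
}

int main() {
  assert(imageDataWidthInDwords(0x1) == 1); // x
  assert(imageDataWidthInDwords(0x5) == 2); // x, z
  assert(imageDataWidthInDwords(0xF) == 4); // x, y, z, w
}
//===----------------------------------------------------------------------===//
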
if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) && !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0)) @@ -424,35 +437,50 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { default: return UNKNOWN; + case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact: case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: - case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN: - case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact: - case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN: - case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact: return TBUFFER_LOAD; case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: + case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN: + case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact: + case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET: + case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact: return TBUFFER_STORE; } } return UNKNOWN; case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return S_BUFFER_LOAD_IMM; case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: return S_BUFFER_LOAD_SGPR_IMM; case AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: return S_LOAD_IMM; @@ -505,7 +533,7 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { default: if (TII.isMUBUF(Opc)) return AMDGPU::getMUBUFBaseOpcode(Opc); - if (TII.isMIMG(Opc)) { + if (TII.isImage(Opc)) { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); assert(Info); return Info->BaseOpcode; @@ -524,16 +552,19 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { return Opc; case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM; case AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: return AMDGPU::S_LOAD_DWORD_IMM; @@ 
-600,11 +631,13 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { return Result; } - if (TII.isMIMG(Opc)) { + if (TII.isImage(Opc)) { int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); if (VAddr0Idx >= 0) { - int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); - Result.NumVAddrs = SRsrcIdx - VAddr0Idx; + int RsrcName = + TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc; + int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName); + Result.NumVAddrs = RsrcIdx - VAddr0Idx; } else { Result.VAddr = true; } @@ -631,16 +664,19 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { return Result; case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: Result.SOffset = true; [[fallthrough]]; case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: case AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: Result.SBase = true; @@ -739,6 +775,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, } AddressRegs Regs = getRegs(Opc, *LSO.TII); + bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I); NumAddresses = 0; for (unsigned J = 0; J < Regs.NumVAddrs; J++) @@ -751,8 +788,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); if (Regs.SRsrc) - AddrIdx[NumAddresses++] = - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); + AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( + Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc); if (Regs.SOffset) AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); @@ -763,8 +800,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); if (Regs.SSamp) - AddrIdx[NumAddresses++] = - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); + AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( + Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp); assert(NumAddresses <= MaxAddressRegs); for (unsigned J = 0; J < NumAddresses; J++) @@ -871,6 +908,9 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, unsigned MaxMask = std::max(CI.DMask, Paired.DMask); unsigned MinMask = std::min(CI.DMask, Paired.DMask); + if (!MaxMask) + return false; + unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask); if ((1u << AllowedBitsForMin) <= MinMask) return false; @@ -964,6 +1004,17 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, return false; if (CI.CPol != Paired.CPol) return false; + if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM || + CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) { + // Reject cases like: + // dword + dwordx2 -> dwordx3 + // dword + dwordx3 -> dwordx4 + // If we tried to combine these cases, we would fail to extract a subreg + // for the result of the second load due to SGPR alignment requirements. 
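
//===----------------------------------------------------------------------===//
// Illustration (not from the patch): the check that follows implements the
// rejection described above -- when the narrower of the two scalar loads sits
// at the lower offset, the wider part of the merged result would start at an
// unaligned SGPR boundary, so no subregister could be extracted for it. A
// standalone restatement of the predicate; the helper name is hypothetical
// and widths/offsets are both taken in dwords for the example:

#include <cassert>

bool rejectScalarMerge(unsigned Width0, unsigned Off0, unsigned Width1,
                       unsigned Off1) {
  return Width0 != Width1 && (Width0 < Width1) == (Off0 < Off1);
}

int main() {
  // dword @0 + dwordx2 @1 -> dwordx3: the x2 half would occupy dwords 1-2 of
  // the result, an odd-aligned 64-bit subregister, so the merge is rejected.
  assert(rejectScalarMerge(1, 0, 2, 1));
  // dwordx2 @0 + dword @2 -> dwordx3: the x2 half stays at dwords 0-1.
  assert(!rejectScalarMerge(2, 0, 1, 2));
  // Equal widths are never rejected by this particular rule.
  assert(!rejectScalarMerge(2, 0, 2, 2));
}
//===----------------------------------------------------------------------===//
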
+ if (CI.Width != Paired.Width && + (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset)) + return false; + } return true; } @@ -1043,6 +1094,8 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, case 4: case 8: return true; + case 3: + return STM.hasScalarDwordx3Loads(); } } } @@ -1671,6 +1724,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, return 0; case 2: return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + case 3: + return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; case 4: return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; case 8: @@ -1682,6 +1737,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, return 0; case 2: return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; + case 3: + return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; case 4: return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; case 8: @@ -1693,6 +1750,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, return 0; case 2: return AMDGPU::S_LOAD_DWORDX2_IMM; + case 3: + return AMDGPU::S_LOAD_DWORDX3_IMM; case 4: return AMDGPU::S_LOAD_DWORDX4_IMM; case 8: @@ -1814,6 +1873,8 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, return nullptr; case 2: return &AMDGPU::SReg_64_XEXECRegClass; + case 3: + return &AMDGPU::SGPR_96RegClass; case 4: return &AMDGPU::SGPR_128RegClass; case 8: diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 00cb5b2878f4..f178324dbbe2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -79,6 +79,7 @@ private: SetVector<MachineInstr*> LoweredEndCf; DenseSet<Register> LoweredIf; SmallSet<MachineBasicBlock *, 4> KillBlocks; + SmallSet<Register, 8> RecomputeRegs; const TargetRegisterClass *BoolRC = nullptr; unsigned AndOpc; @@ -297,8 +298,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { // FIXME: Is there a better way of adjusting the liveness? It shouldn't be // hard to add another def here but I'm not sure how to correctly update the // valno. 
- LIS->removeInterval(SaveExecReg); - LIS->createAndComputeVirtRegInterval(SaveExecReg); + RecomputeRegs.insert(SaveExecReg); LIS->createAndComputeVirtRegInterval(Tmp); if (!SimpleIf) LIS->createAndComputeVirtRegInterval(CopyReg); @@ -309,6 +309,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { const DebugLoc &DL = MI.getDebugLoc(); Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); MachineBasicBlock::iterator Start = MBB.begin(); @@ -319,7 +320,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg) .add(MI.getOperand(1)); // Saved EXEC if (LV) - LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *OrSaveExec); + LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec); MachineBasicBlock *DestBB = MI.getOperand(2).getMBB(); @@ -331,9 +332,6 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { .addReg(Exec) .addReg(SaveReg); - if (LIS) - LIS->InsertMachineInstrInMaps(*And); - MachineInstr *Xor = BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec) .addReg(Exec) @@ -356,12 +354,13 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { MI.eraseFromParent(); LIS->InsertMachineInstrInMaps(*OrSaveExec); + LIS->InsertMachineInstrInMaps(*And); LIS->InsertMachineInstrInMaps(*Xor); LIS->InsertMachineInstrInMaps(*Branch); - LIS->removeInterval(DstReg); - LIS->createAndComputeVirtRegInterval(DstReg); + RecomputeRegs.insert(SrcReg); + RecomputeRegs.insert(DstReg); LIS->createAndComputeVirtRegInterval(SaveReg); // Let this be recomputed. @@ -388,8 +387,9 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { // AND the break condition operand with exec, then OR that into the "loop // exit" mask. MachineInstr *And = nullptr, *Or = nullptr; + Register AndReg; if (!SkipAnding) { - Register AndReg = MRI->createVirtualRegister(BoolRC); + AndReg = MRI->createVirtualRegister(BoolRC); And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg) .addReg(Exec) .add(MI.getOperand(1)); @@ -398,8 +398,6 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst) .addReg(AndReg) .add(MI.getOperand(2)); - if (LIS) - LIS->createAndComputeVirtRegInterval(AndReg); } else { Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst) .add(MI.getOperand(1)) @@ -411,9 +409,13 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { LV->replaceKillInstruction(MI.getOperand(2).getReg(), MI, *Or); if (LIS) { - if (And) - LIS->InsertMachineInstrInMaps(*And); LIS->ReplaceMachineInstrInMaps(MI, *Or); + if (And) { + // Read of original operand 1 is on And now not Or. + RecomputeRegs.insert(And->getOperand(2).getReg()); + LIS->InsertMachineInstrInMaps(*And); + LIS->createAndComputeVirtRegInterval(AndReg); + } } MI.eraseFromParent(); @@ -436,6 +438,7 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { .add(MI.getOperand(1)); if (LIS) { + RecomputeRegs.insert(MI.getOperand(0).getReg()); LIS->ReplaceMachineInstrInMaps(MI, *AndN2); LIS->InsertMachineInstrInMaps(*Branch); } @@ -714,11 +717,13 @@ void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB, if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) { // This should be before all vector instructions. - BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(), + MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(), TII->get(IsWave32 ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec) .addImm(MI.getOperand(0).getImm()); - if (LIS) + if (LIS) { LIS->RemoveMachineInstrFromMaps(MI); + LIS->InsertMachineInstrInMaps(*InitMI); + } MI.eraseFromParent(); return; } @@ -789,8 +794,7 @@ void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB, LIS->InsertMachineInstrInMaps(*CmpMI); LIS->InsertMachineInstrInMaps(*CmovMI); - LIS->removeInterval(InputReg); - LIS->createAndComputeVirtRegInterval(InputReg); + RecomputeRegs.insert(InputReg); LIS->createAndComputeVirtRegInterval(CountReg); } @@ -807,7 +811,7 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { while (!MBB.predecessors().empty()) { MachineBasicBlock *P = *MBB.pred_begin(); - if (P->getFallThrough() == &MBB) + if (P->getFallThrough(false) == &MBB) FallThrough = P; P->ReplaceUsesOfBlockWith(&MBB, Succ); } @@ -828,14 +832,13 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { MBB.clear(); MBB.eraseFromParent(); if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) { - if (!Succ->canFallThrough()) { - MachineFunction *MF = FallThrough->getParent(); - MachineFunction::iterator FallThroughPos(FallThrough); - MF->splice(std::next(FallThroughPos), Succ); - } else - BuildMI(*FallThrough, FallThrough->end(), - FallThrough->findBranchDebugLoc(), TII->get(AMDGPU::S_BRANCH)) - .addMBB(Succ); + // Note: we cannot update block layout and preserve live intervals; + // hence we must insert a branch. + MachineInstr *BranchMI = BuildMI(*FallThrough, FallThrough->end(), + FallThrough->findBranchDebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(Succ); + if (LIS) + LIS->InsertMachineInstrInMaps(*BranchMI); } return true; @@ -845,8 +848,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); - EnableOptimizeEndCf = - RemoveRedundantEndcf && MF.getTarget().getOptLevel() > CodeGenOpt::None; + EnableOptimizeEndCf = RemoveRedundantEndcf && + MF.getTarget().getOptLevel() > CodeGenOptLevel::None; // This doesn't actually need LiveIntervals, but we can preserve them. 
LIS = getAnalysisIfAvailable<LiveIntervals>(); @@ -947,6 +950,14 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { optimizeEndCf(); + if (LIS) { + for (Register Reg : RecomputeRegs) { + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + } + } + + RecomputeRegs.clear(); LoweredEndCf.clear(); LoweredIf.clear(); KillBlocks.clear(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index d4f0906f020a..cfa0c21def79 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -21,21 +21,19 @@ // //===----------------------------------------------------------------------===// +#include "SILowerI1Copies.h" #include "AMDGPU.h" -#include "GCNSubtarget.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineSSAUpdater.h" #include "llvm/InitializePasses.h" +#include "llvm/Target/CGPassBuilderOption.h" #define DEBUG_TYPE "si-i1-copies" using namespace llvm; -static unsigned createLaneMaskReg(MachineFunction &MF); -static unsigned insertUndefLaneMask(MachineBasicBlock &MBB); +static Register insertUndefLaneMask(MachineBasicBlock *MBB, + MachineRegisterInfo *MRI, + Register LaneMaskRegAttrs); namespace { @@ -43,26 +41,6 @@ class SILowerI1Copies : public MachineFunctionPass { public: static char ID; -private: - bool IsWave32 = false; - MachineFunction *MF = nullptr; - MachineDominatorTree *DT = nullptr; - MachinePostDominatorTree *PDT = nullptr; - MachineRegisterInfo *MRI = nullptr; - const GCNSubtarget *ST = nullptr; - const SIInstrInfo *TII = nullptr; - - unsigned ExecReg; - unsigned MovOp; - unsigned AndOp; - unsigned OrOp; - unsigned XorOp; - unsigned AndN2Op; - unsigned OrN2Op; - - DenseSet<unsigned> ConstrainRegs; - -public: SILowerI1Copies() : MachineFunctionPass(ID) { initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); } @@ -77,29 +55,53 @@ public: AU.addRequired<MachinePostDominatorTree>(); MachineFunctionPass::getAnalysisUsage(AU); } +}; + +class Vreg1LoweringHelper : public PhiLoweringHelper { +public: + Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT); private: - bool lowerCopiesFromI1(); - bool lowerPhis(); - bool lowerCopiesToI1(); - bool isConstantLaneMask(Register Reg, bool &Val) const; + DenseSet<Register> ConstrainRegs; + +public: + void markAsLaneMask(Register DstReg) const override; + void getCandidatesForLowering( + SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override; + void collectIncomingValuesFromPhi( + const MachineInstr *MI, + SmallVectorImpl<Incoming> &Incomings) const override; + void replaceDstReg(Register NewReg, Register OldReg, + MachineBasicBlock *MBB) override; void buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned DstReg, unsigned PrevReg, unsigned CurReg); - MachineBasicBlock::iterator - getSaluInsertionAtEnd(MachineBasicBlock &MBB) const; + Register DstReg, Register PrevReg, + Register CurReg) override; + void constrainIncomingRegisterTakenAsIs(Incoming &In) override; + bool lowerCopiesFromI1(); + bool lowerCopiesToI1(); + bool cleanConstrainRegs(bool Changed); bool isVreg1(Register Reg) const { return Reg.isVirtual() && MRI->getRegClass(Reg) == 
&AMDGPU::VReg_1RegClass; } - - bool isLaneMaskReg(unsigned Reg) const { - return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) && - TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) == - ST->getWavefrontSize(); - } }; +Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF, + MachineDominatorTree *DT, + MachinePostDominatorTree *PDT) + : PhiLoweringHelper(MF, DT, PDT) {} + +bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) { + assert(Changed || ConstrainRegs.empty()); + for (Register Reg : ConstrainRegs) + MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); + ConstrainRegs.clear(); + + return Changed; +} + /// Helper class that determines the relationship between incoming values of a /// phi in the control flow graph to determine where an incoming value can /// simply be taken as a scalar lane mask as-is, and where it needs to be @@ -145,8 +147,7 @@ public: ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; } - void analyze(MachineBasicBlock &DefBlock, - ArrayRef<MachineBasicBlock *> IncomingBlocks) { + void analyze(MachineBasicBlock &DefBlock, ArrayRef<Incoming> Incomings) { assert(Stack.empty()); ReachableMap.clear(); ReachableOrdered.clear(); @@ -157,7 +158,8 @@ public: ReachableMap.try_emplace(&DefBlock, false); ReachableOrdered.push_back(&DefBlock); - for (MachineBasicBlock *MBB : IncomingBlocks) { + for (auto Incoming : Incomings) { + MachineBasicBlock *MBB = Incoming.Block; if (MBB == &DefBlock) { ReachableMap[&DefBlock] = true; // self-loop on DefBlock continue; @@ -302,34 +304,38 @@ public: /// blocks, so that the SSA updater doesn't have to search all the way to the /// function entry. void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater, - ArrayRef<MachineBasicBlock *> Blocks = {}) { + MachineRegisterInfo &MRI, Register LaneMaskRegAttrs, + ArrayRef<Incoming> Incomings = {}) { assert(LoopLevel < CommonDominators.size()); MachineBasicBlock *Dom = CommonDominators[LoopLevel]; - for (MachineBasicBlock *MBB : Blocks) - Dom = DT.findNearestCommonDominator(Dom, MBB); + for (auto &Incoming : Incomings) + Dom = DT.findNearestCommonDominator(Dom, Incoming.Block); - if (!inLoopLevel(*Dom, LoopLevel, Blocks)) { - SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom)); + if (!inLoopLevel(*Dom, LoopLevel, Incomings)) { + SSAUpdater.AddAvailableValue( + Dom, insertUndefLaneMask(Dom, &MRI, LaneMaskRegAttrs)); } else { // The dominator is part of the loop or the given blocks, so add the // undef value to unreachable predecessors instead. 
for (MachineBasicBlock *Pred : Dom->predecessors()) { - if (!inLoopLevel(*Pred, LoopLevel, Blocks)) - SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred)); + if (!inLoopLevel(*Pred, LoopLevel, Incomings)) + SSAUpdater.AddAvailableValue( + Pred, insertUndefLaneMask(Pred, &MRI, LaneMaskRegAttrs)); } } } private: bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel, - ArrayRef<MachineBasicBlock *> Blocks) const { + ArrayRef<Incoming> Incomings) const { auto DomIt = Visited.find(&MBB); if (DomIt != Visited.end() && DomIt->second <= LoopLevel) return true; - if (llvm::is_contained(Blocks, &MBB)) - return true; + for (auto &Incoming : Incomings) + if (Incoming.Block == &MBB) + return true; return false; } @@ -405,19 +411,19 @@ FunctionPass *llvm::createSILowerI1CopiesPass() { return new SILowerI1Copies(); } -static unsigned createLaneMaskReg(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass - : &AMDGPU::SReg_64RegClass); +Register llvm::createLaneMaskReg(MachineRegisterInfo *MRI, + Register LaneMaskRegAttrs) { + return MRI->cloneVirtualRegister(LaneMaskRegAttrs); } -static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) { - MachineFunction &MF = *MBB.getParent(); +static Register insertUndefLaneMask(MachineBasicBlock *MBB, + MachineRegisterInfo *MRI, + Register LaneMaskRegAttrs) { + MachineFunction &MF = *MBB->getParent(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); - unsigned UndefReg = createLaneMaskReg(MF); - BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF), + Register UndefReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); + BuildMI(*MBB, MBB->getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF), UndefReg); return UndefReg; } @@ -434,47 +440,17 @@ static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) { bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { // Only need to run this in SelectionDAG path. 
if (TheMF.getProperties().hasProperty( - MachineFunctionProperties::Property::Selected)) + MachineFunctionProperties::Property::Selected)) return false; - MF = &TheMF; - MRI = &MF->getRegInfo(); - DT = &getAnalysis<MachineDominatorTree>(); - PDT = &getAnalysis<MachinePostDominatorTree>(); - - ST = &MF->getSubtarget<GCNSubtarget>(); - TII = ST->getInstrInfo(); - IsWave32 = ST->isWave32(); - - if (IsWave32) { - ExecReg = AMDGPU::EXEC_LO; - MovOp = AMDGPU::S_MOV_B32; - AndOp = AMDGPU::S_AND_B32; - OrOp = AMDGPU::S_OR_B32; - XorOp = AMDGPU::S_XOR_B32; - AndN2Op = AMDGPU::S_ANDN2_B32; - OrN2Op = AMDGPU::S_ORN2_B32; - } else { - ExecReg = AMDGPU::EXEC; - MovOp = AMDGPU::S_MOV_B64; - AndOp = AMDGPU::S_AND_B64; - OrOp = AMDGPU::S_OR_B64; - XorOp = AMDGPU::S_XOR_B64; - AndN2Op = AMDGPU::S_ANDN2_B64; - OrN2Op = AMDGPU::S_ORN2_B64; - } + Vreg1LoweringHelper Helper(&TheMF, &getAnalysis<MachineDominatorTree>(), + &getAnalysis<MachinePostDominatorTree>()); bool Changed = false; - Changed |= lowerCopiesFromI1(); - Changed |= lowerPhis(); - Changed |= lowerCopiesToI1(); - - assert(Changed || ConstrainRegs.empty()); - for (unsigned Reg : ConstrainRegs) - MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); - ConstrainRegs.clear(); - - return Changed; + Changed |= Helper.lowerCopiesFromI1(); + Changed |= Helper.lowerPhis(); + Changed |= Helper.lowerCopiesToI1(); + return Helper.cleanConstrainRegs(Changed); } #ifndef NDEBUG @@ -486,7 +462,7 @@ static bool isVRegCompatibleReg(const SIRegisterInfo &TRI, } #endif -bool SILowerI1Copies::lowerCopiesFromI1() { +bool Vreg1LoweringHelper::lowerCopiesFromI1() { bool Changed = false; SmallVector<MachineInstr *, 4> DeadCopies; @@ -529,27 +505,47 @@ bool SILowerI1Copies::lowerCopiesFromI1() { return Changed; } -bool SILowerI1Copies::lowerPhis() { +PhiLoweringHelper::PhiLoweringHelper(MachineFunction *MF, + MachineDominatorTree *DT, + MachinePostDominatorTree *PDT) + : MF(MF), DT(DT), PDT(PDT) { + MRI = &MF->getRegInfo(); + + ST = &MF->getSubtarget<GCNSubtarget>(); + TII = ST->getInstrInfo(); + IsWave32 = ST->isWave32(); + + if (IsWave32) { + ExecReg = AMDGPU::EXEC_LO; + MovOp = AMDGPU::S_MOV_B32; + AndOp = AMDGPU::S_AND_B32; + OrOp = AMDGPU::S_OR_B32; + XorOp = AMDGPU::S_XOR_B32; + AndN2Op = AMDGPU::S_ANDN2_B32; + OrN2Op = AMDGPU::S_ORN2_B32; + } else { + ExecReg = AMDGPU::EXEC; + MovOp = AMDGPU::S_MOV_B64; + AndOp = AMDGPU::S_AND_B64; + OrOp = AMDGPU::S_OR_B64; + XorOp = AMDGPU::S_XOR_B64; + AndN2Op = AMDGPU::S_ANDN2_B64; + OrN2Op = AMDGPU::S_ORN2_B64; + } +} + +bool PhiLoweringHelper::lowerPhis() { MachineSSAUpdater SSAUpdater(*MF); LoopFinder LF(*DT, *PDT); PhiIncomingAnalysis PIA(*PDT, TII); SmallVector<MachineInstr *, 4> Vreg1Phis; - SmallVector<MachineBasicBlock *, 4> IncomingBlocks; - SmallVector<unsigned, 4> IncomingRegs; - SmallVector<unsigned, 4> IncomingUpdated; -#ifndef NDEBUG - DenseSet<unsigned> PhiRegisters; -#endif + SmallVector<Incoming, 4> Incomings; - for (MachineBasicBlock &MBB : *MF) { - for (MachineInstr &MI : MBB.phis()) { - if (isVreg1(MI.getOperand(0).getReg())) - Vreg1Phis.push_back(&MI); - } - } + getCandidatesForLowering(Vreg1Phis); if (Vreg1Phis.empty()) return false; + DT->getBase().updateDFSNumbers(); MachineBasicBlock *PrevMBB = nullptr; for (MachineInstr *MI : Vreg1Phis) { MachineBasicBlock &MBB = *MI->getParent(); @@ -561,29 +557,19 @@ bool SILowerI1Copies::lowerPhis() { LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI); Register DstReg = MI->getOperand(0).getReg(); - MRI->setRegClass(DstReg, IsWave32 ? 
&AMDGPU::SReg_32RegClass - : &AMDGPU::SReg_64RegClass); - - // Collect incoming values. - for (unsigned i = 1; i < MI->getNumOperands(); i += 2) { - assert(i + 1 < MI->getNumOperands()); - Register IncomingReg = MI->getOperand(i).getReg(); - MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB(); - MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg); - - if (IncomingDef->getOpcode() == AMDGPU::COPY) { - IncomingReg = IncomingDef->getOperand(1).getReg(); - assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg)); - assert(!IncomingDef->getOperand(1).getSubReg()); - } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) { - continue; - } else { - assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg)); - } + markAsLaneMask(DstReg); + initializeLaneMaskRegisterAttributes(DstReg); - IncomingBlocks.push_back(IncomingMBB); - IncomingRegs.push_back(IncomingReg); - } + collectIncomingValuesFromPhi(MI, Incomings); + + // Sort the incomings such that incoming values that dominate other incoming + // values are sorted earlier. This allows us to do some amount of on-the-fly + // constant folding. + // Incoming with smaller DFSNumIn goes first, DFSNumIn is 0 for entry block. + llvm::sort(Incomings, [this](Incoming LHS, Incoming RHS) { + return DT->getNode(LHS.Block)->getDFSNumIn() < + DT->getNode(RHS.Block)->getDFSNumIn(); + }); #ifndef NDEBUG PhiRegisters.insert(DstReg); @@ -607,64 +593,63 @@ bool SILowerI1Copies::lowerPhis() { SSAUpdater.Initialize(DstReg); if (FoundLoopLevel) { - LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks); + LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs, + Incomings); - for (unsigned i = 0; i < IncomingRegs.size(); ++i) { - IncomingUpdated.push_back(createLaneMaskReg(*MF)); - SSAUpdater.AddAvailableValue(IncomingBlocks[i], - IncomingUpdated.back()); + for (auto &Incoming : Incomings) { + Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); + SSAUpdater.AddAvailableValue(Incoming.Block, Incoming.UpdatedReg); } - for (unsigned i = 0; i < IncomingRegs.size(); ++i) { - MachineBasicBlock &IMBB = *IncomingBlocks[i]; + for (auto &Incoming : Incomings) { + MachineBasicBlock &IMBB = *Incoming.Block; buildMergeLaneMasks( - IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i], - SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]); + IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg, + SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg); } } else { // The phi is not observed from outside a loop. Use a more accurate // lowering. 
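
//===----------------------------------------------------------------------===//
// Illustration (not from the patch): the llvm::sort introduced earlier in
// this hunk orders the PHI incomings by the dominator tree's DFS-in number,
// so incomings from dominating blocks (smaller DFS-in, 0 for the entry block)
// are processed first and can be constant-folded on the fly. A minimal
// standalone sketch of that ordering; the struct is a stand-in, not the
// pass's Incoming type:

#include <algorithm>
#include <cassert>
#include <vector>

struct IncomingSketch {
  unsigned DFSNumIn; // DFS-in number of the incoming block
  int Value;
};

int main() {
  std::vector<IncomingSketch> Incomings = {{7, 3}, {0, 1}, {4, 2}};
  std::sort(Incomings.begin(), Incomings.end(),
            [](const IncomingSketch &L, const IncomingSketch &R) {
              return L.DFSNumIn < R.DFSNumIn;
            });
  // A block that dominates another is an ancestor in the dominator tree and
  // therefore has the smaller DFS-in number, so it now comes first.
  assert(Incomings.front().DFSNumIn == 0);
}
//===----------------------------------------------------------------------===//
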
- PIA.analyze(MBB, IncomingBlocks); + PIA.analyze(MBB, Incomings); for (MachineBasicBlock *MBB : PIA.predecessors()) - SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB)); + SSAUpdater.AddAvailableValue( + MBB, insertUndefLaneMask(MBB, MRI, LaneMaskRegAttrs)); - for (unsigned i = 0; i < IncomingRegs.size(); ++i) { - MachineBasicBlock &IMBB = *IncomingBlocks[i]; + for (auto &Incoming : Incomings) { + MachineBasicBlock &IMBB = *Incoming.Block; if (PIA.isSource(IMBB)) { - IncomingUpdated.push_back(0); - SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]); + constrainIncomingRegisterTakenAsIs(Incoming); + SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg); } else { - IncomingUpdated.push_back(createLaneMaskReg(*MF)); - SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back()); + Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); + SSAUpdater.AddAvailableValue(&IMBB, Incoming.UpdatedReg); } } - for (unsigned i = 0; i < IncomingRegs.size(); ++i) { - if (!IncomingUpdated[i]) + for (auto &Incoming : Incomings) { + if (!Incoming.UpdatedReg.isValid()) continue; - MachineBasicBlock &IMBB = *IncomingBlocks[i]; + MachineBasicBlock &IMBB = *Incoming.Block; buildMergeLaneMasks( - IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i], - SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]); + IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg, + SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg); } } Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB); if (NewReg != DstReg) { - MRI->replaceRegWith(NewReg, DstReg); + replaceDstReg(NewReg, DstReg, &MBB); MI->eraseFromParent(); } - IncomingBlocks.clear(); - IncomingRegs.clear(); - IncomingUpdated.clear(); + Incomings.clear(); } return true; } -bool SILowerI1Copies::lowerCopiesToI1() { +bool Vreg1LoweringHelper::lowerCopiesToI1() { bool Changed = false; MachineSSAUpdater SSAUpdater(*MF); LoopFinder LF(*DT, *PDT); @@ -691,8 +676,9 @@ bool SILowerI1Copies::lowerCopiesToI1() { LLVM_DEBUG(dbgs() << "Lower Other: " << MI); - MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass - : &AMDGPU::SReg_64RegClass); + markAsLaneMask(DstReg); + initializeLaneMaskRegisterAttributes(DstReg); + if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) continue; @@ -702,12 +688,15 @@ bool SILowerI1Copies::lowerCopiesToI1() { if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) { assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32); - unsigned TmpReg = createLaneMaskReg(*MF); + Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg) .addReg(SrcReg) .addImm(0); MI.getOperand(1).setReg(TmpReg); SrcReg = TmpReg; + } else { + // SrcReg needs to be live beyond copy. 
+ MI.getOperand(1).setIsKill(false); } // Defs in a loop that are observed outside the loop must be transformed @@ -722,7 +711,7 @@ bool SILowerI1Copies::lowerCopiesToI1() { if (FoundLoopLevel) { SSAUpdater.Initialize(DstReg); SSAUpdater.AddAvailableValue(&MBB, DstReg); - LF.addLoopEntries(FoundLoopLevel, SSAUpdater); + LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs); buildMergeLaneMasks(MBB, MI, DL, DstReg, SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg); @@ -737,7 +726,7 @@ bool SILowerI1Copies::lowerCopiesToI1() { return Changed; } -bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const { +bool PhiLoweringHelper::isConstantLaneMask(Register Reg, bool &Val) const { const MachineInstr *MI; for (;;) { MI = MRI->getUniqueVRegDef(Reg); @@ -790,7 +779,7 @@ static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) { /// Return a point at the end of the given \p MBB to insert SALU instructions /// for lane mask calculation. Take terminators and SCC into account. MachineBasicBlock::iterator -SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const { +PhiLoweringHelper::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const { auto InsertionPt = MBB.getFirstTerminator(); bool TerminatorsUseSCC = false; for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) { @@ -816,10 +805,53 @@ SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const { llvm_unreachable("SCC used by terminator but no def in block"); } -void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DstReg, - unsigned PrevReg, unsigned CurReg) { +// VReg_1 -> SReg_32 or SReg_64 +void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const { + MRI->setRegClass(DstReg, ST->getBoolRC()); +} + +void Vreg1LoweringHelper::getCandidatesForLowering( + SmallVectorImpl<MachineInstr *> &Vreg1Phis) const { + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB.phis()) { + if (isVreg1(MI.getOperand(0).getReg())) + Vreg1Phis.push_back(&MI); + } + } +} + +void Vreg1LoweringHelper::collectIncomingValuesFromPhi( + const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const { + for (unsigned i = 1; i < MI->getNumOperands(); i += 2) { + assert(i + 1 < MI->getNumOperands()); + Register IncomingReg = MI->getOperand(i).getReg(); + MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB(); + MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg); + + if (IncomingDef->getOpcode() == AMDGPU::COPY) { + IncomingReg = IncomingDef->getOperand(1).getReg(); + assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg)); + assert(!IncomingDef->getOperand(1).getSubReg()); + } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) { + continue; + } else { + assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg)); + } + + Incomings.emplace_back(IncomingReg, IncomingMBB, Register()); + } +} + +void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg, + MachineBasicBlock *MBB) { + MRI->replaceRegWith(NewReg, OldReg); +} + +void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register DstReg, Register PrevReg, + Register CurReg) { bool PrevVal = false; bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal); bool CurVal = false; @@ -838,13 +870,13 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, return; } - unsigned PrevMaskedReg = 0; - unsigned CurMaskedReg = 0; + 
Register PrevMaskedReg; + Register CurMaskedReg; if (!PrevConstant) { if (CurConstant && CurVal) { PrevMaskedReg = PrevReg; } else { - PrevMaskedReg = createLaneMaskReg(*MF); + PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg) .addReg(PrevReg) .addReg(ExecReg); @@ -855,7 +887,7 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, if (PrevConstant && PrevVal) { CurMaskedReg = CurReg; } else { - CurMaskedReg = createLaneMaskReg(*MF); + CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg) .addReg(CurReg) .addReg(ExecReg); @@ -878,3 +910,7 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, .addReg(CurMaskedReg ? CurMaskedReg : ExecReg); } } + +void Vreg1LoweringHelper::constrainIncomingRegisterTakenAsIs(Incoming &In) { + return; +} diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.h new file mode 100644 index 000000000000..5099d39c2d14 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.h @@ -0,0 +1,97 @@ +//===-- SILowerI1Copies.h --------------------------------------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Interface definition of the PhiLoweringHelper class that implements lane +/// mask merging algorithm for divergent i1 phis. +// +//===----------------------------------------------------------------------===// + +#include "GCNSubtarget.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineSSAUpdater.h" + +namespace llvm { + +/// Incoming for lane maks phi as machine instruction, incoming register \p Reg +/// and incoming block \p Block are taken from machine instruction. +/// \p UpdatedReg (if valid) is \p Reg lane mask merged with another lane mask. 
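
//===----------------------------------------------------------------------===//
// Illustration (not from the patch): buildMergeLaneMasks in the hunk above
// emits S_ANDN2/S_AND/S_OR (with constant-folding shortcuts) to realize a
// per-lane select between the previous and the current lane mask value under
// EXEC. As plain bit arithmetic the merge is (Prev & ~Exec) | (Cur & Exec);
// a wave64 mask is modeled with uint64_t purely for the example:

#include <cassert>
#include <cstdint>

constexpr uint64_t mergeLaneMasks(uint64_t Prev, uint64_t Cur, uint64_t Exec) {
  return (Prev & ~Exec) | (Cur & Exec);
}

int main() {
  const uint64_t Exec = 0x00000000FFFFFFFFull; // lower 32 lanes active
  const uint64_t Prev = 0xAAAAAAAAAAAAAAAAull;
  const uint64_t Cur = 0x5555555555555555ull;
  const uint64_t Merged = mergeLaneMasks(Prev, Cur, Exec);
  assert((Merged & Exec) == (Cur & Exec));    // active lanes take Cur
  assert((Merged & ~Exec) == (Prev & ~Exec)); // inactive lanes keep Prev
}
//===----------------------------------------------------------------------===//
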
+struct Incoming { + Register Reg; + MachineBasicBlock *Block; + Register UpdatedReg; + + Incoming(Register Reg, MachineBasicBlock *Block, Register UpdatedReg) + : Reg(Reg), Block(Block), UpdatedReg(UpdatedReg) {} +}; + +Register createLaneMaskReg(MachineRegisterInfo *MRI, Register LaneMaskRegAttrs); + +class PhiLoweringHelper { +public: + PhiLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT); + virtual ~PhiLoweringHelper() = default; + +protected: + bool IsWave32 = false; + MachineFunction *MF = nullptr; + MachineDominatorTree *DT = nullptr; + MachinePostDominatorTree *PDT = nullptr; + MachineRegisterInfo *MRI = nullptr; + const GCNSubtarget *ST = nullptr; + const SIInstrInfo *TII = nullptr; + Register LaneMaskRegAttrs; + +#ifndef NDEBUG + DenseSet<Register> PhiRegisters; +#endif + + Register ExecReg; + unsigned MovOp; + unsigned AndOp; + unsigned OrOp; + unsigned XorOp; + unsigned AndN2Op; + unsigned OrN2Op; + +public: + bool lowerPhis(); + bool isConstantLaneMask(Register Reg, bool &Val) const; + MachineBasicBlock::iterator + getSaluInsertionAtEnd(MachineBasicBlock &MBB) const; + + void initializeLaneMaskRegisterAttributes(Register LaneMask) { + LaneMaskRegAttrs = LaneMask; + } + + bool isLaneMaskReg(Register Reg) const { + return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) && + TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) == + ST->getWavefrontSize(); + } + + // Helpers from lowerPhis that are different between sdag and global-isel. + + virtual void markAsLaneMask(Register DstReg) const = 0; + virtual void getCandidatesForLowering( + SmallVectorImpl<MachineInstr *> &Vreg1Phis) const = 0; + virtual void + collectIncomingValuesFromPhi(const MachineInstr *MI, + SmallVectorImpl<Incoming> &Incomings) const = 0; + virtual void replaceDstReg(Register NewReg, Register OldReg, + MachineBasicBlock *MBB) = 0; + virtual void buildMergeLaneMasks(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, Register DstReg, + Register PrevReg, Register CurReg) = 0; + virtual void constrainIncomingRegisterTakenAsIs(Incoming &In) = 0; +}; + +} // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index d21107c02ef7..0ba7792ac436 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -50,7 +50,9 @@ public: SILowerSGPRSpills() : MachineFunctionPass(ID) {} void calculateSaveRestoreBlocks(MachineFunction &MF); - bool spillCalleeSavedRegs(MachineFunction &MF); + bool spillCalleeSavedRegs(MachineFunction &MF, + SmallVectorImpl<int> &CalleeSavedFIs); + void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS); bool runOnMachineFunction(MachineFunction &MF) override; @@ -58,6 +60,13 @@ public: AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } + + MachineFunctionProperties getClearedProperties() const override { + // SILowerSGPRSpills introduces new Virtual VGPRs for spilling SGPRs. 
+ return MachineFunctionProperties() + .set(MachineFunctionProperties::Property::IsSSA) + .set(MachineFunctionProperties::Property::NoVRegs); + } }; } // end anonymous namespace @@ -197,7 +206,8 @@ static void updateLiveness(MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI) { EntryBB.sortUniqueLiveIns(); } -bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { +bool SILowerSGPRSpills::spillCalleeSavedRegs( + MachineFunction &MF, SmallVectorImpl<int> &CalleeSavedFIs) { MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -228,6 +238,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { TRI->getSpillAlign(*RC), true); CSI.push_back(CalleeSavedInfo(Reg, JunkFI)); + CalleeSavedFIs.push_back(JunkFI); } } @@ -248,6 +259,52 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { return false; } +void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF, + LiveIntervals *LIS) { + // TODO: This is a workaround to avoid the unmodelled liveness computed with + // whole-wave virtual registers when allocated together with the regular VGPR + // virtual registers. Presently, the liveness computed during the regalloc is + // only uniform (or single lane aware) and it doesn't take account of the + // divergent control flow that exists for our GPUs. Since the WWM registers + // can modify inactive lanes, the wave-aware liveness should be computed for + // the virtual registers to accurately plot their interferences. Without + // having the divergent CFG for the function, it is difficult to implement the + // wave-aware liveness info. Until then, we conservatively extend the liveness + // of the wwm registers into the entire function so that they won't be reused + // without first spilling/splitting their liveranges. + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // Insert the IMPLICIT_DEF for the wwm-registers in the entry blocks. + for (auto Reg : MFI->getSGPRSpillVGPRs()) { + for (MachineBasicBlock *SaveBlock : SaveBlocks) { + MachineBasicBlock::iterator InsertBefore = SaveBlock->begin(); + auto MIB = BuildMI(*SaveBlock, *InsertBefore, InsertBefore->getDebugLoc(), + TII->get(AMDGPU::IMPLICIT_DEF), Reg); + MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG); + // Set SGPR_SPILL asm printer flag + MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL); + if (LIS) { + LIS->InsertMachineInstrInMaps(*MIB); + } + } + } + + // Insert the KILL in the return blocks to extend their liveness untill the + // end of function. Insert a separate KILL for each VGPR. + for (MachineBasicBlock *RestoreBlock : RestoreBlocks) { + MachineBasicBlock::iterator InsertBefore = + RestoreBlock->getFirstTerminator(); + for (auto Reg : MFI->getSGPRSpillVGPRs()) { + auto MIB = + BuildMI(*RestoreBlock, *InsertBefore, InsertBefore->getDebugLoc(), + TII->get(TargetOpcode::KILL)); + MIB.addReg(Reg); + if (LIS) + LIS->InsertMachineInstrInMaps(*MIB); + } + } +} + bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); @@ -261,7 +318,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // First, expose any CSR SGPR spills. This is mostly the same as what PEI // does, but somewhat simpler. 
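
//===----------------------------------------------------------------------===//
// Illustration (not from the patch): this pass spills SGPRs into individual
// lanes of VGPRs (allocateSGPRSpillToVGPRLane below, lowered to
// V_WRITELANE/V_READLANE). Because those lane writes hold data for lanes that
// may be inactive under the current EXEC mask, the VGPRs are treated as
// whole-wave registers and their liveness is extended as done in
// extendWWMVirtRegLiveness above. A conceptual model of the lane storage;
// wave64 and the type name are assumptions for the example:

#include <array>
#include <cassert>
#include <cstdint>

struct SpillVGPR {
  std::array<uint32_t, 64> Lanes{}; // one 32-bit value per lane
  void writeLane(unsigned Lane, uint32_t SGPRValue) { Lanes[Lane] = SGPRValue; }
  uint32_t readLane(unsigned Lane) const { return Lanes[Lane]; }
};

int main() {
  SpillVGPR V;
  V.writeLane(0, 0xDEADBEEFu); // spill one SGPR into lane 0
  V.writeLane(1, 0x12345678u); // and another into lane 1
  assert(V.readLane(0) == 0xDEADBEEFu && V.readLane(1) == 0x12345678u);
}
//===----------------------------------------------------------------------===//
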
calculateSaveRestoreBlocks(MF); - bool HasCSRs = spillCalleeSavedRegs(MF); + SmallVector<int> CalleeSavedFIs; + bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -275,6 +333,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { bool MadeChange = false; bool NewReservedRegs = false; + bool SpilledToVirtVGPRLanes = false; // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be // handled as SpilledToReg in regular PrologEpilogInserter. @@ -297,23 +356,51 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); - if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { - NewReservedRegs = true; - bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( - MI, FI, nullptr, Indexes, LIS); - (void)Spilled; - assert(Spilled && "failed to spill SGPR to VGPR when allocated"); - SpillFIs.set(FI); + + bool IsCalleeSaveSGPRSpill = llvm::is_contained(CalleeSavedFIs, FI); + if (IsCalleeSaveSGPRSpill) { + // Spill callee-saved SGPRs into physical VGPR lanes. + + // TODO: This is to ensure the CFIs are static for efficient frame + // unwinding in the debugger. Spilling them into virtual VGPR lanes + // involve regalloc to allocate the physical VGPRs and that might + // cause intermediate spill/split of such liveranges for successful + // allocation. This would result in broken CFI encoding unless the + // regalloc aware CFI generation to insert new CFIs along with the + // intermediate spills is implemented. There is no such support + // currently exist in the LLVM compiler. + if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) { + NewReservedRegs = true; + bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( + MI, FI, nullptr, Indexes, LIS, true); + if (!Spilled) + llvm_unreachable( + "failed to spill SGPR to physical VGPR lane when allocated"); + } + } else { + if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { + bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( + MI, FI, nullptr, Indexes, LIS); + if (!Spilled) + llvm_unreachable( + "failed to spill SGPR to virtual VGPR lane when allocated"); + SpillFIs.set(FI); + SpilledToVirtVGPRLanes = true; + } } } } - // FIXME: Adding to live-ins redundant with reserving registers. - for (MachineBasicBlock &MBB : MF) { - for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) - MBB.addLiveIn(Reg); - MBB.sortUniqueLiveIns(); + if (SpilledToVirtVGPRLanes) { + extendWWMVirtRegLiveness(MF, LIS); + if (LIS) { + // Compute the LiveInterval for the newly created virtual registers. + for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) + LIS->createAndComputeVirtRegInterval(Reg); + } + } + for (MachineBasicBlock &MBB : MF) { // FIXME: The dead frame indices are replaced with a null register from // the debug value instructions. We should instead, update it with the // correct register value. But not sure the register value alone is @@ -334,6 +421,10 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // lane". FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false); + MadeChange = true; + } + + if (SpilledToVirtVGPRLanes) { const TargetRegisterClass *RC = TRI->getWaveMaskRegClass(); // Shift back the reserved SGPR for EXEC copy into the lowest range. 
// This SGPR is reserved to handle the whole-wave spill/copy operations @@ -342,20 +433,21 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) < TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy())) FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR); - - MadeChange = true; } else { - // No SGPR spills and hence there won't be any WWM spills/copies. Reset the - // SGPR reserved for EXEC copy. + // No SGPR spills to virtual VGPR lanes and hence there won't be any WWM + // spills/copies. Reset the SGPR reserved for EXEC copy. FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister); } SaveBlocks.clear(); RestoreBlocks.clear(); - // Updated the reserved registers with any VGPRs added for SGPR spills. - if (NewReservedRegs) - MRI.freezeReservedRegs(MF); + // Updated the reserved registers with any physical VGPRs added for SGPR + // spills. + if (NewReservedRegs) { + for (Register Reg : FuncInfo->getWWMReservedRegs()) + MRI.reserveReg(Reg, TRI); + } return MadeChange; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp new file mode 100644 index 000000000000..9c3cd1bbd6b0 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp @@ -0,0 +1,141 @@ +//===-- SILowerWWMCopies.cpp - Lower Copies after regalloc ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Lowering the WWM_COPY instructions for various register classes. +/// AMDGPU target generates WWM_COPY instruction to differentiate WWM +/// copy from COPY. This pass generates the necessary exec mask manipulation +/// instructions to replicate 'Whole Wave Mode' and lowers WWM_COPY back to +/// COPY. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-lower-wwm-copies" + +namespace { + +class SILowerWWMCopies : public MachineFunctionPass { +public: + static char ID; + + SILowerWWMCopies() : MachineFunctionPass(ID) { + initializeSILowerWWMCopiesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "SI Lower WWM Copies"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + bool isSCCLiveAtMI(const MachineInstr &MI); + void addToWWMSpills(MachineFunction &MF, Register Reg); + + LiveIntervals *LIS; + SlotIndexes *Indexes; + VirtRegMap *VRM; + const SIRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + SIMachineFunctionInfo *MFI; +}; + +} // End anonymous namespace. 
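
//===----------------------------------------------------------------------===//
// Illustration (not from the patch): as described in the file comment above,
// each WWM_COPY is bracketed with code that saves EXEC, enables all lanes
// (insertScratchExecCopy), performs the copy, and then restores EXEC
// (restoreExec), so that data held in currently inactive lanes is copied as
// well. A plain C++ simulation of that effect; wave64 and the helper names
// here are assumptions for the example:

#include <array>
#include <cassert>
#include <cstdint>

using VGPR = std::array<uint32_t, 64>;

// A copy only updates lanes whose EXEC bit is set.
void copyUnderExec(VGPR &Dst, const VGPR &Src, uint64_t Exec) {
  for (unsigned Lane = 0; Lane < 64; ++Lane)
    if (Exec & (1ull << Lane))
      Dst[Lane] = Src[Lane];
}

int main() {
  VGPR Src{}, Dst{};
  for (unsigned Lane = 0; Lane < 64; ++Lane)
    Src[Lane] = Lane + 1;

  uint64_t Exec = 0x1;             // only lane 0 is currently active
  const uint64_t SavedExec = Exec; // save EXEC (insertScratchExecCopy)
  Exec = ~0ull;                    // whole wave mode: all lanes enabled
  copyUnderExec(Dst, Src, Exec);   // the lowered COPY
  Exec = SavedExec;                // restore EXEC (restoreExec)

  assert(Dst == Src); // inactive lanes were copied too
}
//===----------------------------------------------------------------------===//
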
+ +INITIALIZE_PASS_BEGIN(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies", + false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_END(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies", false, + false) + +char SILowerWWMCopies::ID = 0; + +char &llvm::SILowerWWMCopiesID = SILowerWWMCopies::ID; + +bool SILowerWWMCopies::isSCCLiveAtMI(const MachineInstr &MI) { + // We can't determine the liveness info if LIS isn't available. Early return + // in that case and always assume SCC is live. + if (!LIS) + return true; + + LiveRange &LR = + LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + SlotIndex Idx = LIS->getInstructionIndex(MI); + return LR.liveAt(Idx); +} + +// If \p Reg is assigned with a physical VGPR, add the latter into wwm-spills +// for preserving its entire lanes at function prolog/epilog. +void SILowerWWMCopies::addToWWMSpills(MachineFunction &MF, Register Reg) { + if (Reg.isPhysical()) + return; + + Register PhysReg = VRM->getPhys(Reg); + assert(PhysReg != VirtRegMap::NO_PHYS_REG && + "should have allocated a physical register"); + + MFI->allocateWWMSpill(MF, PhysReg); +} + +bool SILowerWWMCopies::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + MFI = MF.getInfo<SIMachineFunctionInfo>(); + LIS = getAnalysisIfAvailable<LiveIntervals>(); + Indexes = getAnalysisIfAvailable<SlotIndexes>(); + VRM = getAnalysisIfAvailable<VirtRegMap>(); + TRI = ST.getRegisterInfo(); + MRI = &MF.getRegInfo(); + + if (!MFI->hasVRegFlags()) + return false; + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() != AMDGPU::WWM_COPY) + continue; + + // TODO: Club adjacent WWM ops between same exec save/restore + assert(TII->isVGPRCopy(MI)); + + // For WWM vector copies, manipulate the exec mask around the copy + // instruction. 
+ const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock::iterator InsertPt = MI.getIterator(); + Register RegForExecCopy = MFI->getSGPRForEXECCopy(); + TII->insertScratchExecCopy(MF, MBB, InsertPt, DL, RegForExecCopy, + isSCCLiveAtMI(MI), Indexes); + TII->restoreExec(MF, MBB, ++InsertPt, DL, RegForExecCopy, Indexes); + addToWWMSpills(MF, MI.getOperand(0).getReg()); + LLVM_DEBUG(dbgs() << "WWM copy manipulation for " << MI); + + // Lower WWM_COPY back to COPY + MI.setDesc(TII->get(AMDGPU::COPY)); + Changed |= true; + } + } + + return Changed; +} diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index c9376d0ea653..e8142244b7db 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -7,17 +7,18 @@ //===----------------------------------------------------------------------===// #include "SIMachineFunctionInfo.h" -#include "AMDGPUTargetMachine.h" #include "AMDGPUSubtarget.h" -#include "SIRegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIRegisterInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" @@ -36,28 +37,12 @@ const GCNTargetMachine &getTM(const GCNSubtarget *STI) { SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, const GCNSubtarget *STI) - : AMDGPUMachineFunction(F, *STI), - Mode(F), - GWSResourcePSV(getTM(STI)), - PrivateSegmentBuffer(false), - DispatchPtr(false), - QueuePtr(false), - KernargSegmentPtr(false), - DispatchID(false), - FlatScratchInit(false), - WorkGroupIDX(false), - WorkGroupIDY(false), - WorkGroupIDZ(false), - WorkGroupInfo(false), - LDSKernelId(false), - PrivateSegmentWaveByteOffset(false), - WorkItemIDX(false), - WorkItemIDY(false), - WorkItemIDZ(false), - ImplicitBufferPtr(false), - ImplicitArgPtr(false), - GITPtrHigh(0xffffffff), - HighBitsOf32BitAddress(0) { + : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)), + UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false), + WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false), + PrivateSegmentWaveByteOffset(false), WorkItemIDX(false), + WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false), + GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) { const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); @@ -67,16 +52,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, VRegFlags.reserve(1024); - // FIXME: Should have analysis or something rather than attribute to detect - // calls. 
- const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); - const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; if (IsKernel) { - if (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0) - KernargSegmentPtr = true; WorkGroupIDX = true; WorkItemIDX = true; } else if (CC == CallingConv::AMDGPU_PS) { @@ -85,7 +64,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, MayNeedAGPRs = ST.hasMAIInsts(); - if (!isEntryFunction()) { + if (AMDGPU::isChainCC(CC)) { + // Chain functions don't receive an SP from their caller, but are free to + // set one up. For now, we can use s32 to match what amdgpu_gfx functions + // would use if called, but this can be revisited. + // FIXME: Only reserve this if we actually need it. + StackPtrOffsetReg = AMDGPU::SGPR32; + + ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51; + + ArgInfo.PrivateSegmentBuffer = + ArgDescriptor::createRegister(ScratchRSrcReg); + + ImplicitArgPtr = false; + } else if (!isEntryFunction()) { if (CC != CallingConv::AMDGPU_Gfx) ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; @@ -115,12 +107,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, MayNeedAGPRs = false; // We will select all MAI with VGPR operands. } - bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); - if (isAmdHsaOrMesa && !ST.enableFlatScratch()) - PrivateSegmentBuffer = true; - else if (ST.isMesaGfxShader(F)) - ImplicitBufferPtr = true; - if (!AMDGPU::isGraphics(CC) || (CC == CallingConv::AMDGPU_CS && ST.hasArchitectedSGPRs())) { if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x")) @@ -145,33 +131,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, ST.getMaxWorkitemID(F, 2) != 0) WorkItemIDZ = true; - if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) - DispatchPtr = true; - - if (!F.hasFnAttribute("amdgpu-no-queue-ptr")) - QueuePtr = true; - - if (!F.hasFnAttribute("amdgpu-no-dispatch-id")) - DispatchID = true; - if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id")) LDSKernelId = true; } - // FIXME: This attribute is a hack, we just need an analysis on the function - // to look for allocas. - bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); - - // TODO: This could be refined a lot. The attribute is a poor way of - // detecting calls or stack objects that may require it before argument - // lowering. - if (ST.hasFlatAddressSpace() && isEntryFunction() && - (isAmdHsaOrMesa || ST.enableFlatScratch()) && - (HasCalls || HasStackObjects || ST.enableFlatScratch()) && - !ST.flatScratchIsArchitected()) { - FlatScratchInit = true; - } - if (isEntryFunction()) { // X, XY, and XYZ are the only supported combinations, so make sure Y is // enabled if Z is. @@ -280,12 +243,47 @@ Register SIMachineFunctionInfo::addLDSKernelId() { return ArgInfo.LDSKernelId.getRegister(); } +SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg( + const SIRegisterInfo &TRI, const TargetRegisterClass *RC, + unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) { + assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) && + "Preload kernel argument allocated twice."); + NumUserSGPRs += PaddingSGPRs; + // If the available register tuples are aligned with the kernarg to be + // preloaded use that register, otherwise we need to use a set of SGPRs and + // merge them. 
+ Register PreloadReg = + TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC); + if (PreloadReg && + (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) { + ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(PreloadReg); + NumUserSGPRs += AllocSizeDWord; + } else { + for (unsigned I = 0; I < AllocSizeDWord; ++I) { + ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(getNextUserSGPR()); + NumUserSGPRs++; + } + } + + // Track the actual number of SGPRs that HW will preload to. + UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs); + return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs; +} + void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size, Align Alignment) { // Skip if it is an entry function or the register is already added. if (isEntryFunction() || WWMSpills.count(VGPR)) return; + // Skip if this is a function with the amdgpu_cs_chain or + // amdgpu_cs_chain_preserve calling convention and this is a scratch register. + // We never need to allocate a spill for these because we don't even need to + // restore the inactive lanes for them (they're scratchier than the usual + // scratch registers). + if (isChainFunction() && SIRegisterInfo::isChainScratchRegister(VGPR)) + return; + WWMSpills.insert(std::make_pair( VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment))); } @@ -314,37 +312,23 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs, return false; } -bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF, - int FI, - unsigned LaneIndex) { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); +bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills( + MachineFunction &MF, int FI, unsigned LaneIndex) { MachineRegisterInfo &MRI = MF.getRegInfo(); Register LaneVGPR; if (!LaneIndex) { - LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); - if (LaneVGPR == AMDGPU::NoRegister) { - // We have no VGPRs left for spilling SGPRs. Reset because we will not - // partially spill the SGPR to VGPRs. - SGPRSpillToVGPRLanes.erase(FI); - return false; - } - + LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); SpillVGPRs.push_back(LaneVGPR); - // Add this register as live-in to all blocks to avoid machine verifier - // complaining about use of an undefined physical register. - for (MachineBasicBlock &BB : MF) - BB.addLiveIn(LaneVGPR); } else { LaneVGPR = SpillVGPRs.back(); } - SGPRSpillToVGPRLanes[FI].push_back( + SGPRSpillsToVirtualVGPRLanes[FI].push_back( SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); return true; } -bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills( +bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills( MachineFunction &MF, int FI, unsigned LaneIndex) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -355,16 +339,22 @@ bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills( if (LaneVGPR == AMDGPU::NoRegister) { // We have no VGPRs left for spilling SGPRs. Reset because we will not // partially spill the SGPR to VGPRs. 
- PrologEpilogSGPRSpillToVGPRLanes.erase(FI); + SGPRSpillsToPhysicalVGPRLanes.erase(FI); return false; } allocateWWMSpill(MF, LaneVGPR); + reserveWWMRegister(LaneVGPR); + for (MachineBasicBlock &MBB : MF) { + MBB.addLiveIn(LaneVGPR); + MBB.sortUniqueLiveIns(); + } + SpillPhysVGPRs.push_back(LaneVGPR); } else { - LaneVGPR = WWMSpills.back().first; + LaneVGPR = SpillPhysVGPRs.back(); } - PrologEpilogSGPRSpillToVGPRLanes[FI].push_back( + SGPRSpillsToPhysicalVGPRLanes[FI].push_back( SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); return true; } @@ -373,8 +363,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI, bool IsPrologEpilog) { std::vector<SIRegisterInfo::SpilledReg> &SpillLanes = - IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI] - : SGPRSpillToVGPRLanes[FI]; + IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI] + : SGPRSpillsToVirtualVGPRLanes[FI]; // This has already been allocated. if (!SpillLanes.empty()) @@ -395,15 +385,14 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF, "not spilling SGPRs to VGPRs"); unsigned &NumSpillLanes = - IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes; + IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes; for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) { unsigned LaneIndex = (NumSpillLanes % WaveSize); - bool Allocated = - IsPrologEpilog - ? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex) - : allocateVGPRForSGPRSpills(MF, FI, LaneIndex); + bool Allocated = IsPrologEpilog + ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex) + : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex); if (!Allocated) { NumSpillLanes -= I; return false; @@ -484,16 +473,25 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF, bool SIMachineFunctionInfo::removeDeadFrameIndices( MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) { - // Remove dead frame indices from function frame. And also make sure to remove - // the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it - // could result in an unexpected side effect and bug, in case of any - // re-mapping of freed frame indices by later pass(es) like "stack slot + // Remove dead frame indices from function frame, however keep FP & BP since + // spills for them haven't been inserted yet. And also make sure to remove the + // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure, + // otherwise, it could result in an unexpected side effect and bug, in case of + // any re-mapping of freed frame indices by later pass(es) like "stack slot // coloring". - for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) { + for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) { MFI.RemoveStackObject(R.first); - SGPRSpillToVGPRLanes.erase(R.first); + SGPRSpillsToVirtualVGPRLanes.erase(R.first); } + // Remove the dead frame indices of CSR SGPRs which are spilled to physical + // VGPR lanes during SILowerSGPRSpills pass. 
+ if (!ResetSGPRSpillStackIDs) { + for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) { + MFI.RemoveStackObject(R.first); + SGPRSpillsToPhysicalVGPRLanes.erase(R.first); + } + } bool HaveSGPRToMemory = false; if (ResetSGPRSpillStackIDs) { @@ -522,7 +520,7 @@ int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI) { if (ScavengeFI) return *ScavengeFI; - if (isEntryFunction()) { + if (isBottomOfStack()) { ScavengeFI = MFI.CreateFixedObject( TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false); } else { @@ -608,6 +606,7 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo, return true; }; + // TODO: Need to serialize kernarg preloads. bool Any = false; Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer); Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr); @@ -730,7 +729,7 @@ bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const { for (const auto &CI : IA->ParseConstraints()) { for (StringRef Code : CI.Codes) { Code.consume_front("{"); - if (Code.startswith("a")) + if (Code.starts_with("a")) return true; } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 3b4747adf125..dc63ae44c528 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -16,10 +16,12 @@ #include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUMachineFunction.h" #include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "SIModeRegisterDefaults.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Support/raw_ostream.h" @@ -256,6 +258,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { uint32_t GDSSize = 0; Align DynLDSAlign; bool IsEntryFunction = false; + bool IsChainFunction = false; bool NoSignedZerosFPMath = false; bool MemoryBound = false; bool WaveLimiter = false; @@ -304,6 +307,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("gdsSize", MFI.GDSSize, 0u); YamlIO.mapOptional("dynLDSAlign", MFI.DynLDSAlign, Align()); YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false); + YamlIO.mapOptional("isChainFunction", MFI.IsChainFunction, false); YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false); YamlIO.mapOptional("memoryBound", MFI.MemoryBound, false); YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false); @@ -434,13 +438,9 @@ private: unsigned NumSpilledSGPRs = 0; unsigned NumSpilledVGPRs = 0; - // Feature bits required for inputs passed in user SGPRs. - bool PrivateSegmentBuffer : 1; - bool DispatchPtr : 1; - bool QueuePtr : 1; - bool KernargSegmentPtr : 1; - bool DispatchID : 1; - bool FlatScratchInit : 1; + // Tracks information about user SGPRs that will be setup by hardware which + // will apply to all wavefronts of the grid. + GCNUserSGPRUsageInfo UserSGPRInfo; // Feature bits required for inputs passed in system SGPRs. bool WorkGroupIDX : 1; // Always initialized. 
@@ -454,11 +454,6 @@ private: bool WorkItemIDY : 1; bool WorkItemIDZ : 1; - // Private memory buffer - // Compute directly in sgpr[0:1] - // Other shaders indirect 64-bits at sgpr[0:1] - bool ImplicitBufferPtr : 1; - // Pointer to where the ABI inserts special kernel arguments separate from the // user arguments. This is an offset from the KernargSegmentPtr. bool ImplicitArgPtr : 1; @@ -496,16 +491,18 @@ public: }; private: - // To track VGPR + lane index for each subregister of the SGPR spilled to - // frameindex key during SILowerSGPRSpills pass. - DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> SGPRSpillToVGPRLanes; - // To track VGPR + lane index for spilling special SGPRs like Frame Pointer - // identified during PrologEpilogInserter. + // To track virtual VGPR + lane index for each subregister of the SGPR spilled + // to frameindex key during SILowerSGPRSpills pass. + DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> + SGPRSpillsToVirtualVGPRLanes; + // To track physical VGPR + lane index for CSR SGPR spills and special SGPRs + // like Frame Pointer identified during PrologEpilogInserter. DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> - PrologEpilogSGPRSpillToVGPRLanes; - unsigned NumVGPRSpillLanes = 0; - unsigned NumVGPRPrologEpilogSpillLanes = 0; + SGPRSpillsToPhysicalVGPRLanes; + unsigned NumVirtualVGPRSpillLanes = 0; + unsigned NumPhysicalVGPRSpillLanes = 0; SmallVector<Register, 2> SpillVGPRs; + SmallVector<Register, 2> SpillPhysVGPRs; using WWMSpillsMap = MapVector<Register, int>; // To track the registers used in instructions that can potentially modify the // inactive lanes. The WWM instructions and the writelane instructions for @@ -548,10 +545,10 @@ private: private: Register VGPRForAGPRCopy; - bool allocateVGPRForSGPRSpills(MachineFunction &MF, int FI, - unsigned LaneIndex); - bool allocateVGPRForPrologEpilogSGPRSpills(MachineFunction &MF, int FI, - unsigned LaneIndex); + bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI, + unsigned LaneIndex); + bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI, + unsigned LaneIndex); public: Register getVGPRForAGPRCopy() const { @@ -583,9 +580,9 @@ public: SIModeRegisterDefaults getMode() const { return Mode; } ArrayRef<SIRegisterInfo::SpilledReg> - getSGPRSpillToVGPRLanes(int FrameIndex) const { - auto I = SGPRSpillToVGPRLanes.find(FrameIndex); - return (I == SGPRSpillToVGPRLanes.end()) + getSGPRSpillToVirtualVGPRLanes(int FrameIndex) const { + auto I = SGPRSpillsToVirtualVGPRLanes.find(FrameIndex); + return (I == SGPRSpillsToVirtualVGPRLanes.end()) ? ArrayRef<SIRegisterInfo::SpilledReg>() : ArrayRef(I->second); } @@ -598,6 +595,10 @@ public: return PrologEpilogSGPRSpills; } + GCNUserSGPRUsageInfo &getUserSGPRInfo() { return UserSGPRInfo; } + + const GCNUserSGPRUsageInfo &getUserSGPRInfo() const { return UserSGPRInfo; } + void addToPrologEpilogSGPRSpills(Register Reg, PrologEpilogSGPRSaveRestoreInfo SI) { PrologEpilogSGPRSpills.insert(std::make_pair(Reg, SI)); @@ -647,9 +648,9 @@ public: } ArrayRef<SIRegisterInfo::SpilledReg> - getPrologEpilogSGPRSpillToVGPRLanes(int FrameIndex) const { - auto I = PrologEpilogSGPRSpillToVGPRLanes.find(FrameIndex); - return (I == PrologEpilogSGPRSpillToVGPRLanes.end()) + getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const { + auto I = SGPRSpillsToPhysicalVGPRLanes.find(FrameIndex); + return (I == SGPRSpillsToPhysicalVGPRLanes.end()) ? 
ArrayRef<SIRegisterInfo::SpilledReg>() : ArrayRef(I->second); } @@ -667,6 +668,8 @@ public: return VRegFlags.inBounds(Reg) && VRegFlags[Reg] & Flag; } + bool hasVRegFlags() { return VRegFlags.size(); } + void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4, Align Alignment = Align(4)); @@ -728,6 +731,10 @@ public: Register addFlatScratchInit(const SIRegisterInfo &TRI); Register addImplicitBufferPtr(const SIRegisterInfo &TRI); Register addLDSKernelId(); + SmallVectorImpl<MCRegister> * + addPreloadedKernArg(const SIRegisterInfo &TRI, const TargetRegisterClass *RC, + unsigned AllocSizeDWord, int KernArgIdx, + int PaddingSGPRs); /// Increment user SGPRs used for padding the argument list only. Register addReservedUserSGPR() { @@ -775,6 +782,8 @@ public: return ArgInfo.WorkGroupInfo.getRegister(); } + bool hasLDSKernelId() const { return LDSKernelId; } + // Add special VGPR inputs void setWorkItemIDX(ArgDescriptor Arg) { ArgInfo.WorkItemIDX = Arg; @@ -799,30 +808,6 @@ public: ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg); } - bool hasPrivateSegmentBuffer() const { - return PrivateSegmentBuffer; - } - - bool hasDispatchPtr() const { - return DispatchPtr; - } - - bool hasQueuePtr() const { - return QueuePtr; - } - - bool hasKernargSegmentPtr() const { - return KernargSegmentPtr; - } - - bool hasDispatchID() const { - return DispatchID; - } - - bool hasFlatScratchInit() const { - return FlatScratchInit; - } - bool hasWorkGroupIDX() const { return WorkGroupIDX; } @@ -839,8 +824,6 @@ public: return WorkGroupInfo; } - bool hasLDSKernelId() const { return LDSKernelId; } - bool hasPrivateSegmentWaveByteOffset() const { return PrivateSegmentWaveByteOffset; } @@ -861,10 +844,6 @@ public: return ImplicitArgPtr; } - bool hasImplicitBufferPtr() const { - return ImplicitBufferPtr; - } - AMDGPUFunctionArgInfo &getArgInfo() { return ArgInfo; } @@ -901,6 +880,10 @@ public: return NumUserSGPRs + NumSystemSGPRs; } + unsigned getNumKernargPreloadedSGPRs() const { + return UserSGPRInfo.getNumKernargPreloadSGPRs(); + } + Register getPrivateSegmentWaveByteOffsetSystemSGPR() const { return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index bc48f7b76c6d..10ec54d3317f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1055,7 +1055,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, VMCnt ? 0 : getVmcntBitMask(IV), getExpcntBitMask(IV), LGKMCnt ? 0 : getLgkmcntBitMask(IV)); - BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) + .addImm(WaitCntImmediate); Changed = true; } @@ -1963,14 +1964,15 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, VMCnt ? 0 : getVmcntBitMask(IV), getExpcntBitMask(IV), LGKMCnt ? 
0 : getLgkmcntBitMask(IV)); - BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) + .addImm(WaitCntImmediate); Changed = true; } if (VSCnt) { - BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(0); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); Changed = true; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp index 413ef5d162a7..2684a1e3c335 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp @@ -7,20 +7,26 @@ //===----------------------------------------------------------------------===// #include "SIModeRegisterDefaults.h" +#include "GCNSubtarget.h" using namespace llvm; -SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) { +SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F, + const GCNSubtarget &ST) { *this = getDefaultForCallingConv(F.getCallingConv()); - StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString(); - if (!IEEEAttr.empty()) - IEEE = IEEEAttr == "true"; + if (ST.hasIEEEMode()) { + StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString(); + if (!IEEEAttr.empty()) + IEEE = IEEEAttr == "true"; + } - StringRef DX10ClampAttr = - F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString(); - if (!DX10ClampAttr.empty()) - DX10Clamp = DX10ClampAttr == "true"; + if (ST.hasDX10ClampMode()) { + StringRef DX10ClampAttr = + F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString(); + if (!DX10ClampAttr.empty()) + DX10Clamp = DX10ClampAttr == "true"; + } StringRef DenormF32Attr = F.getFnAttribute("denormal-fp-math-f32").getValueAsString(); @@ -36,3 +42,135 @@ SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) { FP64FP16Denormals = DenormMode; } } + +using namespace AMDGPU; + +/// Combine f32 and f64 rounding modes into a combined rounding mode value. 
+static constexpr uint32_t getModeRegisterRoundMode(uint32_t HWFP32Val, + uint32_t HWFP64Val) { + return HWFP32Val << F32FltRoundOffset | HWFP64Val << F64FltRoundOffset; +} + +static constexpr uint64_t encodeFltRoundsTable(uint32_t FltRoundsVal, + uint32_t HWF32Val, + uint32_t HWF64Val) { + uint32_t ModeVal = getModeRegisterRoundMode(HWF32Val, HWF64Val); + if (FltRoundsVal > TowardNegative) + FltRoundsVal -= ExtendedFltRoundOffset; + + uint32_t BitIndex = ModeVal << 2; + return static_cast<uint64_t>(FltRoundsVal) << BitIndex; +} + +// Encode FLT_ROUNDS value where the two rounding modes are the same and use a +// standard value +static constexpr uint64_t +encodeFltRoundsTableSame(AMDGPUFltRounds FltRoundsMode, uint32_t HWVal) { + return encodeFltRoundsTable(FltRoundsMode, HWVal, HWVal); +} + +// Convert mode register encoded rounding mode to AMDGPUFltRounds +static constexpr AMDGPUFltRounds +decodeIndexFltRoundConversionTable(uint32_t HWMode) { + uint32_t TableRead = (FltRoundConversionTable >> (HWMode << 2)) & 0xf; + if (TableRead > TowardNegative) + TableRead += ExtendedFltRoundOffset; + return static_cast<AMDGPUFltRounds>(TableRead); +} + +static constexpr uint32_t HWTowardZero = FP_ROUND_ROUND_TO_ZERO; +static constexpr uint32_t HWNearestTiesToEven = FP_ROUND_ROUND_TO_NEAREST; +static constexpr uint32_t HWTowardPositive = FP_ROUND_ROUND_TO_INF; +static constexpr uint32_t HWTowardNegative = FP_ROUND_ROUND_TO_NEGINF; + +const uint64_t AMDGPU::FltRoundConversionTable = + encodeFltRoundsTableSame(TowardZeroF32_TowardZeroF64, HWTowardZero) | + encodeFltRoundsTableSame(NearestTiesToEvenF32_NearestTiesToEvenF64, + HWNearestTiesToEven) | + encodeFltRoundsTableSame(TowardPositiveF32_TowardPositiveF64, + HWTowardPositive) | + encodeFltRoundsTableSame(TowardNegativeF32_TowardNegativeF64, + HWTowardNegative) | + + encodeFltRoundsTable(TowardZeroF32_NearestTiesToEvenF64, HWTowardZero, + HWNearestTiesToEven) | + encodeFltRoundsTable(TowardZeroF32_TowardPositiveF64, HWTowardZero, + HWTowardPositive) | + encodeFltRoundsTable(TowardZeroF32_TowardNegativeF64, HWTowardZero, + HWTowardNegative) | + + encodeFltRoundsTable(NearestTiesToEvenF32_TowardZeroF64, + HWNearestTiesToEven, HWTowardZero) | + encodeFltRoundsTable(NearestTiesToEvenF32_TowardPositiveF64, + HWNearestTiesToEven, HWTowardPositive) | + encodeFltRoundsTable(NearestTiesToEvenF32_TowardNegativeF64, + HWNearestTiesToEven, HWTowardNegative) | + + encodeFltRoundsTable(TowardPositiveF32_TowardZeroF64, HWTowardPositive, + HWTowardZero) | + encodeFltRoundsTable(TowardPositiveF32_NearestTiesToEvenF64, + HWTowardPositive, HWNearestTiesToEven) | + encodeFltRoundsTable(TowardPositiveF32_TowardNegativeF64, HWTowardPositive, + HWTowardNegative) | + + encodeFltRoundsTable(TowardNegativeF32_TowardZeroF64, HWTowardNegative, + HWTowardZero) | + encodeFltRoundsTable(TowardNegativeF32_NearestTiesToEvenF64, + HWTowardNegative, HWNearestTiesToEven) | + encodeFltRoundsTable(TowardNegativeF32_TowardPositiveF64, HWTowardNegative, + HWTowardPositive); + +// Verify evaluation of FltRoundConversionTable + +// If both modes are the same, should return the standard values. 
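To make the lookup scheme above concrete: the 2-bit f32 and f64/f16 rounding fields of the MODE register are packed into a 4-bit combined index, and each index owns a 4-bit slot in one 64-bit word. A standalone sketch, using placeholder hardware encodings rather than the target's real FP_ROUND_* values:

// Sketch of the bit-indexed table; compiles on its own and is verified at
// compile time, mirroring the static_asserts that follow.
#include <cstdint>

static constexpr uint32_t F32Off = 0, F64Off = 2;

static constexpr uint32_t combine(uint32_t HWF32, uint32_t HWF64) {
  return (HWF32 << F32Off) | (HWF64 << F64Off); // 4-bit index, 0..15
}

static constexpr uint64_t entry(uint32_t Index, uint32_t Payload) {
  return static_cast<uint64_t>(Payload & 0xf) << (Index << 2); // 4 bits/entry
}

static constexpr uint32_t lookup(uint64_t Table, uint32_t Index) {
  return static_cast<uint32_t>(Table >> (Index << 2)) & 0xf;
}

// Two entries using placeholder encodings 0 (nearest-even) and 3 (toward
// zero); the payloads are the standard FLT_ROUNDS values 1 and 0.
static constexpr uint64_t DemoTable =
    entry(combine(0, 0), 1) | entry(combine(3, 3), 0);

static_assert(lookup(DemoTable, combine(0, 0)) == 1, "both nearest-even");
static_assert(lookup(DemoTable, combine(3, 3)) == 0, "both toward zero");

The real table additionally rebiases the extended enum values by ExtendedFltRoundOffset so every payload still fits in four bits.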
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardZero, HWTowardZero)) == AMDGPUFltRounds::TowardZero); +static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode( + HWNearestTiesToEven, HWNearestTiesToEven)) == + AMDGPUFltRounds::NearestTiesToEven); +static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardPositive, HWTowardPositive)) == + AMDGPUFltRounds::TowardPositive); +static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardNegative, HWTowardNegative)) == + AMDGPUFltRounds::TowardNegative); + +static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardZero, HWNearestTiesToEven)) == + TowardZeroF32_NearestTiesToEvenF64); +static_assert(decodeIndexFltRoundConversionTable( + getModeRegisterRoundMode(HWTowardZero, HWTowardPositive)) == + TowardZeroF32_TowardPositiveF64); +static_assert(decodeIndexFltRoundConversionTable( + getModeRegisterRoundMode(HWTowardZero, HWTowardNegative)) == + TowardZeroF32_TowardNegativeF64); + +static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode( + HWNearestTiesToEven, HWTowardZero)) == + NearestTiesToEvenF32_TowardZeroF64); +static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode( + HWNearestTiesToEven, HWTowardPositive)) == + NearestTiesToEvenF32_TowardPositiveF64); +static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode( + HWNearestTiesToEven, HWTowardNegative)) == + NearestTiesToEvenF32_TowardNegativeF64); + +static_assert(decodeIndexFltRoundConversionTable( + getModeRegisterRoundMode(HWTowardPositive, HWTowardZero)) == + TowardPositiveF32_TowardZeroF64); +static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardPositive, HWNearestTiesToEven)) == + TowardPositiveF32_NearestTiesToEvenF64); +static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardPositive, HWTowardNegative)) == + TowardPositiveF32_TowardNegativeF64); + +static_assert(decodeIndexFltRoundConversionTable( + getModeRegisterRoundMode(HWTowardNegative, HWTowardZero)) == + TowardNegativeF32_TowardZeroF64); +static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardNegative, HWNearestTiesToEven)) == + TowardNegativeF32_NearestTiesToEvenF64); +static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode( + HWTowardNegative, HWTowardPositive)) == + TowardNegativeF32_TowardPositiveF64); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h index df2e3f9bff32..9fbd74c3eede 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h @@ -14,6 +14,8 @@ namespace llvm { +class GCNSubtarget; + // Track defaults for fields in the MODE register. 
struct SIModeRegisterDefaults { /// Floating point opcodes that support exception flag gathering quiet and @@ -40,7 +42,7 @@ struct SIModeRegisterDefaults { FP32Denormals(DenormalMode::getIEEE()), FP64FP16Denormals(DenormalMode::getIEEE()) {} - SIModeRegisterDefaults(const Function &F); + SIModeRegisterDefaults(const Function &F, const GCNSubtarget &ST); static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) { SIModeRegisterDefaults Mode; @@ -85,6 +87,65 @@ struct SIModeRegisterDefaults { } }; +namespace AMDGPU { + +/// Return values used for llvm.get.rounding +/// +/// When both the F32 and F64/F16 modes are the same, returns the standard +/// values. If they differ, returns an extended mode starting at 8. +enum AMDGPUFltRounds : int8_t { + // Inherit everything from RoundingMode + TowardZero = static_cast<int8_t>(RoundingMode::TowardZero), + NearestTiesToEven = static_cast<int8_t>(RoundingMode::NearestTiesToEven), + TowardPositive = static_cast<int8_t>(RoundingMode::TowardPositive), + TowardNegative = static_cast<int8_t>(RoundingMode::TowardNegative), + NearestTiesToAwayUnsupported = + static_cast<int8_t>(RoundingMode::NearestTiesToAway), + + Dynamic = static_cast<int8_t>(RoundingMode::Dynamic), + + // Permute the mismatched rounding mode cases. If the modes are the same, use + // the standard values, otherwise, these values are sorted such that higher + // hardware encoded values have higher enum values. + NearestTiesToEvenF32_NearestTiesToEvenF64 = NearestTiesToEven, + NearestTiesToEvenF32_TowardPositiveF64 = 8, + NearestTiesToEvenF32_TowardNegativeF64 = 9, + NearestTiesToEvenF32_TowardZeroF64 = 10, + + TowardPositiveF32_NearestTiesToEvenF64 = 11, + TowardPositiveF32_TowardPositiveF64 = TowardPositive, + TowardPositiveF32_TowardNegativeF64 = 12, + TowardPositiveF32_TowardZeroF64 = 13, + + TowardNegativeF32_NearestTiesToEvenF64 = 14, + TowardNegativeF32_TowardPositiveF64 = 15, + TowardNegativeF32_TowardNegativeF64 = TowardNegative, + TowardNegativeF32_TowardZeroF64 = 16, + + TowardZeroF32_NearestTiesToEvenF64 = 17, + TowardZeroF32_TowardPositiveF64 = 18, + TowardZeroF32_TowardNegativeF64 = 19, + TowardZeroF32_TowardZeroF64 = TowardZero, + + Invalid = static_cast<int8_t>(RoundingMode::Invalid) +}; + +/// Offset of nonstandard values for llvm.get.rounding results from the largest +/// supported mode. +static constexpr uint32_t ExtendedFltRoundOffset = 4; + +/// Offset in mode register of f32 rounding mode. +static constexpr uint32_t F32FltRoundOffset = 0; + +/// Offset in mode register of f64/f16 rounding mode. +static constexpr uint32_t F64FltRoundOffset = 2; + +// Bit indexed table to convert from hardware rounding mode values to FLT_ROUNDS +// values. 
+extern const uint64_t FltRoundConversionTable; + +} // end namespace AMDGPU + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_SIMODEREGISTERDEFAULTS_H diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 04c9a6457944..e3f54d01eb22 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -10,6 +10,7 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineOperand.h" @@ -32,6 +33,7 @@ class SIOptimizeExecMasking : public MachineFunctionPass { DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping; SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors; + SmallVector<MachineOperand *, 1> KillFlagCandidates; Register isCopyFromExec(const MachineInstr &MI) const; Register isCopyToExec(const MachineInstr &MI) const; @@ -41,15 +43,16 @@ class SIOptimizeExecMasking : public MachineFunctionPass { MachineBasicBlock::reverse_iterator findExecCopy(MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I) const; - bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start, MCRegister Reg, bool UseLiveOuts = false, bool IgnoreStart = false) const; bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const; - MachineInstr *findInstrBackwards(MachineInstr &Origin, - std::function<bool(MachineInstr *)> Pred, - ArrayRef<MCRegister> NonModifiableRegs, - unsigned MaxInstructions = 20) const; + MachineInstr *findInstrBackwards( + MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred, + ArrayRef<MCRegister> NonModifiableRegs, + MachineInstr *Terminator = nullptr, + SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr, + unsigned MaxInstructions = 20) const; bool optimizeExecSequence(); void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI); bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr, @@ -325,11 +328,13 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) { // Backwards-iterate from Origin (for n=MaxInstructions iterations) until either // the beginning of the BB is reached or Pred evaluates to true - which can be // an arbitrary condition based on the current MachineInstr, for instance an -// target instruction. Breaks prematurely by returning nullptr if one of the +// target instruction. Breaks prematurely by returning nullptr if one of the // registers given in NonModifiableRegs is modified by the current instruction. 
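The new Terminator/KillFlagCandidates parameters exist because, per the comment in the scan below, kill flags that sit between the eventual insertion point and the original v_cmp would otherwise leave a register marked dead too early and trip the verifier; they are collected during the backward walk and cleared after the rewrite. A rough standalone model of that scan, with plain structs instead of MachineInstr/MachineOperand and register aliasing ignored:

#include <functional>
#include <vector>

struct Operand {
  unsigned Reg = 0;
  bool IsDef = false;
  bool IsKill = false;
};

struct Instr {
  std::vector<Operand> Ops;
  bool defines(unsigned R) const {
    for (const Operand &O : Ops)
      if (O.IsDef && O.Reg == R)
        return true;
    return false;
  }
};

// Walk backwards from Origin; stop when Pred matches, fail if a protected
// register is redefined, and remember kill operands of protected registers so
// their kill flags can be dropped after the transformation.
static int findInstrBackwardsModel(const std::vector<Instr> &Block, int Origin,
                                   const std::function<bool(const Instr &)> &Pred,
                                   const std::vector<unsigned> &Protected,
                                   std::vector<const Operand *> &KillCandidates,
                                   unsigned MaxInstructions = 20) {
  unsigned Steps = 0;
  for (int I = Origin; I >= 0 && Steps <= MaxInstructions; --I, ++Steps) {
    const Instr &MI = Block[I];
    if (Pred(MI))
      return I;
    for (unsigned R : Protected) {
      if (MI.defines(R))
        return -1; // a protected register is clobbered in between
      for (const Operand &O : MI.Ops)
        if (O.IsKill && O.Reg == R)
          KillCandidates.push_back(&O); // clear this kill flag later
    }
  }
  return -1;
}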
MachineInstr *SIOptimizeExecMasking::findInstrBackwards( MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred, - ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const { + ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator, + SmallVectorImpl<MachineOperand *> *KillFlagCandidates, + unsigned MaxInstructions) const { MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(), E = Origin.getParent()->rend(); unsigned CurrentIteration = 0; @@ -344,6 +349,21 @@ MachineInstr *SIOptimizeExecMasking::findInstrBackwards( for (MCRegister Reg : NonModifiableRegs) { if (A->modifiesRegister(Reg, TRI)) return nullptr; + + // Check for kills that appear after the terminator instruction, that + // would not be detected by clearKillFlags, since they will cause the + // register to be dead at a later place, causing the verifier to fail. + // We use the candidates to clear the kill flags later. + if (Terminator && KillFlagCandidates && A != Terminator && + A->killsRegister(Reg, TRI)) { + for (MachineOperand &MO : A->operands()) { + if (MO.isReg() && MO.isKill()) { + Register Candidate = MO.getReg(); + if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg)) + KillFlagCandidates->push_back(&MO); + } + } + } } ++CurrentIteration; @@ -599,6 +619,9 @@ bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence( if (Src1->isReg()) MRI->clearKillFlags(Src1->getReg()); + for (MachineOperand *MO : KillFlagCandidates) + MO->setIsKill(false); + SaveExecInstr.eraseFromParent(); VCmp.eraseFromParent(); @@ -690,7 +713,8 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence( NonDefRegs.push_back(Src1->getReg()); if (!findInstrBackwards( - MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs)) + MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs, + VCmp, &KillFlagCandidates)) return; if (VCmp) @@ -777,6 +801,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { OrXors.clear(); SaveExecVCmpMapping.clear(); + KillFlagCandidates.clear(); static unsigned SearchWindow = 10; for (MachineBasicBlock &MBB : MF) { unsigned SearchCount = 0; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index e95abae88d7a..8204a70e72d9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -522,9 +522,11 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange( auto *UseBlock = UseMI->getParent(); // Replace uses in Endif block if (UseBlock == Endif) { - if (UseMI->isPHI()) { + if (UseMI->isPHI()) O.setReg(NewReg); - } else { + else if (UseMI->isDebugInstr()) + continue; + else { // DetectDeadLanes may mark register uses as undef without removing // them, in which case a non-phi instruction using the original register // may exist in the Endif block even though the register is not live diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 97b3161c7f98..53fc2c068624 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -546,7 +546,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if 
(Src1->getReg().isPhysical() || Dst->getReg().isPhysical()) + if (!Src1->isReg() || Src1->getReg().isPhysical() || + Dst->getReg().isPhysical()) break; if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || @@ -584,7 +585,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical()) + if (!Src1->isReg() || Src1->getReg().isPhysical() || + Dst->getReg().isPhysical()) break; if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || @@ -647,7 +649,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (Src0->getReg().isPhysical() || Dst->getReg().isPhysical()) + if (!Src0->isReg() || Src0->getReg().isPhysical() || + Dst->getReg().isPhysical()) break; return std::make_unique<SDWASrcOperand>( @@ -675,7 +678,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (ValSrc->getReg().isPhysical() || Dst->getReg().isPhysical()) + if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() || + Dst->getReg().isPhysical()) break; return std::make_unique<SDWASrcOperand>( diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index c2ddfd7881ab..0c57110b4eb1 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -28,6 +28,10 @@ using namespace llvm; #define DEBUG_TYPE "si-pre-allocate-wwm-regs" +static cl::opt<bool> + EnablePreallocateSGPRSpillVGPRs("amdgpu-prealloc-sgpr-spill-vgprs", + cl::init(false), cl::Hidden); + namespace { class SIPreAllocateWWMRegs : public MachineFunctionPass { @@ -56,11 +60,9 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LiveIntervals>(); - AU.addPreserved<LiveIntervals>(); AU.addRequired<VirtRegMap>(); AU.addRequired<LiveRegMatrix>(); - AU.addPreserved<SlotIndexes>(); - AU.setPreservesCFG(); + AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -101,7 +103,7 @@ bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) { LiveInterval &LI = LIS->getInterval(Reg); for (MCRegister PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) { - if (!MRI->isPhysRegUsed(PhysReg) && + if (!MRI->isPhysRegUsed(PhysReg, /*SkipRegMaskTest=*/true) && Matrix->checkInterference(LI, PhysReg) == LiveRegMatrix::IK_Free) { Matrix->assign(LI, PhysReg); assert(PhysReg != 0); @@ -201,6 +203,10 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) { RegClassInfo.runOnMachineFunction(MF); + bool PreallocateSGPRSpillVGPRs = + EnablePreallocateSGPRSpillVGPRs || + MF.getFunction().hasFnAttribute("amdgpu-prealloc-sgpr-spill-vgprs"); + bool RegsAssigned = false; // We use a reverse post-order traversal of the control-flow graph to @@ -217,6 +223,12 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) { MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64) RegsAssigned |= processDef(MI.getOperand(0)); + if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) { + if (!PreallocateSGPRSpillVGPRs) + continue; + RegsAssigned |= processDef(MI.getOperand(0)); + } + if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM || MI.getOpcode() == 
AMDGPU::ENTER_STRICT_WQM || MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 61444b14a56b..87242a4740c8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -320,6 +320,9 @@ bool SIPreEmitPeephole::mustRetainExeczBranch( if (MI.isConditionalBranch()) return true; + if (MI.isMetaInstruction()) + continue; + if (TII->hasUnwantedEffectsWhenEXECEmpty(MI)) return true; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index b6839c8308d8..9ed7aacc0538 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -15,27 +15,48 @@ // #include "SIProgramInfo.h" +#include "GCNSubtarget.h" #include "SIDefines.h" #include "Utils/AMDGPUBaseInfo.h" using namespace llvm; -uint64_t SIProgramInfo::getComputePGMRSrc1() const { - return S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) | - S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) | - S_00B848_PRIV(Priv) | S_00B848_DX10_CLAMP(DX10Clamp) | - S_00B848_DEBUG_MODE(DebugMode) | S_00B848_IEEE_MODE(IEEEMode) | - S_00B848_WGP_MODE(WgpMode) | S_00B848_MEM_ORDERED(MemOrdered); +uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const { + uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) | + S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) | + S_00B848_PRIV(Priv) | S_00B848_DEBUG_MODE(DebugMode) | + S_00B848_WGP_MODE(WgpMode) | S_00B848_MEM_ORDERED(MemOrdered); + + if (ST.hasDX10ClampMode()) + Reg |= S_00B848_DX10_CLAMP(DX10Clamp); + + if (ST.hasIEEEMode()) + Reg |= S_00B848_IEEE_MODE(IEEEMode); + + if (ST.hasRrWGMode()) + Reg |= S_00B848_RR_WG_MODE(RrWgMode); + + return Reg; } -uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC) const { +uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC, + const GCNSubtarget &ST) const { if (AMDGPU::isCompute(CC)) { - return getComputePGMRSrc1(); + return getComputePGMRSrc1(ST); } uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) | S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) | - S_00B848_PRIV(Priv) | S_00B848_DX10_CLAMP(DX10Clamp) | - S_00B848_DEBUG_MODE(DebugMode) | S_00B848_IEEE_MODE(IEEEMode); + S_00B848_PRIV(Priv) | S_00B848_DEBUG_MODE(DebugMode); + + if (ST.hasDX10ClampMode()) + Reg |= S_00B848_DX10_CLAMP(DX10Clamp); + + if (ST.hasIEEEMode()) + Reg |= S_00B848_IEEE_MODE(IEEEMode); + + if (ST.hasRrWGMode()) + Reg |= S_00B848_RR_WG_MODE(RrWgMode); + switch (CC) { case CallingConv::AMDGPU_PS: Reg |= S_00B028_MEM_ORDERED(MemOrdered); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h index aab127e49463..8c26789f936c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -21,6 +21,8 @@ namespace llvm { +class GCNSubtarget; + /// Track resource usage for kernels / entry functions. struct SIProgramInfo { // Fields set in PGM_RSRC1 pm4 packet. 
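The preallocation of SI_SPILL_S32_TO_VGPR defs added to SIPreAllocateWWMRegs above is off by default; it appears to be enabled either through the hidden -amdgpu-prealloc-sgpr-spill-vgprs option or per function via the attribute string the pass checks. A minimal example of setting that attribute from C++, for instance from a front end or an IR pass:

#include "llvm/IR/Function.h"

// Opt a function into SGPR-spill-VGPR preallocation; the string must match
// the "amdgpu-prealloc-sgpr-spill-vgprs" attribute tested in
// SIPreAllocateWWMRegs::runOnMachineFunction.
void enableSpillVGPRPrealloc(llvm::Function &F) {
  F.addFnAttr("amdgpu-prealloc-sgpr-spill-vgprs");
}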
@@ -34,6 +36,7 @@ struct SIProgramInfo { uint32_t IEEEMode = 0; uint32_t WgpMode = 0; // GFX10+ uint32_t MemOrdered = 0; // GFX10+ + uint32_t RrWgMode = 0; // GFX12+ uint64_t ScratchSize = 0; // State used to calculate fields set in PGM_RSRC2 pm4 packet. @@ -85,8 +88,8 @@ struct SIProgramInfo { SIProgramInfo() = default; /// Compute the value of the ComputePGMRsrc1 register. - uint64_t getComputePGMRSrc1() const; - uint64_t getPGMRSrc1(CallingConv::ID CC) const; + uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const; + uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const; /// Compute the value of the ComputePGMRsrc2 register. uint64_t getComputePGMRSrc2() const; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index c2a272166241..021d797344c5 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -19,7 +19,7 @@ #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" @@ -397,6 +397,8 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( case CallingConv::AMDGPU_Gfx: return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList : CSR_AMDGPU_SI_Gfx_SaveList; + case CallingConv::AMDGPU_CS_ChainPreserve: + return CSR_AMDGPU_CS_ChainPreserve_SaveList; default: { // Dummy to not crash RegisterClassInfo. static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; @@ -421,6 +423,11 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, case CallingConv::AMDGPU_Gfx: return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask : CSR_AMDGPU_SI_Gfx_RegMask; + case CallingConv::AMDGPU_CS_Chain: + case CallingConv::AMDGPU_CS_ChainPreserve: + // Calls to these functions never return, so we can pretend everything is + // preserved. + return AMDGPU_AllVGPRs_RegMask; default: return nullptr; } @@ -430,6 +437,10 @@ const uint32_t *SIRegisterInfo::getNoPreservedMask() const { return CSR_AMDGPU_NoRegs_RegMask; } +bool SIRegisterInfo::isChainScratchRegister(Register VGPR) { + return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8; +} + const TargetRegisterClass * SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const { @@ -488,11 +499,11 @@ SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const SIFrameLowering *TFI = ST.getFrameLowering(); const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - // During ISel lowering we always reserve the stack pointer in entry + // During ISel lowering we always reserve the stack pointer in entry and chain // functions, but never actually want to reference it when accessing our own // frame. If we need a frame pointer we use it, but otherwise we can just use // an immediate "0" which we represent by returning NoRegister. - if (FuncInfo->isEntryFunction()) { + if (FuncInfo->isBottomOfStack()) { return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register(); } return TFI->hasFP(MF) ? 
FuncInfo->getFrameOffsetReg() @@ -712,9 +723,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) reserveRegisterTuples(Reserved, Reg); - for (auto Reg : MFI->getSGPRSpillVGPRs()) - reserveRegisterTuples(Reserved, Reg); - return Reserved; } @@ -725,12 +733,12 @@ bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF, bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - // On entry, the base address is 0, so it can't possibly need any more - // alignment. + // On entry or in chain functions, the base address is 0, so it can't possibly + // need any more alignment. // FIXME: Should be able to specify the entry frame alignment per calling // convention instead. - if (Info->isEntryFunction()) + if (Info->isBottomOfStack()) return false; return TargetRegisterInfo::shouldRealignStack(MF); @@ -796,10 +804,10 @@ bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { int64_t FullOffset = Offset + getScratchInstrOffset(MI); + const SIInstrInfo *TII = ST.getInstrInfo(); if (SIInstrInfo::isMUBUF(*MI)) - return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset); + return !TII->isLegalMUBUFImmOffset(FullOffset); - const SIInstrInfo *TII = ST.getInstrInfo(); return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch); } @@ -897,8 +905,7 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, assert(SOffset->isImm() && SOffset->getImm() == 0); #endif - assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) && - "offset should be legal"); + assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal"); FIOp->ChangeToRegister(BaseReg, false); OffsetOp->setImm(NewOffset); @@ -912,10 +919,10 @@ bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, int64_t NewOffset = Offset + getScratchInstrOffset(MI); + const SIInstrInfo *TII = ST.getInstrInfo(); if (SIInstrInfo::isMUBUF(*MI)) - return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset); + return TII->isLegalMUBUFImmOffset(NewOffset); - const SIInstrInfo *TII = ST.getInstrInfo(); return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch); } @@ -1068,6 +1075,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_AV32_RESTORE: case AMDGPU::SI_SPILL_WWM_V32_SAVE: case AMDGPU::SI_SPILL_WWM_V32_RESTORE: + case AMDGPU::SI_SPILL_WWM_AV32_SAVE: + case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: return 1; default: llvm_unreachable("Invalid spill opcode"); } @@ -1310,8 +1319,8 @@ void SIRegisterInfo::buildSpillLoadStore( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill, MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO, - RegScavenger *RS, LivePhysRegs *LiveRegs) const { - assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both"); + RegScavenger *RS, LiveRegUnits *LiveUnits) const { + assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both"); MachineFunction *MF = MBB.getParent(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -1394,12 +1403,12 @@ void SIRegisterInfo::buildSpillLoadStore( bool IsOffsetLegal = IsFlat ? 
TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch) - : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset); + : TII->isLegalMUBUFImmOffset(MaxOffset); if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) { SOffset = MCRegister(); // We don't have access to the register scavenger if this function is called - // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case. + // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case. // TODO: Clobbering SCC is not necessary for scratch instructions in the // entry. if (RS) { @@ -1407,10 +1416,10 @@ void SIRegisterInfo::buildSpillLoadStore( // Piggy back on the liveness scan we just did see if SCC is dead. CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC); - } else if (LiveRegs) { - CanClobberSCC = !LiveRegs->contains(AMDGPU::SCC); + } else if (LiveUnits) { + CanClobberSCC = LiveUnits->available(AMDGPU::SCC); for (MCRegister Reg : AMDGPU::SGPR_32RegClass) { - if (LiveRegs->available(MF->getRegInfo(), Reg)) { + if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) { SOffset = Reg; break; } @@ -1426,9 +1435,9 @@ void SIRegisterInfo::buildSpillLoadStore( if (RS) { TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0); } else { - assert(LiveRegs); + assert(LiveUnits); for (MCRegister Reg : AMDGPU::VGPR_32RegClass) { - if (LiveRegs->available(MF->getRegInfo(), Reg)) { + if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) { TmpOffsetVGPR = Reg; break; } @@ -1639,7 +1648,7 @@ void SIRegisterInfo::buildSpillLoadStore( if (UseVGPROffset && ScratchOffsetReg) { MIB.addReg(ScratchOffsetReg); } else { - assert(FuncInfo->isEntryFunction()); + assert(FuncInfo->isBottomOfStack()); MIB.addImm(0); } } @@ -1736,10 +1745,13 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, SlotIndexes *Indexes, - LiveIntervals *LIS, bool OnlyToVGPR) const { + LiveIntervals *LIS, bool OnlyToVGPR, + bool SpillToPhysVGPRLane) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); + ArrayRef<SpilledReg> VGPRSpills = + SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) + : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; @@ -1767,7 +1779,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, // Mark the "old value of vgpr" input undef only if this is the first sgpr // spill to this specific vgpr in the first basic block. 
auto MIB = BuildMI(*SB.MBB, MI, SB.DL, - SB.TII.get(AMDGPU::V_WRITELANE_B32), Spill.VGPR) + SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR) .addReg(SubReg, getKillRegState(UseKill)) .addImm(Spill.Lane) .addReg(Spill.VGPR); @@ -1813,8 +1825,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); MachineInstrBuilder WriteLane = - BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32), - SB.TmpVGPR) + BuildMI(*SB.MBB, MI, SB.DL, + SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR) .addReg(SubReg, SubKillState) .addImm(i % PVD.PerVGPR) .addReg(SB.TmpVGPR, TmpVGPRFlags); @@ -1856,10 +1868,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, SlotIndexes *Indexes, - LiveIntervals *LIS, bool OnlyToVGPR) const { + LiveIntervals *LIS, bool OnlyToVGPR, + bool SpillToPhysVGPRLane) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); + ArrayRef<SpilledReg> VGPRSpills = + SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) + : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; @@ -1872,8 +1887,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); SpilledReg Spill = VGPRSpills[i]; - auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), - SubReg) + auto MIB = BuildMI(*SB.MBB, MI, SB.DL, + SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) .addReg(Spill.VGPR) .addImm(Spill.Lane); if (SB.NumSubRegs > 1 && i == 0) @@ -1906,7 +1921,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, bool LastSubReg = (i + 1 == e); auto MIB = BuildMI(*SB.MBB, MI, SB.DL, - SB.TII.get(AMDGPU::V_READLANE_B32), SubReg) + SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) .addReg(SB.TmpVGPR, getKillRegState(LastSubReg)) .addImm(i); if (SB.NumSubRegs > 1 && i == 0) @@ -2005,7 +2020,7 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, /// handled. 
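The direct V_WRITELANE/V_READLANE emission above is replaced by SI_SPILL_S32_TO_VGPR and SI_RESTORE_S32_FROM_VGPR pseudos, apparently so later passes (for example the SIPreAllocateWWMRegs change in this same commit) can recognize the spill lanes; semantically a lane spill still behaves like this standalone model (wave size and lane index are illustrative):

// A VGPR viewed as one 32-bit value per lane; each spilled SGPR occupies
// exactly one lane of the spill VGPR.
#include <array>
#include <cassert>
#include <cstdint>

using VGPR = std::array<uint32_t, 32>;

static void spillSGPRToLane(VGPR &Spill, unsigned Lane, uint32_t SGPRVal) {
  Spill[Lane] = SGPRVal;            // v_writelane_b32 vN, sM, Lane
}

static uint32_t restoreSGPRFromLane(const VGPR &Spill, unsigned Lane) {
  return Spill[Lane];               // v_readlane_b32 sM, vN, Lane
}

int main() {
  VGPR SpillVGPR{};
  spillSGPRToLane(SpillVGPR, 5, 0xdeadbeefu);
  assert(restoreSGPRFromLane(SpillVGPR, 5) == 0xdeadbeefu);
  return 0;
}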
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, - SlotIndexes *Indexes, LiveIntervals *LIS) const { + SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const { switch (MI->getOpcode()) { case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: @@ -2021,7 +2036,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: - return spillSGPR(MI, FI, RS, Indexes, LIS, true); + return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S384_RESTORE: @@ -2036,7 +2051,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: - return restoreSGPR(MI, FI, RS, Indexes, LIS, true); + return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); default: llvm_unreachable("not an SGPR spill instruction"); } @@ -2141,7 +2156,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_AV96_SAVE: case AMDGPU::SI_SPILL_AV64_SAVE: case AMDGPU::SI_SPILL_AV32_SAVE: - case AMDGPU::SI_SPILL_WWM_V32_SAVE: { + case AMDGPU::SI_SPILL_WWM_V32_SAVE: + case AMDGPU::SI_SPILL_WWM_AV32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -2208,7 +2224,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_AV384_RESTORE: case AMDGPU::SI_SPILL_AV512_RESTORE: case AMDGPU::SI_SPILL_AV1024_RESTORE: - case AMDGPU::SI_SPILL_WWM_V32_RESTORE: { + case AMDGPU::SI_SPILL_WWM_V32_RESTORE: + case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -2406,7 +2423,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, bool IsMUBUF = TII->isMUBUF(*MI); - if (!IsMUBUF && !MFI->isEntryFunction()) { + if (!IsMUBUF && !MFI->isBottomOfStack()) { // Convert to a swizzled stack address by scaling by the wave size. // In an entry function/kernel the offset is already swizzled. bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); @@ -2425,10 +2442,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (Offset == 0) { unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64; - // XXX - This never happens because of emergency scavenging slot at 0? - auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg) - .addImm(ST.getWavefrontSizeLog2()) - .addReg(FrameReg); + auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg); + if (OpCode == AMDGPU::V_LSHRREV_B32_e64) + // For V_LSHRREV, the operands are reversed (the shift count goes + // first). + Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg); + else + Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2()); if (IsSALU && !LiveSCC) Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead. 
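        // (For the shift just above: both opcodes compute
        // FrameReg >> ST.getWavefrontSizeLog2(); V_LSHRREV_B32_e64 takes the
        // shift amount as src0 and the value as src1, whereas S_LSHR_B32
        // takes the value first.)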
if (IsSALU && LiveSCC) { @@ -2541,7 +2561,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); int64_t NewOffset = OldImm + Offset; - if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) && + if (TII->isLegalMUBUFImmOffset(NewOffset) && buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) { MI->eraseFromParent(); return true; @@ -2568,6 +2588,10 @@ StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { return AMDGPUInstPrinter::getRegisterName(Reg); } +unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) { + return getRegBitWidth(RC.getID()); +} + static const TargetRegisterClass * getAnyVGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth == 64) @@ -3059,7 +3083,8 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, const RegisterBank &RB) const { switch (RB.getID()) { case AMDGPU::VGPRRegBankID: - return getVGPRClassForBitWidth(std::max(32u, Size)); + return getVGPRClassForBitWidth( + std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size)); case AMDGPU::VCCRegBankID: assert(Size == 1); return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 2120b47c581e..88d568672098 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -14,6 +14,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H #define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H +#include "llvm/ADT/BitVector.h" + #define GET_REGINFO_HEADER #include "AMDGPUGenRegisterInfo.inc" @@ -23,7 +25,7 @@ namespace llvm { class GCNSubtarget; class LiveIntervals; -class LivePhysRegs; +class LiveRegUnits; class RegisterBank; struct SGPRSpillBuilder; @@ -90,6 +92,11 @@ public: CallingConv::ID) const override; const uint32_t *getNoPreservedMask() const override; + // Functions with the amdgpu_cs_chain or amdgpu_cs_chain_preserve calling + // conventions are free to use certain VGPRs without saving and restoring any + // lanes (not even inactive ones). + static bool isChainScratchRegister(Register VGPR); + // Stack access is very expensive. CSRs are also the high registers, and we // want to minimize the number of used registers. unsigned getCSRFirstUseCost() const override { @@ -142,31 +149,30 @@ public: void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset, bool IsLoad, bool IsKill = true) const; - /// If \p OnlyToVGPR is true, this will only succeed if this + /// If \p OnlyToVGPR is true, this will only succeed if this manages to find a + /// free VGPR lane to spill. 
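  /// If \p SpillToPhysVGPRLane is true, the lanes returned by
  /// getSGPRSpillToPhysicalVGPRLanes() are used; otherwise the spill goes to
  /// the virtual VGPR lanes from getSGPRSpillToVirtualVGPRLanes().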
bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, - bool OnlyToVGPR = false) const; + bool OnlyToVGPR = false, + bool SpillToPhysVGPRLane = false) const; bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, - bool OnlyToVGPR = false) const; + bool OnlyToVGPR = false, + bool SpillToPhysVGPRLane = false) const; bool spillEmergencySGPR(MachineBasicBlock::iterator MI, MachineBasicBlock &RestoreMBB, Register SGPR, RegScavenger *RS) const; - bool supportsBackwardScavenger() const override { - return true; - } - bool eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override; - bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, - int FI, RegScavenger *RS, - SlotIndexes *Indexes = nullptr, - LiveIntervals *LIS = nullptr) const; + bool eliminateSGPRToVGPRSpillFrameIndex( + MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, + SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, + bool SpillToPhysVGPRLane = false) const; StringRef getRegAsmName(MCRegister Reg) const override; @@ -416,14 +422,14 @@ public: // Insert spill or restore instructions. // When lowering spill pseudos, the RegScavenger should be set. // For creating spill instructions during frame lowering, where no scavenger - // is available, LiveRegs can be used. + // is available, LiveUnits can be used. void buildSpillLoadStore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned LoadStoreOp, int Index, Register ValueReg, bool ValueIsKill, MCRegister ScratchOffsetReg, int64_t InstrOffset, MachineMemOperand *MMO, RegScavenger *RS, - LivePhysRegs *LiveRegs = nullptr) const; + LiveRegUnits *LiveUnits = nullptr) const; // Return alignment in register file of first register in a register tuple. unsigned getRegClassAlignmentNumBits(const TargetRegisterClass *RC) const { @@ -445,6 +451,11 @@ public: unsigned SubReg) const; }; +namespace AMDGPU { +/// Get the size in bits of a register from the register class \p RC. +unsigned getRegBitWidth(const TargetRegisterClass &RC); +} // namespace AMDGPU + } // End namespace llvm #endif diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index b2b1b458a63a..981da13fe089 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -122,10 +122,18 @@ class SIRegisterTuples<list<SubRegIndex> Indices, RegisterClass RC, //===----------------------------------------------------------------------===// // Declarations that describe the SI registers //===----------------------------------------------------------------------===// -class SIReg <string n, bits<16> regIdx = 0> : - Register<n> { +class SIReg <string n, bits<8> regIdx = 0, bit isAGPROrVGPR = 0, + bit isHi = 0> : Register<n> { let Namespace = "AMDGPU"; - let HWEncoding = regIdx; + + // These are generic helper values we use to form actual register + // codes. They should not be assumed to match any particular register + // encodings on any particular subtargets. + let HWEncoding{7-0} = regIdx; + let HWEncoding{8} = isAGPROrVGPR; + let HWEncoding{9} = isHi; + + int Index = !cast<int>(regIdx); } // For register classes that use TSFlags. 
@@ -148,28 +156,22 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> let TSFlags{4} = HasSGPR; } -multiclass SIRegLoHi16 <string n, bits<16> regIdx, bit ArtificialHigh = 1, - bit HWEncodingHigh = 0> { - // There is no special encoding for 16 bit subregs, these are not real - // registers but rather operands for instructions preserving other 16 bits - // of the result or reading just 16 bits of a 32 bit VGPR. - // It is encoded as a corresponding 32 bit register. - // Non-VGPR register classes use it as we need to have matching subregisters - // to move instructions and data between ALUs. - def _LO16 : SIReg<n#".l", regIdx> { - let HWEncoding{8} = HWEncodingHigh; - } - def _HI16 : SIReg<!if(ArtificialHigh, "", n#".h"), regIdx> { +multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1, + bit isAGPROrVGPR = 0> { + def _LO16 : SIReg<n#".l", regIdx, isAGPROrVGPR>; + def _HI16 : SIReg<!if(ArtificialHigh, "", n#".h"), regIdx, isAGPROrVGPR, + /* isHi */ 1> { let isArtificial = ArtificialHigh; - let HWEncoding{8} = HWEncodingHigh; } def "" : RegisterWithSubRegs<n, [!cast<Register>(NAME#"_LO16"), !cast<Register>(NAME#"_HI16")]> { let Namespace = "AMDGPU"; let SubRegIndices = [lo16, hi16]; let CoveredBySubRegs = !not(ArtificialHigh); - let HWEncoding = regIdx; - let HWEncoding{8} = HWEncodingHigh; + let HWEncoding{7-0} = regIdx; + let HWEncoding{8} = isAGPROrVGPR; + + int Index = !cast<int>(regIdx); } } @@ -247,7 +249,7 @@ def SGPR_NULL64 : // the high 32 bits. The lower 32 bits are always zero (for base) or // -1 (for limit). Since we cannot access the high 32 bits, when we // need them, we need to do a 64 bit load and extract the bits manually. -multiclass ApertureRegister<string name, bits<16> regIdx> { +multiclass ApertureRegister<string name, bits<8> regIdx> { let isConstant = true in { // FIXME: We shouldn't need to define subregisters for these (nor add them to any 16 bit // register classes), but if we don't it seems to confuse the TableGen @@ -315,7 +317,7 @@ foreach Index = 0...15 in { defm TTMP#Index : SIRegLoHi16<"ttmp"#Index, 0>; } -multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { +multiclass FLAT_SCR_LOHI_m <string n, bits<8> ci_e, bits<8> vi_e> { defm _ci : SIRegLoHi16<n, ci_e>; defm _vi : SIRegLoHi16<n, vi_e>; defm "" : SIRegLoHi16<n, 0>; @@ -412,7 +414,7 @@ def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, // SGPR 64-bit registers def SGPR_64Regs : SIRegisterTuples<getSubRegs<2>.ret, SGPR_32, 105, 2, 2, "s">; -// SGPR 96-bit registers. No operations use these, but for symmetry with 96-bit VGPRs. +// SGPR 96-bit registers. def SGPR_96Regs : SIRegisterTuples<getSubRegs<3>.ret, SGPR_32, 105, 4, 3, "s">; // SGPR 128-bit registers @@ -591,7 +593,6 @@ def VGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, let AllocationPriority = 0; let Size = 16; let GeneratePressureSet = 0; - let BaseClassOrder = 16; } def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, @@ -599,9 +600,34 @@ def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, let AllocationPriority = 0; let Size = 16; let GeneratePressureSet = 0; +} + +// VOP3 and VINTERP can access 256 lo and 256 hi registers. +def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, + (add (interleave (sequence "VGPR%u_LO16", 0, 255), + (sequence "VGPR%u_HI16", 0, 255)))> { + let AllocationPriority = 2; + let Size = 16; + let GeneratePressureSet = 0; + + // This is the base class for VGPR{128..255}_{LO16,HI16}. 
let BaseClassOrder = 17; } +// VOP1/2/C can access the First 128 lo and 128 hi registers. +// The order of registers in the class determines order of allocation, so it is +// important to interleave lo and hi registers. +def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, + (add (interleave (sequence "VGPR%u_LO16", 0, 127), + (sequence "VGPR%u_HI16", 0, 127)))> { + let Size = 16; + let GeneratePressureSet = 0; + let isAllocatable = 0; + + // This is the base class for VGPR{0..127}_{LO16,HI16}. + let BaseClassOrder = 16; +} + // VGPR 32-bit registers // i16/f16 only on VI+ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, @@ -904,7 +930,7 @@ defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>; defm "" : SRegClass<12, [v12i32, v12f32], SGPR_384Regs, TTMP_384Regs>; let GlobalPriority = true in { -defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>; +defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512Regs, TTMP_512Regs>; defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; } @@ -958,7 +984,7 @@ defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>; defm VReg_384 : VRegClass<12, [v12i32, v12f32], (add VGPR_384)>; let GlobalPriority = true in { -defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>; +defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], (add VGPR_512)>; defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; } @@ -1008,6 +1034,18 @@ def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add)> { let HasVGPR = 1; } +def VS_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, + (add VGPR_16, SReg_32, LDS_DIRECT_CLASS)> { + let isAllocatable = 0; + let HasVGPR = 1; +} + +def VS_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, + (add VGPR_16_Lo128, SReg_32, LDS_DIRECT_CLASS)> { + let isAllocatable = 0; + let HasVGPR = 1; +} + def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; @@ -1094,6 +1132,30 @@ class RegOrF16 <string RegisterClass, string OperandTypePrefix> : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16", !subst("_f16", "F16", NAME), "_Imm16">; +class RegOrB16T <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT16", + !subst("_b16", "B16", NAME), "_Imm16"> { + let EncoderMethod = "getMachineOpValueT16"; +} + +class RegOrF16T <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16", + !subst("_f16", "F16", NAME), "_Imm16"> { + let EncoderMethod = "getMachineOpValueT16"; +} + +class RegOrB16_Lo128T <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT16", + !subst("_b16_Lo128", "B16_Lo128", NAME), "_Imm16"> { + let EncoderMethod = "getMachineOpValueT16Lo128"; +} + +class RegOrF16_Lo128T <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16", + !subst("_f16_Lo128", "F16_Lo128", NAME), "_Imm16"> { + let EncoderMethod = "getMachineOpValueT16Lo128"; +} + class RegOrB32 <string RegisterClass, string OperandTypePrefix> : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT32", !subst("_b32", "B32", NAME), "_Imm32">; @@ -1149,10 +1211,13 @@ class RegOrF16_Lo128_Deferred <string RegisterClass, : 
RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16_DEFERRED", !subst("_f16_Lo128_Deferred", "F16_Lo128", NAME), "_Deferred_Imm16">; + //===----------------------------------------------------------------------===// // SSrc_* Operands with an SGPR or a 32-bit immediate //===----------------------------------------------------------------------===// +def SSrc_b16 : RegOrB16 <"SReg_32", "OPERAND_REG_IMM">; +def SSrc_f16 : RegOrF16 <"SReg_32", "OPERAND_REG_IMM">; def SSrc_b32 : RegOrB32 <"SReg_32", "OPERAND_REG_IMM">; def SSrc_f32 : RegOrF32 <"SReg_32", "OPERAND_REG_IMM">; def SSrc_b64 : RegOrB64 <"SReg_64", "OPERAND_REG_IMM">; @@ -1160,6 +1225,13 @@ def SSrc_b64 : RegOrB64 <"SReg_64", "OPERAND_REG_IMM">; def SSrcOrLds_b32 : RegOrB32 <"SRegOrLds_32", "OPERAND_REG_IMM">; //===----------------------------------------------------------------------===// +// SSrc_32_Deferred Operands with an SGPR or a 32-bit immediate for use with +// FMAMK/FMAAK +//===----------------------------------------------------------------------===// + +def SSrc_f32_Deferred : RegOrF32_Deferred<"SReg_32", "OPERAND_REG_IMM">; + +//===----------------------------------------------------------------------===// // SCSrc_* Operands with an SGPR or a inline constant //===----------------------------------------------------------------------===// @@ -1170,20 +1242,41 @@ def SCSrc_b64 : RegOrB64 <"SReg_64", "OPERAND_REG_INLINE_C">; // VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate //===----------------------------------------------------------------------===// +// The current and temporary future default used case for VOP3. def VSrc_b16 : RegOrB16 <"VS_32", "OPERAND_REG_IMM">; def VSrc_f16 : RegOrF16 <"VS_32", "OPERAND_REG_IMM">; + +// True16 VOP3 operands. +def VSrcT_b16 : RegOrB16T <"VS_16", "OPERAND_REG_IMM"> { + let DecoderMethod = "decodeOperand_VSrcT16"; +} +def VSrcT_f16 : RegOrF16T <"VS_16", "OPERAND_REG_IMM"> { + let DecoderMethod = "decodeOperand_VSrcT16"; +} + +// True16 VOP1/2/C operands. +def VSrcT_b16_Lo128 : RegOrB16_Lo128T <"VS_16_Lo128", "OPERAND_REG_IMM"> { + let DecoderMethod = "decodeOperand_VSrcT16_Lo128"; +} +def VSrcT_f16_Lo128 : RegOrF16_Lo128T <"VS_16_Lo128", "OPERAND_REG_IMM"> { + let DecoderMethod = "decodeOperand_VSrcT16_Lo128"; +} + +// The current and temporary future default used case for fake VOP1/2/C. 
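// (The VSrcT_* operands above are the true16 forms, backed by the 16-bit
// VS_16 / VS_16_Lo128 classes and the T16 encoder/decoder hooks; the
// VSrcFake16_* operands below keep using the 32-bit VS_32_Lo128 class.)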
+def VSrcFake16_b16_Lo128 : RegOrB16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">; +def VSrcFake16_f16_Lo128 : RegOrF16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">; + def VSrc_b32 : RegOrB32 <"VS_32", "OPERAND_REG_IMM">; def VSrc_f32 : RegOrF32 <"VS_32", "OPERAND_REG_IMM">; def VSrc_v2b16 : RegOrV2B16 <"VS_32", "OPERAND_REG_IMM">; def VSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_IMM">; def VSrc_b64 : RegOrB64 <"VS_64", "OPERAND_REG_IMM">; -def VSrc_f64 : RegOrF64 <"VS_64", "OPERAND_REG_IMM">; +def VSrc_f64 : RegOrF64 <"VS_64", "OPERAND_REG_IMM"> { + let DecoderMethod = "decodeOperand_VSrc_f64"; +} def VSrc_v2b32 : RegOrV2B32 <"VS_64", "OPERAND_REG_IMM">; def VSrc_v2f32 : RegOrV2F32 <"VS_64", "OPERAND_REG_IMM">; -def VSrcT_b16_Lo128 : RegOrB16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">; -def VSrcT_f16_Lo128 : RegOrF16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">; - //===----------------------------------------------------------------------===// // VSrc_*_Deferred Operands with an SGPR, VGPR or a 32-bit immediate for use // with FMAMK/FMAAK @@ -1192,8 +1285,8 @@ def VSrcT_f16_Lo128 : RegOrF16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">; def VSrc_f16_Deferred : RegOrF16_Deferred<"VS_32", "OPERAND_REG_IMM">; def VSrc_f32_Deferred : RegOrF32_Deferred<"VS_32", "OPERAND_REG_IMM">; -def VSrcT_f16_Lo128_Deferred : RegOrF16_Lo128_Deferred<"VS_32_Lo128", - "OPERAND_REG_IMM">; +def VSrcFake16_f16_Lo128_Deferred : RegOrF16_Lo128_Deferred<"VS_32_Lo128", + "OPERAND_REG_IMM">; //===----------------------------------------------------------------------===// // VRegSrc_* Operands with a VGPR @@ -1233,6 +1326,11 @@ def VGPRSrc_32_Lo128 : RegisterOperand<VGPR_32_Lo128> { let DecoderMethod = "DecodeVGPR_32RegisterClass"; } +def VGPRSrc_16_Lo128 : RegisterOperand<VGPR_16_Lo128> { + let DecoderMethod = "DecodeVGPR_16_Lo128RegisterClass"; + let EncoderMethod = "getMachineOpValueT16Lo128"; +} + //===----------------------------------------------------------------------===// // ASrc_* Operands with an AccVGPR //===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td index 53441b5a4ced..b0e8e4112254 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td @@ -65,6 +65,12 @@ def Write16PassMAI : SchedWrite; def Write4PassDGEMM : SchedWrite; def Write8PassDGEMM : SchedWrite; +// Scalar float instructions +def WriteSFPU : SchedWrite; + +// F16 or F32 pseudo scalar transcendental instructions +def WritePseudoScalarTrans : SchedWrite; + // FIXME: Should there be a class for instructions which are VALU // instructions and have VALU rates, but write to the SALU (i.e. VOPC // instructions) @@ -90,6 +96,7 @@ def SIDPFullSpeedModel : SISchedMachineModel; def SIDPGFX940FullSpeedModel : SISchedMachineModel; def GFX10SpeedModel : SISchedMachineModel; def GFX11SpeedModel : SISchedMachineModel; +def GFX12SpeedModel : SISchedMachineModel; // XXX: Are the resource counts correct? 
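// WriteSFPU models the new scalar floating-point ALU instructions and
// WritePseudoScalarTrans the F16/F32 pseudo scalar transcendentals; each
// scheduling model below either gives them a HWWriteRes or marks them with
// UnsupportedWriteRes (GFX11 implements WriteSFPU only; GFX12 implements both).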
def HWBranch : ProcResource<1> { @@ -128,6 +135,10 @@ class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources, class HWVALUWriteRes<SchedWrite write, int latency> : HWWriteRes<write, [HWVALU], latency>; +class UnsupportedWriteRes<SchedWrite write> : WriteRes<write, []> { + let Unsupported = 1; +} + def PredMIReadVGPR : SchedPredicate<[{TII->hasVGPRUses(*MI)}]>; def MIReadVGPR : SchedReadVariant<[ @@ -157,14 +168,17 @@ multiclass SICommonWriteRes { def : HWVALUWriteRes<Write4PassDGEMM, 4>; def : HWVALUWriteRes<Write8PassDGEMM, 16>; - let ResourceCycles = [2] in + let ReleaseAtCycles = [2] in def : HWWriteRes<Write2PassMAI, [HWXDL], 2>; - let ResourceCycles = [4] in + let ReleaseAtCycles = [4] in def : HWWriteRes<Write4PassMAI, [HWXDL], 4>; - let ResourceCycles = [8] in + let ReleaseAtCycles = [8] in def : HWWriteRes<Write8PassMAI, [HWXDL], 8>; - let ResourceCycles = [16] in + let ReleaseAtCycles = [16] in def : HWWriteRes<Write16PassMAI, [HWXDL], 16>; + + def : UnsupportedWriteRes<WriteSFPU>; + def : UnsupportedWriteRes<WritePseudoScalarTrans>; } // End RetireOOO = 1 def : ReadAdvance<MIVGPRRead, -2>; @@ -307,6 +321,9 @@ def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>; def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>; def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>; def : HWWriteRes<WriteBarrier, [HWBranch], 2000>; + +def : UnsupportedWriteRes<WriteSFPU>; +def : UnsupportedWriteRes<WritePseudoScalarTrans>; } // End RetireOOO = 1 def : InstRW<[WriteCopy], (instrs COPY)>; @@ -315,26 +332,61 @@ def : InstRW<[WriteCopy], (instrs COPY)>; let SchedModel = GFX11SpeedModel in { +// The latency values are 1 / (operations / cycle). +// Add 1 stall cycle for VGPR read. +let RetireOOO = 1 in { // llvm-mca specific flag def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>; def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>; def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>; -def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>; +def : HWWriteRes<WriteTrans32, [HWTransVALU, HWRC], 10>; def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>; def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>; def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 38>; def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 38>; def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 38>; def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>; -def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 40>; +def : HWWriteRes<WriteTrans64, [HWVALU, HWTransVALU, HWRC], 40>; def : HWWriteRes<WriteBranch, [HWBranch], 32>; def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>; def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>; def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>; +def : HWWriteRes<WriteSFPU, [HWSALU, HWRC], 4>; def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>; def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>; def : HWWriteRes<WriteBarrier, [HWBranch], 2000>; +} // End RetireOOO = 1 + +def : UnsupportedWriteRes<WritePseudoScalarTrans>; def : InstRW<[WriteCopy], (instrs COPY)>; } // End SchedModel = GFX11SpeedModel + +let SchedModel = GFX12SpeedModel in { + +def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>; +def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>; +def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>; +def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>; +def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>; +def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>; +def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 38>; +def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 38>; +def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 38>; +def : HWWriteRes<WriteIntMul, 
[HWVALU, HWRC], 8>; +def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 40>; +def : HWWriteRes<WritePseudoScalarTrans, [HWVALU, HWRC], 7>; + +def : HWWriteRes<WriteBranch, [HWBranch], 32>; +def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>; +def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>; +def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>; +def : HWWriteRes<WriteSFPU, [HWSALU, HWRC], 4>; +def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>; +def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>; +def : HWWriteRes<WriteBarrier, [HWBranch], 2000>; + +def : InstRW<[WriteCopy], (instrs COPY)>; + +} // End SchedModel = GFX12SpeedModel diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 4159dc694c1e..d290dd82b760 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -104,8 +104,7 @@ bool SIShrinkInstructions::foldImmediates(MachineInstr &MI, bool ConstantFolded = false; if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) { - if (MovSrc.isImm() && - (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) { + if (MovSrc.isImm()) { Src0.ChangeToImmediate(MovSrc.getImm()); ConstantFolded = true; } else if (MovSrc.isFI()) { @@ -160,7 +159,7 @@ bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const { } bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const { - return isInt<16>(Src.getImm()) && + return isInt<16>(SignExtend64(Src.getImm(), 32)) && !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo()); } @@ -171,7 +170,7 @@ bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const { bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const { - if (isInt<16>(Src.getImm())) { + if (isInt<16>(SignExtend64(Src.getImm(), 32))) { IsUnsigned = false; return !TII->isInlineConstant(Src); } @@ -212,6 +211,9 @@ void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI, } void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { + if (!ST->hasSCmpK()) + return; + // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to // get constants on the RHS. if (!MI.getOperand(0).isReg()) @@ -222,7 +224,7 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { if (!Src0.isReg()) return; - const MachineOperand &Src1 = MI.getOperand(1); + MachineOperand &Src1 = MI.getOperand(1); if (!Src1.isImm()) return; @@ -238,6 +240,7 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { if (!HasUImm) { SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ? AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32; + Src1.setImm(SignExtend32(Src1.getImm(), 32)); } MI.setDesc(TII->get(SOPKOpc)); @@ -250,6 +253,8 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) || (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) { + if (!TII->sopkIsZext(SOPKOpc)) + Src1.setImm(SignExtend64(Src1.getImm(), 32)); MI.setDesc(NewDesc); } } @@ -839,6 +844,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? 
AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; + Src1->setImm(SignExtend64(Src1->getImm(), 32)); MI.setDesc(TII->get(Opc)); MI.tieOperands(0, 1); } @@ -858,9 +864,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (Src.isImm() && Dst.getReg().isPhysical()) { int32_t ReverseImm; - if (isKImmOperand(Src)) + if (isKImmOperand(Src)) { MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); - else if (isReverseInlineImm(Src, ReverseImm)) { + Src.setImm(SignExtend64(Src.getImm(), 32)); + } else if (isReverseInlineImm(Src, ReverseImm)) { MI.setDesc(TII->get(AMDGPU::S_BREV_B32)); Src.setImm(ReverseImm); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 3143d437e370..59d6ccf513bb 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -238,9 +238,7 @@ public: AU.addRequired<LiveIntervals>(); AU.addPreserved<SlotIndexes>(); AU.addPreserved<LiveIntervals>(); - AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); - AU.addRequired<MachinePostDominatorTree>(); AU.addPreserved<MachinePostDominatorTree>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -1320,7 +1318,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { auto II = MBB.getFirstNonPHI(), IE = MBB.end(); if (IsEntry) { // Skip the instruction that saves LiveMask - if (II != IE && II->getOpcode() == AMDGPU::COPY) + if (II != IE && II->getOpcode() == AMDGPU::COPY && + II->getOperand(1).getReg() == TRI->getExec()) ++II; } @@ -1594,8 +1593,8 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); - MDT = &getAnalysis<MachineDominatorTree>(); - PDT = &getAnalysis<MachinePostDominatorTree>(); + MDT = getAnalysisIfAvailable<MachineDominatorTree>(); + PDT = getAnalysisIfAvailable<MachinePostDominatorTree>(); if (ST->isWave32()) { AndOpc = AMDGPU::S_AND_B32; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td index 7ca685a0cc5d..3297847b0360 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -74,7 +74,7 @@ class SM_Real <SM_Pseudo ps, string opName = ps.Mnemonic> bits<7> sdst; bits<32> offset; bits<8> soffset; - bits<5> cpol; + bits<5> cpol; } class OffsetMode<bit hasOffset, bit hasSOffset, string variant, @@ -211,6 +211,23 @@ class SM_WaveId_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo< let has_sbase = 0; } +class SM_Prefetch_Pseudo <string opName, RegisterClass baseClass, bit hasSBase> + : SM_Pseudo<opName, (outs), !con(!if(hasSBase, (ins baseClass:$sbase), (ins)), + (ins smem_offset:$offset, SReg_32:$soffset, i8imm:$sdata)), + !if(hasSBase, " $sbase,", "") # " $offset, $soffset, $sdata"> { + // Mark prefetches as both load and store to prevent reordering with loads + // and stores. This is also needed for pattern to match prefetch intrinsic. 
+ let mayLoad = 1; + let mayStore = 1; + let has_glc = 0; + let LGKM_CNT = 0; + let has_sbase = hasSBase; + let ScalarStore = 0; + let has_offset = 1; + let has_soffset = 1; + let PseudoInstr = opName; +} + //===----------------------------------------------------------------------===// // Scalar Atomic Memory Classes //===----------------------------------------------------------------------===// @@ -234,8 +251,6 @@ class SM_Atomic_Pseudo <string opName, let IsAtomicNoRet = !not(isRet); let IsAtomicRet = isRet; - - let AsmMatchConverter = "cvtSMEMAtomic"; } class SM_Pseudo_Atomic<string opName, @@ -245,7 +260,7 @@ class SM_Pseudo_Atomic<string opName, bit isRet, string opNameWithSuffix = opName # offsets.Variant # !if(isRet, "_RTN", ""), - Operand CPolTy = !if(isRet, CPol_GLC1, CPol)> : + Operand CPolTy = !if(isRet, CPol_GLC, CPol_NonGLC)> : SM_Atomic_Pseudo<opName, !if(isRet, (outs dataClass:$sdst), (outs)), !con((ins dataClass:$sdata, baseClass:$sbase), offsets.Ins, @@ -285,6 +300,8 @@ multiclass SM_Pseudo_Atomics<RegisterClass baseClass, // does sdst for SMRD on SI/CI? defm S_LOAD_DWORD : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>; defm S_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_64, SReg_64_XEXEC>; +let SubtargetPredicate = HasScalarDwordx3Loads in + defm S_LOAD_DWORDX3 : SM_Pseudo_Loads <SReg_64, SReg_96>; defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_64, SReg_128>; defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_64, SReg_256>; defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_64, SReg_512>; @@ -294,6 +311,8 @@ defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; // FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on // SI/CI, bit disallowed for SMEM on VI. defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_128, SReg_64_XEXEC>; +let SubtargetPredicate = HasScalarDwordx3Loads in + defm S_BUFFER_LOAD_DWORDX3 : SM_Pseudo_Loads <SReg_128, SReg_96>; defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128, SReg_128>; defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128, SReg_256>; defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128, SReg_512>; @@ -417,6 +436,16 @@ defm S_DCACHE_DISCARD : SM_Pseudo_Discards; defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards; } +let SubtargetPredicate = isGFX12Plus in { +def S_PREFETCH_INST : SM_Prefetch_Pseudo <"s_prefetch_inst", SReg_64, 1>; +def S_PREFETCH_INST_PC_REL : SM_Prefetch_Pseudo <"s_prefetch_inst_pc_rel", SReg_64, 0>; +def S_PREFETCH_DATA : SM_Prefetch_Pseudo <"s_prefetch_data", SReg_64, 1>; +def S_PREFETCH_DATA_PC_REL : SM_Prefetch_Pseudo <"s_prefetch_data_pc_rel", SReg_64, 0>; +def S_BUFFER_PREFETCH_DATA : SM_Prefetch_Pseudo <"s_buffer_prefetch_data", SReg_128, 1> { + let is_buffer = 1; +} +} // end let SubtargetPredicate = isGFX12Plus + //===----------------------------------------------------------------------===// // Targets //===----------------------------------------------------------------------===// @@ -789,6 +818,14 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL }]; } +def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type), + (prefetch node:$ptr, node:$rw, node:$loc, node:$type), + [{ return !N->getOperand(1)->isDivergent();}]> { + let GISelPredicateCode = [{ + return isInstrUniform(MI); + }]; +} + def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">; def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">; def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">; @@ -797,7 +834,7 @@ def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">; 
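// smrd_prefetch above only matches prefetches with a uniform (non-divergent)
// address, so they can select to the scalar S_PREFETCH_* instructions; the
// GISelPredicateCode applies the same restriction for GlobalISel. For
// illustration only (not part of this patch): a call such as
//   call void @llvm.prefetch.p0(ptr %p, i32 0, i32 3, i32 1)
// with a uniform %p is a data-prefetch candidate, since cache-type operand 1
// maps to S_PREFETCH_DATA in the SMPrefetchPat patterns below.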
def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">; def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">; -multiclass SMRD_Pattern <string Instr, ValueType vt> { +multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> { // 1. IMM offset def : GCNPat < @@ -806,7 +843,7 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> { >; // 2. 32-bit IMM offset on CI - def : GCNPat < + if immci then def : GCNPat < (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> { let OtherPredicates = [isGFX7Only]; @@ -838,7 +875,7 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> { >; } -multiclass SMLoad_Pattern <string Instr, ValueType vt> { +multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> { // 1. Offset as an immediate def : GCNPat < (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy), @@ -847,7 +884,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> { } // 2. 32-bit IMM offset on CI - def : GCNPat < + if immci then def : GCNPat < (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)), (!cast<InstSI>(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset, (extract_cpol $cachepolicy))> { @@ -890,6 +927,10 @@ foreach vt = SReg_64.RegTypes in { defm : SMRD_Pattern <"S_LOAD_DWORDX2", vt>; } +foreach vt = SReg_96.RegTypes in { +defm : SMRD_Pattern <"S_LOAD_DWORDX3", vt, false>; +} + foreach vt = SReg_128.RegTypes in { defm : SMRD_Pattern <"S_LOAD_DWORDX4", vt>; } @@ -906,12 +947,14 @@ defm : SMRD_Pattern <"S_LOAD_DWORDX16", vt>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX3", v3i32, false>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", f32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2f32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX3", v3f32, false>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4f32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>; @@ -934,6 +977,21 @@ def : GCNPat < } } // let OtherPredicates = [HasShaderCyclesRegister] +multiclass SMPrefetchPat<string type, int cache_type> { + def : GCNPat < + (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)), + (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0)) + >; + + def : GCNPat < + (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)), + (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0)) + >; +} + +defm : SMPrefetchPat<"INST", 0>; +defm : SMPrefetchPat<"DATA", 1>; + //===----------------------------------------------------------------------===// // GFX10. 
//===----------------------------------------------------------------------===// @@ -1154,7 +1212,7 @@ def SMInfoTable : GenericTable { class SMEM_Real_gfx11<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> : SMEM_Real_10Plus_common<op, ps, opName, SIEncodingFamily.GFX11, SGPR_NULL_gfx11plus> { - let AssemblerPredicate = isGFX11Plus; + let AssemblerPredicate = isGFX11Only; let DecoderNamespace = "GFX11"; let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, 0); let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, 0); @@ -1205,3 +1263,84 @@ multiclass SM_Real_Probe_gfx11<bits<8> op> { defm S_ATC_PROBE : SM_Real_Probe_gfx11 <0x22>; defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx11 <0x23>; + +//===----------------------------------------------------------------------===// +// GFX12. +//===----------------------------------------------------------------------===// + +class SMEM_Real_gfx12Plus<bits<6> op, SM_Pseudo ps, string opName, + int subtarget, RegisterWithSubRegs sgpr_null> : + SM_Real<ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>, Enc64 { + + let Inst{18-13} = op; + let Inst{31-26} = 0x3d; + + let Inst{55-32} = !if(ps.has_offset, offset{23-0}, !if(ps.has_soffset, 0, ?)); + let Inst{63-57} = !if(ps.has_soffset, soffset{6-0}, + !if(ps.has_offset, sgpr_null.HWEncoding{6-0}, ?)); +} + +class SMEM_Real_gfx12<bits<6> op, SM_Pseudo ps, string opName = ps.Mnemonic> : + SMEM_Real_gfx12Plus<op, ps, opName, SIEncodingFamily.GFX12, + SGPR_NULL_gfx11plus> { + let AssemblerPredicate = isGFX12Plus; + let DecoderNamespace = "GFX12"; + + let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); + let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?); +} + +class SMEM_Real_Prefetch_gfx12<bits<6> op, SM_Pseudo ps> : + SMEM_Real_gfx12<op, ps> { + bits<7> sdata; // Only 5 bits of sdata are supported. + + let sdst = ?; + let Inst{12-11} = 0; // Unused sdata bits. 
+ let Inst{10-6} = !if(ps.has_sdst, sdata{4-0}, ?); +} + +class SMEM_Real_Load_gfx12<bits<6> op, string ps, string opName, OffsetMode offsets> : + SMEM_Real_gfx12<op, !cast<SM_Pseudo>(ps # offsets.Variant), opName> { + RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass; + let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol)); + + let Inst{22-21} = cpol{4-3}; // scope + let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported +} + +multiclass SM_Real_Loads_gfx12<bits<6> op, string ps = NAME> { + defvar opName = !tolower(NAME); + def _IMM_gfx12 : SMEM_Real_Load_gfx12<op, ps, opName, IMM_Offset>; + def _SGPR_IMM_gfx12 : SMEM_Real_Load_gfx12<op, ps, opName, SGPR_IMM_Offset>; +} + +defm S_LOAD_B32 : SM_Real_Loads_gfx12<0x00, "S_LOAD_DWORD">; +defm S_LOAD_B64 : SM_Real_Loads_gfx12<0x01, "S_LOAD_DWORDX2">; +defm S_LOAD_B96 : SM_Real_Loads_gfx12<0x05, "S_LOAD_DWORDX3">; +defm S_LOAD_B128 : SM_Real_Loads_gfx12<0x02, "S_LOAD_DWORDX4">; +defm S_LOAD_B256 : SM_Real_Loads_gfx12<0x03, "S_LOAD_DWORDX8">; +defm S_LOAD_B512 : SM_Real_Loads_gfx12<0x04, "S_LOAD_DWORDX16">; + +defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx12<0x10, "S_BUFFER_LOAD_DWORD">; +defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx12<0x11, "S_BUFFER_LOAD_DWORDX2">; +defm S_BUFFER_LOAD_B96 : SM_Real_Loads_gfx12<0x15, "S_BUFFER_LOAD_DWORDX3">; +defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx12<0x12, "S_BUFFER_LOAD_DWORDX4">; +defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx12<0x13, "S_BUFFER_LOAD_DWORDX8">; +defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx12<0x14, "S_BUFFER_LOAD_DWORDX16">; + +def S_DCACHE_INV_gfx12 : SMEM_Real_gfx12<0x021, S_DCACHE_INV>; + +def S_PREFETCH_INST_gfx12 : SMEM_Real_Prefetch_gfx12<0x24, S_PREFETCH_INST>; +def S_PREFETCH_INST_PC_REL_gfx12 : SMEM_Real_Prefetch_gfx12<0x25, S_PREFETCH_INST_PC_REL>; +def S_PREFETCH_DATA_gfx12 : SMEM_Real_Prefetch_gfx12<0x26, S_PREFETCH_DATA>; +def S_BUFFER_PREFETCH_DATA_gfx12 : SMEM_Real_Prefetch_gfx12<0x27, S_BUFFER_PREFETCH_DATA>; +def S_PREFETCH_DATA_PC_REL_gfx12 : SMEM_Real_Prefetch_gfx12<0x28, S_PREFETCH_DATA_PC_REL>; + +multiclass SMEM_Real_Probe_gfx12<bits<6> op> { + defvar ps = NAME; + def _IMM_gfx12 : SMEM_Real_Prefetch_gfx12<op, !cast<SM_Probe_Pseudo>(ps#_IMM)>; + def _SGPR_IMM_gfx12 : SMEM_Real_Prefetch_gfx12<op, !cast<SM_Probe_Pseudo>(ps#_SGPR_IMM)>; +} + +defm S_ATC_PROBE : SMEM_Real_Probe_gfx12<0x22>; +defm S_ATC_PROBE_BUFFER : SMEM_Real_Probe_gfx12<0x23>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td index bee996d1b0df..c9687ac368d3 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -15,6 +15,7 @@ class SOP_Pseudo<string opName, dag outs, dag ins, string asmOps, let isPseudo = 1; let isCodeGenOnly = 1; + let Size = 4; string Mnemonic = opName; string AsmOperands = asmOps; @@ -36,7 +37,6 @@ class SOP1_Pseudo <string opName, dag outs, dag ins, let SALU = 1; let SOP1 = 1; let SchedRW = [WriteSALU]; - let Size = 4; let UseNamedOperandTable = 1; bits<1> has_src0 = 1; @@ -216,8 +216,10 @@ let Defs = [SCC] in { def S_NOT_B64 : SOP1_64 <"s_not_b64", [(set i64:$sdst, (UniformUnaryFrag<not> i64:$src0))] >; - def S_WQM_B32 : SOP1_32 <"s_wqm_b32">; - def S_WQM_B64 : SOP1_64 <"s_wqm_b64">; + def S_WQM_B32 : SOP1_32 <"s_wqm_b32", + [(set i32:$sdst, (int_amdgcn_s_wqm i32:$src0))]>; + def S_WQM_B64 : SOP1_64 <"s_wqm_b64", + [(set i64:$sdst, 
(int_amdgcn_s_wqm i64:$src0))]>; } // End Defs = [SCC] @@ -290,6 +292,7 @@ def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64", [], 1>; def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32", [], 1>; def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64", [], 1>; +let isReMaterializable = 1 in def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64", [(set i64:$sdst, (int_amdgcn_s_getpc))] >; @@ -326,8 +329,10 @@ def S_XNOR_SAVEEXEC_B64 : SOP1_64 <"s_xnor_saveexec_b64">; } // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] -def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32">; -def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64">; +def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32", + [(set i32:$sdst, (int_amdgcn_s_quadmask i32:$src0))]>; +def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64", + [(set i64:$sdst, (int_amdgcn_s_quadmask i64:$src0))]>; let Uses = [M0] in { def S_MOVRELS_B32 : SOP1_32R <"s_movrels_b32">; @@ -362,7 +367,8 @@ let SubtargetPredicate = isGFX9Plus in { } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] let isReMaterializable = 1 in - def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">; + def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32", + [(set i64:$sdst, (int_amdgcn_s_bitreplicate i32:$src0))]>; } // End SubtargetPredicate = isGFX9Plus let SubtargetPredicate = isGFX10Plus in { @@ -401,6 +407,120 @@ let SubtargetPredicate = isGFX11Plus in { } } // End SubtargetPredicate = isGFX11Plus +class SOP1_F32_Inst<string opName, SDPatternOperator Op, ValueType vt0=f32, + ValueType vt1=vt0> : + SOP1_32<opName, [(set vt0:$sdst, (UniformUnaryFrag<Op> vt1:$src0))]>; + +let SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE], + SchedRW = [WriteSFPU], isReMaterializable = 1 in { + def S_CVT_F32_I32 : SOP1_F32_Inst<"s_cvt_f32_i32", sint_to_fp, f32, i32>; + def S_CVT_F32_U32 : SOP1_F32_Inst<"s_cvt_f32_u32", uint_to_fp, f32, i32>; + + let mayRaiseFPException = 1 in { + def S_CVT_I32_F32 : SOP1_F32_Inst<"s_cvt_i32_f32", fp_to_sint, i32, f32>; + def S_CVT_U32_F32 : SOP1_F32_Inst<"s_cvt_u32_f32", fp_to_uint, i32, f32>; + def S_CVT_F32_F16 : SOP1_F32_Inst<"s_cvt_f32_f16", fpextend, f32, f16>; + def S_CVT_HI_F32_F16 : SOP1_32<"s_cvt_hi_f32_f16">; + + def S_CEIL_F32 : SOP1_F32_Inst<"s_ceil_f32", fceil>; + def S_FLOOR_F32 : SOP1_F32_Inst<"s_floor_f32", ffloor>; + def S_TRUNC_F32 : SOP1_F32_Inst<"s_trunc_f32", ftrunc>; + def S_RNDNE_F32 : SOP1_F32_Inst<"s_rndne_f32", froundeven>; + + let FPDPRounding = 1 in + def S_CVT_F16_F32 : SOP1_F32_Inst<"s_cvt_f16_f32", fpround, f16, f32>; + + def S_CEIL_F16 : SOP1_F32_Inst<"s_ceil_f16", fceil, f16>; + def S_FLOOR_F16 : SOP1_F32_Inst<"s_floor_f16", ffloor, f16>; + def S_TRUNC_F16 : SOP1_F32_Inst<"s_trunc_f16", ftrunc, f16>; + def S_RNDNE_F16 : SOP1_F32_Inst<"s_rndne_f16", froundeven, f16>; + } // End mayRaiseFPException = 1 +} // End SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE] + // SchedRW = [WriteSFPU], isReMaterializable = 1 + +let hasSideEffects = 1 in { +let has_sdst = 0 in { +let Uses = [M0] in { +def S_BARRIER_SIGNAL_M0 : SOP1_Pseudo <"s_barrier_signal m0", (outs), (ins), + "", [(int_amdgcn_s_barrier_signal_var M0)]>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (outs), (ins), + "", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst_var M0))]>{ + let Defs = [SCC]; + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_BARRIER_INIT_M0 : SOP1_Pseudo <"s_barrier_init m0", (outs), (ins), + "", []>{ + let SchedRW = [WriteBarrier]; + let 
isConvergent = 1; +} + +def S_BARRIER_INIT_IMM : SOP1_Pseudo <"s_barrier_init", (outs), + (ins SplitBarrier:$src0), "$src0", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins), + "", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_WAKEUP_BARRIER_M0 : SOP1_Pseudo <"s_wakeup_barrier m0", (outs), (ins), + "", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} +} // End Uses = [M0] + +def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs), + (ins SplitBarrier:$src0), "$src0", [(int_amdgcn_s_barrier_signal timm:$src0)]>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (outs), + (ins SplitBarrier:$src0), "$src0", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst timm:$src0))]>{ + let Defs = [SCC]; + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs), + (ins SplitBarrier:$src0), "$src0", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs), + (ins SplitBarrier:$src0), "$src0", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; + + +} +} // End has_sdst = 0 + +def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst), + (ins SplitBarrier:$src0), "$sdst, $src0", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_GET_BARRIER_STATE_M0 : SOP1_Pseudo <"s_get_barrier_state $sdst, m0", (outs SSrc_b32:$sdst), + (ins), "", []>{ + let Uses = [M0]; + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} +} // End hasSideEffects = 1 + //===----------------------------------------------------------------------===// // SOP2 Instructions //===----------------------------------------------------------------------===// @@ -424,13 +544,11 @@ class SOP2_Pseudo<string opName, dag outs, dag ins, // let sdst = xxx in { // for multiclasses that include both real and pseudo instructions. // field bits<7> sdst = 0; - // let Size = 4; // Do we need size here? 
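  // (Size for SOP pseudos is now set once in the SOP_Pseudo base class above;
  // real SOP2 encodings that carry a 32-bit literal, e.g. the new
  // s_fmaak_f32/s_fmamk_f32, presumably use the 64-bit SOP2_Real64 wrapper
  // defined below.)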
} -class SOP2_Real<bits<7> op, SOP_Pseudo ps, string real_name = ps.Mnemonic> : +class SOP2_Real<SOP_Pseudo ps, string real_name = ps.Mnemonic> : InstSI <ps.OutOperandList, ps.InOperandList, - real_name # ps.AsmOperands>, - Enc32 { + real_name # ps.AsmOperands> { let SALU = 1; let SOP2 = 1; let isPseudo = 0; @@ -444,12 +562,18 @@ class SOP2_Real<bits<7> op, SOP_Pseudo ps, string real_name = ps.Mnemonic> : let SchedRW = ps.SchedRW; let mayLoad = ps.mayLoad; let mayStore = ps.mayStore; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; // encoding bits<7> sdst; bits<8> src0; bits<8> src1; + bits<32> imm; +} +class SOP2_Real32<bits<7> op, SOP_Pseudo ps, string real_name = ps.Mnemonic> : + SOP2_Real<ps, real_name>, Enc32 { let Inst{7-0} = src0; let Inst{15-8} = src1; let Inst{22-16} = !if(ps.has_sdst, sdst, ?); @@ -457,12 +581,31 @@ class SOP2_Real<bits<7> op, SOP_Pseudo ps, string real_name = ps.Mnemonic> : let Inst{31-30} = 0x2; // encoding } +class SOP2_Real64<bits<7> op, SOP_Pseudo ps, string real_name = ps.Mnemonic> : + SOP2_Real<ps, real_name>, Enc64 { + let Inst{7-0} = src0; + let Inst{15-8} = src1; + let Inst{22-16} = !if(ps.has_sdst, sdst, ?); + let Inst{29-23} = op; + let Inst{31-30} = 0x2; // encoding + let Inst{63-32} = imm; +} + +class SOP2_F16 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < + opName, (outs SReg_32:$sdst), (ins SSrc_f16:$src0, SSrc_f16:$src1), + "$sdst, $src0, $src1", pattern +>; class SOP2_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < opName, (outs SReg_32:$sdst), (ins SSrc_b32:$src0, SSrc_b32:$src1), "$sdst, $src0, $src1", pattern >; +class SOP2_F32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < + opName, (outs SReg_32:$sdst), (ins SSrc_f32:$src0, SSrc_f32:$src1), + "$sdst, $src0, $src1", pattern +>; + class SOP2_64 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1), "$sdst, $src0, $src1", pattern @@ -518,19 +661,22 @@ def S_MAX_U32 : SOP2_32 <"s_max_u32", } // End isCommutable = 1 } // End Defs = [SCC] -def SelectPat : PatFrag < - (ops node:$src1, node:$src2), - (select SCC, $src1, $src2), - [{ return !N->isDivergent(); }] ->; +let SubtargetPredicate = isGFX12Plus in { + def S_ADD_U64 : SOP2_64<"s_add_u64">{ + let isCommutable = 1; + } -let Uses = [SCC] in { - let AddedComplexity = 20 in { - def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32", - [(set i32:$sdst, (SelectPat i32:$src0, i32:$src1))] - >; + def S_SUB_U64 : SOP2_64<"s_sub_u64">; + + def S_MUL_U64 : SOP2_64 <"s_mul_u64", + [(set i64:$sdst, (UniformBinFrag<mul> i64:$src0, i64:$src1))]> { + let isCommutable = 1; } +} // End SubtargetPredicate = isGFX12Plus + +let Uses = [SCC] in { + def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32">; def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">; } // End Uses = [SCC] @@ -705,6 +851,83 @@ let SubtargetPredicate = isGFX11Plus in { def S_PACK_HL_B32_B16 : SOP2_32<"s_pack_hl_b32_b16">; } // End SubtargetPredicate = isGFX11Plus +class SOP2_F32_Inst<string opName, SDPatternOperator Op, ValueType dstVt=f32> : + SOP2_F32<opName, + [(set dstVt:$sdst, (UniformBinFrag<Op> SSrc_f32:$src0, SSrc_f32:$src1))]>; + +class SOP2_F16_Inst<string opName, SDPatternOperator Op> : + SOP2_F16<opName, + [(set f16:$sdst, (UniformBinFrag<Op> SSrc_f16:$src0, SSrc_f16:$src1))]>; + +let SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1, + Uses = [MODE], SchedRW = [WriteSFPU] in { + let isReMaterializable = 1 in { + let isCommutable = 1 in { + def S_ADD_F32 : 
SOP2_F32_Inst<"s_add_f32", any_fadd>; + def S_MIN_F32 : SOP2_F32_Inst<"s_min_f32", fminnum_like>; + def S_MAX_F32 : SOP2_F32_Inst<"s_max_f32", fmaxnum_like>; + def S_MUL_F32 : SOP2_F32_Inst<"s_mul_f32", any_fmul>; + + let FixedSize = 1 in + def S_FMAAK_F32 : SOP2_Pseudo< + "s_fmaak_f32", (outs SReg_32:$sdst), + (ins SSrc_f32_Deferred:$src0, SSrc_f32_Deferred:$src1, KImmFP32:$imm), + "$sdst, $src0, $src1, $imm" + >; + + let FPDPRounding = 1 in { + def S_ADD_F16 : SOP2_F16_Inst<"s_add_f16", any_fadd>; + def S_MUL_F16 : SOP2_F16_Inst<"s_mul_f16", any_fmul>; + } // End FPDPRounding + + def S_MIN_F16 : SOP2_F16_Inst<"s_min_f16", fminnum_like>; + def S_MAX_F16 : SOP2_F16_Inst<"s_max_f16", fmaxnum_like>; + } // End isCommutable = 1 + + let FPDPRounding = 1 in + def S_SUB_F16 : SOP2_F16_Inst<"s_sub_f16", any_fsub>; + + def S_SUB_F32 : SOP2_F32_Inst<"s_sub_f32", any_fsub>; + def S_CVT_PK_RTZ_F16_F32 : SOP2_F32_Inst<"s_cvt_pk_rtz_f16_f32", + AMDGPUpkrtz_f16_f32, v2f16>; + + let FixedSize = 1 in + def S_FMAMK_F32 : SOP2_Pseudo< + "s_fmamk_f32", (outs SReg_32:$sdst), + (ins SSrc_f32_Deferred:$src0, KImmFP32:$imm, SSrc_f32_Deferred:$src1), + "$sdst, $src0, $imm, $src1" + >; + } // End isReMaterializable = 1 + + let Constraints = "$sdst = $src2", DisableEncoding="$src2", + isCommutable = 1, AddedComplexity = 20 in { + def S_FMAC_F32 : SOP2_Pseudo< + "s_fmac_f32", (outs SReg_32:$sdst), + (ins SSrc_f32:$src0, SSrc_f32:$src1, SReg_32:$src2), + "$sdst, $src0, $src1", + [(set f32:$sdst, (UniformTernaryFrag<any_fma> SSrc_f32:$src0, SSrc_f32:$src1, SReg_32:$src2))] + >; + + def S_FMAC_F16 : SOP2_Pseudo< + "s_fmac_f16", (outs SReg_32:$sdst), + (ins SSrc_f16:$src0, SSrc_f16:$src1, SReg_32:$src2), + "$sdst, $src0, $src1", + [(set f16:$sdst, (UniformTernaryFrag<any_fma> SSrc_f16:$src0, SSrc_f16:$src1, SReg_32:$src2))] + >; + } // End Constraints = "$sdst = $src2", DisableEncoding="$src2", + // isCommutable = 1, AddedComplexity = 20 +} // End SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1, + // Uses = [MODE], SchedRW = [WriteSFPU] + +// On GFX12 MIN/MAX instructions do not read MODE register. +let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 1, isCommutable = 1, + isReMaterializable = 1, SchedRW = [WriteSFPU] in { + def S_MINIMUM_F32 : SOP2_F32_Inst<"s_minimum_f32", fminimum>; + def S_MAXIMUM_F32 : SOP2_F32_Inst<"s_maximum_f32", fmaximum>; + def S_MINIMUM_F16 : SOP2_F16_Inst<"s_minimum_f16", fminimum>; + def S_MAXIMUM_F16 : SOP2_F16_Inst<"s_maximum_f16", fmaximum>; +} + //===----------------------------------------------------------------------===// // SOPK Instructions //===----------------------------------------------------------------------===// @@ -724,9 +947,9 @@ class SOPK_Pseudo <string opName, dag outs, dag ins, let has_sdst = 1; } -class SOPK_Real<SOPK_Pseudo ps> : +class SOPK_Real<SOPK_Pseudo ps, string real_name = ps.Mnemonic> : InstSI <ps.OutOperandList, ps.InOperandList, - ps.Mnemonic # ps.AsmOperands> { + real_name # ps.AsmOperands> { let SALU = 1; let SOPK = 1; let isPseudo = 0; @@ -750,8 +973,8 @@ class SOPK_Real<SOPK_Pseudo ps> : bits<32> imm; } -class SOPK_Real32<bits<5> op, SOPK_Pseudo ps> : - SOPK_Real <ps>, +class SOPK_Real32<bits<5> op, SOPK_Pseudo ps, string real_name = ps.Mnemonic> : + SOPK_Real <ps, real_name>, Enc32 { let Inst{15-0} = simm16; let Inst{22-16} = !if(ps.has_sdst, sdst, ?); @@ -870,6 +1093,8 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo < // This is hasSideEffects to allow its use in readcyclecounter selection. 
// FIXME: Need to truncate immediate to 16-bits. +// FIXME: Missing mode register use. Should have separate pseudos for +// known may read MODE and only read MODE. def S_GETREG_B32 : SOPK_Pseudo < "s_getreg_b32", (outs SReg_32:$sdst), (ins hwreg:$simm16), @@ -956,10 +1181,14 @@ let SubtargetPredicate = isGFX10Plus in { "$simm16"> { let has_sdst = 0; } +} // End SubtargetPredicate = isGFX10Plus +let SubtargetPredicate = isGFX10GFX11 in { def S_SUBVECTOR_LOOP_BEGIN : SOPK_32_BR<"s_subvector_loop_begin">; def S_SUBVECTOR_LOOP_END : SOPK_32_BR<"s_subvector_loop_end">; +} // End SubtargetPredicate = isGFX10GFX11 +let SubtargetPredicate = isGFX10Plus in { def S_WAITCNT_VSCNT : SOPK_WAITCNT<"s_waitcnt_vscnt">; def S_WAITCNT_VMCNT : SOPK_WAITCNT<"s_waitcnt_vmcnt">; def S_WAITCNT_EXPCNT : SOPK_WAITCNT<"s_waitcnt_expcnt">; @@ -1033,6 +1262,30 @@ class SOPC_CMP_32<string opName, let isCommutable = 1; } +class SOPC_CMP_F32<string opName, + SDPatternOperator cond = COND_NULL, string revOp = opName> + : SOPC_Helper<SSrc_b32, f32, opName, cond>, + Commutable_REV<revOp, !eq(revOp, opName)>, + SOPKInstTable<0, opName> { + let isCompare = 1; + let isCommutable = 1; + let mayRaiseFPException = 1; + let Uses = [MODE]; + let SchedRW = [WriteSFPU]; +} + +class SOPC_CMP_F16<string opName, + SDPatternOperator cond = COND_NULL, string revOp = opName> + : SOPC_Helper<SSrc_b16, f16, opName, cond>, + Commutable_REV<revOp, !eq(revOp, opName)>, + SOPKInstTable<0, opName> { + let isCompare = 1; + let isCommutable = 1; + let mayRaiseFPException = 1; + let Uses = [MODE]; + let SchedRW = [WriteSFPU]; +} + class SOPC_CMP_64<string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> : SOPC_Helper<SSrc_b64, i64, opName, cond>, @@ -1089,6 +1342,40 @@ def S_SET_GPR_IDX_ON : SOPC_Pseudo < } } +let SubtargetPredicate = HasSALUFloatInsts in { + +def S_CMP_LT_F32 : SOPC_CMP_F32<"s_cmp_lt_f32", COND_OLT, "s_cmp_gt_f32">; +def S_CMP_EQ_F32 : SOPC_CMP_F32<"s_cmp_eq_f32", COND_OEQ>; +def S_CMP_LE_F32 : SOPC_CMP_F32<"s_cmp_le_f32", COND_OLE, "s_cmp_ge_f32">; +def S_CMP_GT_F32 : SOPC_CMP_F32<"s_cmp_gt_f32", COND_OGT>; +def S_CMP_LG_F32 : SOPC_CMP_F32<"s_cmp_lg_f32", COND_ONE>; +def S_CMP_GE_F32 : SOPC_CMP_F32<"s_cmp_ge_f32", COND_OGE>; +def S_CMP_O_F32 : SOPC_CMP_F32<"s_cmp_o_f32", COND_O>; +def S_CMP_U_F32 : SOPC_CMP_F32<"s_cmp_u_f32", COND_UO>; +def S_CMP_NGE_F32 : SOPC_CMP_F32<"s_cmp_nge_f32", COND_ULT, "s_cmp_nle_f32">; +def S_CMP_NLG_F32 : SOPC_CMP_F32<"s_cmp_nlg_f32", COND_UEQ>; +def S_CMP_NGT_F32 : SOPC_CMP_F32<"s_cmp_ngt_f32", COND_ULE, "s_cmp_nlt_f32">; +def S_CMP_NLE_F32 : SOPC_CMP_F32<"s_cmp_nle_f32", COND_UGT>; +def S_CMP_NEQ_F32 : SOPC_CMP_F32<"s_cmp_neq_f32", COND_UNE>; +def S_CMP_NLT_F32 : SOPC_CMP_F32<"s_cmp_nlt_f32", COND_UGE>; + +def S_CMP_LT_F16 : SOPC_CMP_F16<"s_cmp_lt_f16", COND_OLT, "s_cmp_gt_f16">; +def S_CMP_EQ_F16 : SOPC_CMP_F16<"s_cmp_eq_f16", COND_OEQ>; +def S_CMP_LE_F16 : SOPC_CMP_F16<"s_cmp_le_f16", COND_OLE, "s_cmp_ge_f16">; +def S_CMP_GT_F16 : SOPC_CMP_F16<"s_cmp_gt_f16", COND_OGT>; +def S_CMP_LG_F16 : SOPC_CMP_F16<"s_cmp_lg_f16", COND_ONE>; +def S_CMP_GE_F16 : SOPC_CMP_F16<"s_cmp_ge_f16", COND_OGE>; +def S_CMP_O_F16 : SOPC_CMP_F16<"s_cmp_o_f16", COND_O>; +def S_CMP_U_F16 : SOPC_CMP_F16<"s_cmp_u_f16", COND_UO>; +def S_CMP_NGE_F16 : SOPC_CMP_F16<"s_cmp_nge_f16", COND_ULT, "s_cmp_nle_f16">; +def S_CMP_NLG_F16 : SOPC_CMP_F16<"s_cmp_nlg_f16", COND_UEQ>; +def S_CMP_NGT_F16 : SOPC_CMP_F16<"s_cmp_ngt_f16", COND_ULE, "s_cmp_nlt_f16">; +def S_CMP_NLE_F16 : SOPC_CMP_F16<"s_cmp_nle_f16", COND_UGT>; +def 
S_CMP_NEQ_F16 : SOPC_CMP_F16<"s_cmp_neq_f16", COND_UNE>; +def S_CMP_NLT_F16 : SOPC_CMP_F16<"s_cmp_nlt_f16", COND_UGE>; + +} // End SubtargetPredicate = HasSALUFloatInsts + //===----------------------------------------------------------------------===// // SOPP Instructions //===----------------------------------------------------------------------===// @@ -1161,7 +1448,10 @@ multiclass SOPP_With_Relaxation <string opName, dag ins, def _pad_s_nop : SOPP_Pseudo <opName # "_pad_s_nop", ins, asmOps, pattern, " ", opName>; } -def S_NOP : SOPP_Pseudo<"s_nop" , (ins i16imm:$simm16), "$simm16">; +def S_NOP : SOPP_Pseudo<"s_nop" , (ins i16imm:$simm16), "$simm16", + [(int_amdgcn_s_nop timm:$simm16)]> { + let hasSideEffects = 1; +} let isTerminator = 1 in { def S_ENDPGM : SOPP_Pseudo<"s_endpgm", (ins Endpgm:$simm16), "$simm16", [], ""> { @@ -1264,6 +1554,21 @@ def S_BARRIER : SOPP_Pseudo <"s_barrier", (ins), "", let isConvergent = 1; } +def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm16", + [(int_amdgcn_s_barrier_wait timm:$simm16)]> { + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins), "", + [(set SCC, (int_amdgcn_s_barrier_leave))]> { + let SchedRW = [WriteBarrier]; + let simm16 = 0; + let fixed_imm = 1; + let isConvergent = 1; + let Defs = [SCC]; +} + def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { let SubtargetPredicate = isGFX8Plus; let simm16 = 0; @@ -1272,9 +1577,19 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { let mayStore = 1; } -let hasSideEffects = 1 in def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16", [(int_amdgcn_s_waitcnt timm:$simm16)]>; + +// "_soft" waitcnts are waitcnts that are either relaxed into their non-soft +// counterpart, or completely removed. +// +// These are inserted by SIMemoryLegalizer to resolve memory dependencies +// and later optimized by SIInsertWaitcnts +// For example, a S_WAITCNT_soft 0 can be completely removed in a function +// that doesn't access memory. +def S_WAITCNT_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">; +def S_WAITCNT_VSCNT_soft : SOPK_WAITCNT<"s_soft_waitcnt_vscnt">; + def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; @@ -1285,23 +1600,23 @@ def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; // maximum really 15 on VI? 
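Returning briefly to the scalar floating-point compares defined earlier in this section (s_cmp_*_f32/f16): each predicate comes in an ordered (COND_O*) and an unordered (COND_U*) flavour, and the two only differ when a NaN operand is present, which is why, for example, s_cmp_lt_f32 (COND_OLT) and s_cmp_nge_f32 (COND_ULT) are distinct opcodes. A minimal plain-C++ illustration of the predicate semantics (generic IEEE-754 behaviour, not AMDGPU or LLVM code):

#include <cmath>

// COND_OLT: true only when both inputs are numbers and a < b.
bool cmp_lt_f32(float a, float b) { return a < b; }
// COND_ULT ("not greater-or-equal"): true when a < b or when either input is NaN.
bool cmp_nge_f32(float a, float b) { return !(a >= b); }
// COND_O: true when neither input is NaN.
bool cmp_o_f32(float a, float b) { return !std::isnan(a) && !std::isnan(b); }
// COND_UO: true when at least one input is NaN.
bool cmp_u_f32(float a, float b) { return std::isnan(a) || std::isnan(b); }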
def S_SLEEP : SOPP_Pseudo <"s_sleep", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sleep timm:$simm16)]> { +} + +def S_SLEEP_VAR : SOP1_0_32 <"s_sleep_var", [(int_amdgcn_s_sleep_var SSrc_b32:$src0)]> { let hasSideEffects = 1; } def S_SETPRIO : SOPP_Pseudo <"s_setprio", (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_setprio timm:$simm16)]> { - let hasSideEffects = 1; } let Uses = [EXEC, M0] in { def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsg:$simm16), "$simm16", [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]> { - let hasSideEffects = 1; } def S_SENDMSGHALT : SOPP_Pseudo <"s_sendmsghalt" , (ins SendMsg:$simm16), "$simm16", [(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]> { - let hasSideEffects = 1; } } // End Uses = [EXEC, M0] @@ -1316,13 +1631,14 @@ def S_ICACHE_INV : SOPP_Pseudo <"s_icache_inv", (ins)> { } def S_INCPERFLEVEL : SOPP_Pseudo <"s_incperflevel", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_incperflevel timm:$simm16)]> { - let hasSideEffects = 1; } def S_DECPERFLEVEL : SOPP_Pseudo <"s_decperflevel", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_decperflevel timm:$simm16)]> { - let hasSideEffects = 1; } -def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins)> { + +let Uses = [M0] in +def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins), "", + [(int_amdgcn_s_ttracedata M0)]> { let simm16 = 0; let fixed_imm = 1; } @@ -1366,8 +1682,10 @@ let SubtargetPredicate = isGFX10Plus in { [(SIdenorm_mode (i32 timm:$simm16))]>; } + let hasSideEffects = 1 in def S_TTRACEDATA_IMM : - SOPP_Pseudo<"s_ttracedata_imm", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_ttracedata_imm", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_ttracedata_imm timm:$simm16)]>; } // End SubtargetPredicate = isGFX10Plus let SubtargetPredicate = isGFX11Plus in { @@ -1379,6 +1697,11 @@ let SubtargetPredicate = isGFX11Plus in { "$simm16">; } // End SubtargetPredicate = isGFX11Plus +let SubtargetPredicate = HasVGPRSingleUseHintInsts in { + def S_SINGLEUSE_VDST : + SOPP_Pseudo<"s_singleuse_vdst", (ins s16imm:$simm16), "$simm16">; +} // End SubtargetPredicate = HasVGPRSingeUseHintInsts + //===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// @@ -1421,10 +1744,91 @@ def : GCNPat < (S_WAIT_EVENT (i16 0)) >; +// The first 10 bits of the mode register are the core FP mode on all +// subtargets. +// +// The high bits include additional fields, intermixed with some +// non-floating point environment information. We extract the full +// register and clear non-relevant bits. +// +// EXCP_EN covers floating point exceptions, but also some other +// non-FP exceptions. +// +// Bits 12-18 cover the relevant exception mask on all subtargets. +// +// FIXME: Bit 18 is int_div0, should this be in the FP environment? I +// think the only source is v_rcp_iflag_i32. +// +// On GFX9+: +// Bit 23 is the additional FP16_OVFL mode. +// +// Bits 19, 20, and 21 cover non-FP exceptions and differ between +// gfx9/10/11, so we ignore them here. + +// TODO: Would it be cheaper to emit multiple s_getreg_b32 calls for +// the ranges and combine the results? + +defvar fp_round_mask = !add(!shl(1, 4), -1); +defvar fp_denorm_mask = !shl(!add(!shl(1, 4), -1), 4); +defvar dx10_clamp_mask = !shl(1, 8); +defvar ieee_mode_mask = !shl(1, 9); + +// Covers fp_round, fp_denorm, dx10_clamp, and IEEE bit. 
+defvar fpmode_mask = + !or(fp_round_mask, fp_denorm_mask, dx10_clamp_mask, ieee_mode_mask); + +defvar fp_excp_en_mask = !shl(!add(!shl(1, 7), -1), 12); +defvar fp16_ovfl = !shl(1, 23); +defvar fpmode_mask_gfx6plus = !or(fpmode_mask, fp_excp_en_mask); +defvar fpmode_mask_gfx9plus = !or(fpmode_mask_gfx6plus, fp16_ovfl); + +class GetFPModePat<int fpmode_mask> : GCNPat< + (i32 get_fpmode), + (S_AND_B32 (i32 fpmode_mask), + (S_GETREG_B32 getHwRegImm< + HWREG.MODE, 0, + !add(!logtwo(fpmode_mask), 1)>.ret)) +>; + +// TODO: Might be worth moving to custom lowering so the and is +// exposed to demanded bits optimizations. Most users probably only +// care about the rounding or denorm mode bits. We also can reduce the +// demanded read from the getreg immediate. +let SubtargetPredicate = isGFX9Plus in { +// Last bit = FP16_OVFL +def : GetFPModePat<fpmode_mask_gfx9plus>; +} + +// Last bit = EXCP_EN.int_div0 +let SubtargetPredicate = isNotGFX9Plus in { +def : GetFPModePat<fpmode_mask_gfx6plus>; +} + //===----------------------------------------------------------------------===// // SOP2 Patterns //===----------------------------------------------------------------------===// +def UniformSelect : PatFrag< + (ops node:$src0, node:$src1), + (select SCC, $src0, $src1), + [{ return !N->isDivergent(); }] +>; + +let AddedComplexity = 20 in { + def : GCNPat< + (i32 (UniformSelect i32:$src0, i32:$src1)), + (S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1) + >; + + // TODO: The predicate should not be necessary, but enabling this pattern for + // all subtargets generates worse code in some cases. + let OtherPredicates = [HasPseudoScalarTrans] in + def : GCNPat< + (f32 (UniformSelect f32:$src0, f32:$src1)), + (S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1) + >; +} + // V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector // case, the sgpr-copies pass will fix this to use the vector version. def : GCNPat < @@ -1476,6 +1880,11 @@ def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>; // Target-specific instruction encodings. //===----------------------------------------------------------------------===// +class Select_gfx12<string opName> : SIMCInstr<opName, SIEncodingFamily.GFX12> { + Predicate AssemblerPredicate = isGFX12Only; + string DecoderNamespace = "GFX12"; +} + class Select_gfx11<string opName> : SIMCInstr<opName, SIEncodingFamily.GFX11> { Predicate AssemblerPredicate = isGFX11Only; string DecoderNamespace = "GFX11"; @@ -1497,85 +1906,143 @@ class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> { } //===----------------------------------------------------------------------===// -// GFX11. 
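Before moving on to the per-subtarget real encodings, here is a standalone C++ sketch of the MODE-register mask arithmetic used by the get_fpmode patterns above. The constant names are illustrative, not LLVM identifiers; the values simply mirror the !shl/!add/!or defvar expressions and make the resulting hardware constants and getHwRegImm widths visible:

#include <bit>
#include <cstdint>

constexpr uint32_t FPRoundMask   = (1u << 4) - 1;         // bits 3:0   = 0x0000000f
constexpr uint32_t FPDenormMask  = ((1u << 4) - 1) << 4;  // bits 7:4   = 0x000000f0
constexpr uint32_t DX10ClampMask = 1u << 8;               // bit 8      = 0x00000100
constexpr uint32_t IEEEModeMask  = 1u << 9;               // bit 9      = 0x00000200
constexpr uint32_t FPModeMask    =
    FPRoundMask | FPDenormMask | DX10ClampMask | IEEEModeMask;           // 0x000003ff
constexpr uint32_t FPExcpEnMask  = ((1u << 7) - 1) << 12; // bits 18:12 = 0x0007f000
constexpr uint32_t FP16OvflMask  = 1u << 23;              // bit 23     = 0x00800000
constexpr uint32_t FPModeMaskGFX6Plus = FPModeMask | FPExcpEnMask;        // 0x0007f3ff
constexpr uint32_t FPModeMaskGFX9Plus = FPModeMaskGFX6Plus | FP16OvflMask; // 0x0087f3ff

// The getreg width is !add(!logtwo(mask), 1), i.e. the index of the highest
// set bit plus one, so the pre-GFX9 pattern reads 19 MODE bits and the GFX9+
// pattern reads 24.
static_assert(std::bit_width(FPModeMaskGFX6Plus) == 19);
static_assert(std::bit_width(FPModeMaskGFX9Plus) == 24);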
+// SOP1 - GFX11, GFX12 //===----------------------------------------------------------------------===// +multiclass SOP1_Real_gfx12<bits<8> op> { + def _gfx12 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>, + Select_gfx12<!cast<SOP1_Pseudo>(NAME).Mnemonic>; +} + +multiclass SOP1_M0_Real_gfx12<bits<8> op> { + def _gfx12 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>, + Select_gfx12<!cast<SOP1_Pseudo>(NAME).Mnemonic> { + let Inst{7-0} = M0_gfx11plus.HWEncoding{7-0}; // Set Src0 encoding to M0 + } +} + multiclass SOP1_Real_gfx11<bits<8> op> { def _gfx11 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>, Select_gfx11<!cast<SOP1_Pseudo>(NAME).Mnemonic>; } +multiclass SOP1_Real_Renamed_gfx12<bits<8> op, SOP1_Pseudo backing_pseudo, string real_name> { + def _gfx12 : SOP1_Real<op, backing_pseudo, real_name>, + Select_gfx12<backing_pseudo.Mnemonic>, + MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX12Plus]>; +} + multiclass SOP1_Real_Renamed_gfx11<bits<8> op, SOP1_Pseudo backing_pseudo, string real_name> { def _gfx11 : SOP1_Real<op, backing_pseudo, real_name>, Select_gfx11<backing_pseudo.Mnemonic>, - MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>; -} - -defm S_MOV_B32 : SOP1_Real_gfx11<0x000>; -defm S_MOV_B64 : SOP1_Real_gfx11<0x001>; -defm S_CMOV_B32 : SOP1_Real_gfx11<0x002>; -defm S_CMOV_B64 : SOP1_Real_gfx11<0x003>; -defm S_BREV_B32 : SOP1_Real_gfx11<0x004>; -defm S_BREV_B64 : SOP1_Real_gfx11<0x005>; -defm S_CTZ_I32_B32 : SOP1_Real_Renamed_gfx11<0x008, S_FF1_I32_B32, "s_ctz_i32_b32">; -defm S_CTZ_I32_B64 : SOP1_Real_Renamed_gfx11<0x009, S_FF1_I32_B64, "s_ctz_i32_b64">; -defm S_CLZ_I32_U32 : SOP1_Real_Renamed_gfx11<0x00a, S_FLBIT_I32_B32, "s_clz_i32_u32">; -defm S_CLZ_I32_U64 : SOP1_Real_Renamed_gfx11<0x00b, S_FLBIT_I32_B64, "s_clz_i32_u64">; -defm S_CLS_I32 : SOP1_Real_Renamed_gfx11<0x00c, S_FLBIT_I32, "s_cls_i32">; -defm S_CLS_I32_I64 : SOP1_Real_Renamed_gfx11<0x00d, S_FLBIT_I32_I64, "s_cls_i32_i64">; -defm S_SEXT_I32_I8 : SOP1_Real_gfx11<0x00e>; -defm S_SEXT_I32_I16 : SOP1_Real_gfx11<0x00f>; -defm S_BITSET0_B32 : SOP1_Real_gfx11<0x010>; -defm S_BITSET0_B64 : SOP1_Real_gfx11<0x011>; -defm S_BITSET1_B32 : SOP1_Real_gfx11<0x012>; -defm S_BITSET1_B64 : SOP1_Real_gfx11<0x013>; -defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx11<0x014>; -defm S_ABS_I32 : SOP1_Real_gfx11<0x015>; -defm S_BCNT0_I32_B32 : SOP1_Real_gfx11<0x016>; -defm S_BCNT0_I32_B64 : SOP1_Real_gfx11<0x017>; -defm S_BCNT1_I32_B32 : SOP1_Real_gfx11<0x018>; -defm S_BCNT1_I32_B64 : SOP1_Real_gfx11<0x019>; -defm S_QUADMASK_B32 : SOP1_Real_gfx11<0x01a>; -defm S_QUADMASK_B64 : SOP1_Real_gfx11<0x01b>; -defm S_WQM_B32 : SOP1_Real_gfx11<0x01c>; -defm S_WQM_B64 : SOP1_Real_gfx11<0x01d>; -defm S_NOT_B32 : SOP1_Real_gfx11<0x01e>; -defm S_NOT_B64 : SOP1_Real_gfx11<0x01f>; -defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx11<0x020>; -defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx11<0x021>; -defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x022>; -defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x023>; -defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x024>; -defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x025>; -defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx11<0x026>; -defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx11<0x027>; -defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x028>; -defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x029>; -defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x02a>; -/*defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x02b>; //same as older arch, handled there*/ -defm S_AND_NOT0_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x02c, S_ANDN1_SAVEEXEC_B32, "s_and_not0_saveexec_b32">; 
-defm S_AND_NOT0_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x02d, S_ANDN1_SAVEEXEC_B64, "s_and_not0_saveexec_b64">; -defm S_OR_NOT0_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x02e, S_ORN1_SAVEEXEC_B32, "s_or_not0_saveexec_b32">; -defm S_OR_NOT0_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x02f, S_ORN1_SAVEEXEC_B64, "s_or_not0_saveexec_b64">; -defm S_AND_NOT1_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x030, S_ANDN2_SAVEEXEC_B32, "s_and_not1_saveexec_b32">; -defm S_AND_NOT1_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x031, S_ANDN2_SAVEEXEC_B64, "s_and_not1_saveexec_b64">; -defm S_OR_NOT1_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x032, S_ORN2_SAVEEXEC_B32, "s_or_not1_saveexec_b32">; -defm S_OR_NOT1_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x033, S_ORN2_SAVEEXEC_B64, "s_or_not1_saveexec_b64">; -defm S_AND_NOT0_WREXEC_B32 : SOP1_Real_Renamed_gfx11<0x034, S_ANDN1_WREXEC_B32, "s_and_not0_wrexec_b32">; -defm S_AND_NOT0_WREXEC_B64 : SOP1_Real_Renamed_gfx11<0x035, S_ANDN1_WREXEC_B64, "s_and_not0_wrexec_b64">; -defm S_AND_NOT1_WREXEC_B32 : SOP1_Real_Renamed_gfx11<0x036, S_ANDN2_WREXEC_B32, "s_and_not1_wrexec_b32">; -defm S_AND_NOT1_WREXEC_B64 : SOP1_Real_Renamed_gfx11<0x037, S_ANDN2_WREXEC_B64, "s_and_not1_wrexec_b64">; -defm S_MOVRELS_B32 : SOP1_Real_gfx11<0x040>; -defm S_MOVRELS_B64 : SOP1_Real_gfx11<0x041>; -defm S_MOVRELD_B32 : SOP1_Real_gfx11<0x042>; -defm S_MOVRELD_B64 : SOP1_Real_gfx11<0x043>; -defm S_MOVRELSD_2_B32 : SOP1_Real_gfx11<0x044>; -defm S_GETPC_B64 : SOP1_Real_gfx11<0x047>; -defm S_SETPC_B64 : SOP1_Real_gfx11<0x048>; -defm S_SWAPPC_B64 : SOP1_Real_gfx11<0x049>; -defm S_RFE_B64 : SOP1_Real_gfx11<0x04a>; -defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11<0x04c>; -defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11<0x04d>; + MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Only]>; +} + +multiclass SOP1_Real_gfx11_gfx12<bits<8> op> : + SOP1_Real_gfx11<op>, SOP1_Real_gfx12<op>; + +multiclass SOP1_Real_Renamed_gfx11_gfx12<bits<8> op, SOP1_Pseudo backing_pseudo, string real_name> : + SOP1_Real_Renamed_gfx11<op, backing_pseudo, real_name>, + SOP1_Real_Renamed_gfx12<op, backing_pseudo, real_name>; + +defm S_MOV_B32 : SOP1_Real_gfx11_gfx12<0x000>; +defm S_MOV_B64 : SOP1_Real_gfx11_gfx12<0x001>; +defm S_CMOV_B32 : SOP1_Real_gfx11_gfx12<0x002>; +defm S_CMOV_B64 : SOP1_Real_gfx11_gfx12<0x003>; +defm S_BREV_B32 : SOP1_Real_gfx11_gfx12<0x004>; +defm S_BREV_B64 : SOP1_Real_gfx11_gfx12<0x005>; +defm S_CTZ_I32_B32 : SOP1_Real_Renamed_gfx11_gfx12<0x008, S_FF1_I32_B32, "s_ctz_i32_b32">; +defm S_CTZ_I32_B64 : SOP1_Real_Renamed_gfx11_gfx12<0x009, S_FF1_I32_B64, "s_ctz_i32_b64">; +defm S_CLZ_I32_U32 : SOP1_Real_Renamed_gfx11_gfx12<0x00a, S_FLBIT_I32_B32, "s_clz_i32_u32">; +defm S_CLZ_I32_U64 : SOP1_Real_Renamed_gfx11_gfx12<0x00b, S_FLBIT_I32_B64, "s_clz_i32_u64">; +defm S_CLS_I32 : SOP1_Real_Renamed_gfx11_gfx12<0x00c, S_FLBIT_I32, "s_cls_i32">; +defm S_CLS_I32_I64 : SOP1_Real_Renamed_gfx11_gfx12<0x00d, S_FLBIT_I32_I64, "s_cls_i32_i64">; +defm S_SEXT_I32_I8 : SOP1_Real_gfx11_gfx12<0x00e>; +defm S_SEXT_I32_I16 : SOP1_Real_gfx11_gfx12<0x00f>; +defm S_BITSET0_B32 : SOP1_Real_gfx11_gfx12<0x010>; +defm S_BITSET0_B64 : SOP1_Real_gfx11_gfx12<0x011>; +defm S_BITSET1_B32 : SOP1_Real_gfx11_gfx12<0x012>; +defm S_BITSET1_B64 : SOP1_Real_gfx11_gfx12<0x013>; +defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx11_gfx12<0x014>; +defm S_ABS_I32 : SOP1_Real_gfx11_gfx12<0x015>; +defm S_BCNT0_I32_B32 : SOP1_Real_gfx11_gfx12<0x016>; +defm S_BCNT0_I32_B64 : SOP1_Real_gfx11_gfx12<0x017>; +defm S_BCNT1_I32_B32 : SOP1_Real_gfx11_gfx12<0x018>; +defm 
S_BCNT1_I32_B64 : SOP1_Real_gfx11_gfx12<0x019>; +defm S_QUADMASK_B32 : SOP1_Real_gfx11_gfx12<0x01a>; +defm S_QUADMASK_B64 : SOP1_Real_gfx11_gfx12<0x01b>; +defm S_WQM_B32 : SOP1_Real_gfx11_gfx12<0x01c>; +defm S_WQM_B64 : SOP1_Real_gfx11_gfx12<0x01d>; +defm S_NOT_B32 : SOP1_Real_gfx11_gfx12<0x01e>; +defm S_NOT_B64 : SOP1_Real_gfx11_gfx12<0x01f>; +defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx11_gfx12<0x020>; +defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx11_gfx12<0x021>; +defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx11_gfx12<0x022>; +defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx11_gfx12<0x023>; +defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx11_gfx12<0x024>; +defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx11_gfx12<0x025>; +defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx11_gfx12<0x026>; +defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx11_gfx12<0x027>; +defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx11_gfx12<0x028>; +defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx11_gfx12<0x029>; +defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx11_gfx12<0x02a>; +/*defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx11_gfx12<0x02b>; //same as older arch, handled there*/ +defm S_AND_NOT0_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11_gfx12<0x02c, S_ANDN1_SAVEEXEC_B32, "s_and_not0_saveexec_b32">; +defm S_AND_NOT0_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11_gfx12<0x02d, S_ANDN1_SAVEEXEC_B64, "s_and_not0_saveexec_b64">; +defm S_OR_NOT0_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11_gfx12<0x02e, S_ORN1_SAVEEXEC_B32, "s_or_not0_saveexec_b32">; +defm S_OR_NOT0_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11_gfx12<0x02f, S_ORN1_SAVEEXEC_B64, "s_or_not0_saveexec_b64">; +defm S_AND_NOT1_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11_gfx12<0x030, S_ANDN2_SAVEEXEC_B32, "s_and_not1_saveexec_b32">; +defm S_AND_NOT1_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11_gfx12<0x031, S_ANDN2_SAVEEXEC_B64, "s_and_not1_saveexec_b64">; +defm S_OR_NOT1_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11_gfx12<0x032, S_ORN2_SAVEEXEC_B32, "s_or_not1_saveexec_b32">; +defm S_OR_NOT1_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11_gfx12<0x033, S_ORN2_SAVEEXEC_B64, "s_or_not1_saveexec_b64">; +defm S_AND_NOT0_WREXEC_B32 : SOP1_Real_Renamed_gfx11_gfx12<0x034, S_ANDN1_WREXEC_B32, "s_and_not0_wrexec_b32">; +defm S_AND_NOT0_WREXEC_B64 : SOP1_Real_Renamed_gfx11_gfx12<0x035, S_ANDN1_WREXEC_B64, "s_and_not0_wrexec_b64">; +defm S_AND_NOT1_WREXEC_B32 : SOP1_Real_Renamed_gfx11_gfx12<0x036, S_ANDN2_WREXEC_B32, "s_and_not1_wrexec_b32">; +defm S_AND_NOT1_WREXEC_B64 : SOP1_Real_Renamed_gfx11_gfx12<0x037, S_ANDN2_WREXEC_B64, "s_and_not1_wrexec_b64">; +defm S_MOVRELS_B32 : SOP1_Real_gfx11_gfx12<0x040>; +defm S_MOVRELS_B64 : SOP1_Real_gfx11_gfx12<0x041>; +defm S_MOVRELD_B32 : SOP1_Real_gfx11_gfx12<0x042>; +defm S_MOVRELD_B64 : SOP1_Real_gfx11_gfx12<0x043>; +defm S_MOVRELSD_2_B32 : SOP1_Real_gfx11_gfx12<0x044>; +defm S_GETPC_B64 : SOP1_Real_gfx11_gfx12<0x047>; +defm S_SETPC_B64 : SOP1_Real_gfx11_gfx12<0x048>; +defm S_SWAPPC_B64 : SOP1_Real_gfx11_gfx12<0x049>; +defm S_RFE_B64 : SOP1_Real_gfx11_gfx12<0x04a>; +defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11_gfx12<0x04c>; +defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12<0x04d>; +defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12<0x04e>; +defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12<0x04f>; +defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>; +defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12<0x051>; +defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12<0x052>; +defm S_WAKEUP_BARRIER_M0 : SOP1_M0_Real_gfx12<0x057>; +defm S_BARRIER_SIGNAL_IMM : SOP1_Real_gfx12<0x04e>; +defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Real_gfx12<0x04f>; +defm S_GET_BARRIER_STATE_IMM : SOP1_Real_gfx12<0x050>; +defm 
S_BARRIER_INIT_IMM : SOP1_Real_gfx12<0x051>; +defm S_BARRIER_JOIN_IMM : SOP1_Real_gfx12<0x052>; +defm S_WAKEUP_BARRIER_IMM : SOP1_Real_gfx12<0x057>; +defm S_SLEEP_VAR : SOP1_Real_gfx12<0x058>; + +//===----------------------------------------------------------------------===// +// SOP1 - GFX1150, GFX12 +//===----------------------------------------------------------------------===// + +defm S_CEIL_F32 : SOP1_Real_gfx11_gfx12<0x060>; +defm S_FLOOR_F32 : SOP1_Real_gfx11_gfx12<0x061>; +defm S_TRUNC_F32 : SOP1_Real_gfx11_gfx12<0x062>; +defm S_RNDNE_F32 : SOP1_Real_gfx11_gfx12<0x063>; +defm S_CVT_F32_I32 : SOP1_Real_gfx11_gfx12<0x064>; +defm S_CVT_F32_U32 : SOP1_Real_gfx11_gfx12<0x065>; +defm S_CVT_I32_F32 : SOP1_Real_gfx11_gfx12<0x066>; +defm S_CVT_U32_F32 : SOP1_Real_gfx11_gfx12<0x067>; +defm S_CVT_F16_F32 : SOP1_Real_gfx11_gfx12<0x068>; +defm S_CVT_F32_F16 : SOP1_Real_gfx11_gfx12<0x069>; +defm S_CVT_HI_F32_F16 : SOP1_Real_gfx11_gfx12<0x06a>; +defm S_CEIL_F16 : SOP1_Real_gfx11_gfx12<0x06b>; +defm S_FLOOR_F16 : SOP1_Real_gfx11_gfx12<0x06c>; +defm S_TRUNC_F16 : SOP1_Real_gfx11_gfx12<0x06d>; +defm S_RNDNE_F16 : SOP1_Real_gfx11_gfx12<0x06e>; //===----------------------------------------------------------------------===// // SOP1 - GFX10. @@ -1587,8 +2054,8 @@ multiclass SOP1_Real_gfx10<bits<8> op> { Select_gfx10<ps.Mnemonic>; } -multiclass SOP1_Real_gfx10_gfx11<bits<8> op> : - SOP1_Real_gfx10<op>, SOP1_Real_gfx11<op>; +multiclass SOP1_Real_gfx10_gfx11_gfx12<bits<8> op> : + SOP1_Real_gfx10<op>, SOP1_Real_gfx11_gfx12<op>; defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>; defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x038>; @@ -1623,8 +2090,8 @@ multiclass SOP1_Real_gfx6_gfx7<bits<8> op> { multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> : SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>; -multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op> : - SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10_gfx11<op>; +multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op> : + SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10_gfx11_gfx12<op>; defm S_CBRANCH_JOIN : SOP1_Real_gfx6_gfx7<0x032>; @@ -1667,7 +2134,7 @@ defm S_ANDN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x027>; defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x028>; defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x029>; defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02a>; -defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx11<0x02b>; +defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02b>; defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02c>; defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02d>; defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02e>; @@ -1677,63 +2144,142 @@ defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x031>; defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>; //===----------------------------------------------------------------------===// -// SOP2 - GFX11. 
+// SOP2 - GFX12 +//===----------------------------------------------------------------------===// + +multiclass SOP2_Real_gfx12<bits<7> op> { + def _gfx12 : SOP2_Real32<op, !cast<SOP2_Pseudo>(NAME)>, + Select_gfx12<!cast<SOP2_Pseudo>(NAME).Mnemonic>; +} + +multiclass SOP2_Real_Renamed_gfx12<bits<7> op, SOP2_Pseudo backing_pseudo, string real_name> { + def _gfx12 : SOP2_Real32<op, backing_pseudo, real_name>, + Select_gfx12<backing_pseudo.Mnemonic>, + MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX12Plus]>; +} + +defm S_MIN_NUM_F32 : SOP2_Real_Renamed_gfx12<0x042, S_MIN_F32, "s_min_num_f32">; +defm S_MAX_NUM_F32 : SOP2_Real_Renamed_gfx12<0x043, S_MAX_F32, "s_max_num_f32">; +defm S_MIN_NUM_F16 : SOP2_Real_Renamed_gfx12<0x04b, S_MIN_F16, "s_min_num_f16">; +defm S_MAX_NUM_F16 : SOP2_Real_Renamed_gfx12<0x04c, S_MAX_F16, "s_max_num_f16">; +defm S_MINIMUM_F32 : SOP2_Real_gfx12<0x04f>; +defm S_MAXIMUM_F32 : SOP2_Real_gfx12<0x050>; +defm S_MINIMUM_F16 : SOP2_Real_gfx12<0x051>; +defm S_MAXIMUM_F16 : SOP2_Real_gfx12<0x052>; + +defm S_ADD_CO_U32 : SOP2_Real_Renamed_gfx12<0x000, S_ADD_U32, "s_add_co_u32">; +defm S_SUB_CO_U32 : SOP2_Real_Renamed_gfx12<0x001, S_SUB_U32, "s_sub_co_u32">; +defm S_ADD_CO_I32 : SOP2_Real_Renamed_gfx12<0x002, S_ADD_I32, "s_add_co_i32">; +defm S_SUB_CO_I32 : SOP2_Real_Renamed_gfx12<0x003, S_SUB_I32, "s_sub_co_i32">; +defm S_ADD_CO_CI_U32 : SOP2_Real_Renamed_gfx12<0x004, S_ADDC_U32, "s_add_co_ci_u32">; +defm S_SUB_CO_CI_U32 : SOP2_Real_Renamed_gfx12<0x005, S_SUBB_U32, "s_sub_co_ci_u32">; + +//===----------------------------------------------------------------------===// +// SOP2 - GFX11, GFX12. //===----------------------------------------------------------------------===// multiclass SOP2_Real_gfx11<bits<7> op> { - def _gfx11 : SOP2_Real<op, !cast<SOP2_Pseudo>(NAME)>, + def _gfx11 : SOP2_Real32<op, !cast<SOP2_Pseudo>(NAME)>, Select_gfx11<!cast<SOP2_Pseudo>(NAME).Mnemonic>; } multiclass SOP2_Real_Renamed_gfx11<bits<7> op, SOP2_Pseudo backing_pseudo, string real_name> { - def _gfx11 : SOP2_Real<op, backing_pseudo, real_name>, + def _gfx11 : SOP2_Real32<op, backing_pseudo, real_name>, Select_gfx11<backing_pseudo.Mnemonic>, - MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>; -} - -defm S_ABSDIFF_I32 : SOP2_Real_gfx11<0x006>; -defm S_LSHL_B32 : SOP2_Real_gfx11<0x008>; -defm S_LSHL_B64 : SOP2_Real_gfx11<0x009>; -defm S_LSHR_B32 : SOP2_Real_gfx11<0x00a>; -defm S_LSHR_B64 : SOP2_Real_gfx11<0x00b>; -defm S_ASHR_I32 : SOP2_Real_gfx11<0x00c>; -defm S_ASHR_I64 : SOP2_Real_gfx11<0x00d>; -defm S_LSHL1_ADD_U32 : SOP2_Real_gfx11<0x00e>; -defm S_LSHL2_ADD_U32 : SOP2_Real_gfx11<0x00f>; -defm S_LSHL3_ADD_U32 : SOP2_Real_gfx11<0x010>; -defm S_LSHL4_ADD_U32 : SOP2_Real_gfx11<0x011>; -defm S_MIN_I32 : SOP2_Real_gfx11<0x012>; -defm S_MIN_U32 : SOP2_Real_gfx11<0x013>; -defm S_MAX_I32 : SOP2_Real_gfx11<0x014>; -defm S_MAX_U32 : SOP2_Real_gfx11<0x015>; -defm S_AND_B32 : SOP2_Real_gfx11<0x016>; -defm S_AND_B64 : SOP2_Real_gfx11<0x017>; -defm S_OR_B32 : SOP2_Real_gfx11<0x018>; -defm S_OR_B64 : SOP2_Real_gfx11<0x019>; -defm S_XOR_B32 : SOP2_Real_gfx11<0x01a>; -defm S_XOR_B64 : SOP2_Real_gfx11<0x01b>; -defm S_NAND_B32 : SOP2_Real_gfx11<0x01c>; -defm S_NAND_B64 : SOP2_Real_gfx11<0x01d>; -defm S_NOR_B32 : SOP2_Real_gfx11<0x01e>; -defm S_NOR_B64 : SOP2_Real_gfx11<0x01f>; -defm S_XNOR_B32 : SOP2_Real_gfx11<0x020>; -defm S_XNOR_B64 : SOP2_Real_gfx11<0x021>; -defm S_AND_NOT1_B32 : SOP2_Real_Renamed_gfx11<0x022, S_ANDN2_B32, "s_and_not1_b32">; -defm S_AND_NOT1_B64 : 
SOP2_Real_Renamed_gfx11<0x023, S_ANDN2_B64, "s_and_not1_b64">; -defm S_OR_NOT1_B32 : SOP2_Real_Renamed_gfx11<0x024, S_ORN2_B32, "s_or_not1_b32">; -defm S_OR_NOT1_B64 : SOP2_Real_Renamed_gfx11<0x025, S_ORN2_B64, "s_or_not1_b64">; -defm S_BFE_U32 : SOP2_Real_gfx11<0x026>; -defm S_BFE_I32 : SOP2_Real_gfx11<0x027>; -defm S_BFE_U64 : SOP2_Real_gfx11<0x028>; -defm S_BFE_I64 : SOP2_Real_gfx11<0x029>; -defm S_BFM_B32 : SOP2_Real_gfx11<0x02a>; -defm S_BFM_B64 : SOP2_Real_gfx11<0x02b>; -defm S_MUL_I32 : SOP2_Real_gfx11<0x02c>; -defm S_MUL_HI_U32 : SOP2_Real_gfx11<0x02d>; -defm S_MUL_HI_I32 : SOP2_Real_gfx11<0x02e>; -defm S_CSELECT_B32 : SOP2_Real_gfx11<0x030>; -defm S_CSELECT_B64 : SOP2_Real_gfx11<0x031>; -defm S_PACK_HL_B32_B16 : SOP2_Real_gfx11<0x035>; + MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Only]>; +} + +multiclass SOP2_Real_gfx11_gfx12<bits<7> op> : + SOP2_Real_gfx11<op>, SOP2_Real_gfx12<op>; + +multiclass SOP2_Real_Renamed_gfx11_gfx12<bits<8> op, SOP2_Pseudo backing_pseudo, string real_name> : + SOP2_Real_Renamed_gfx11<op, backing_pseudo, real_name>, + SOP2_Real_Renamed_gfx12<op, backing_pseudo, real_name>; + +defm S_ABSDIFF_I32 : SOP2_Real_gfx11_gfx12<0x006>; +defm S_LSHL_B32 : SOP2_Real_gfx11_gfx12<0x008>; +defm S_LSHL_B64 : SOP2_Real_gfx11_gfx12<0x009>; +defm S_LSHR_B32 : SOP2_Real_gfx11_gfx12<0x00a>; +defm S_LSHR_B64 : SOP2_Real_gfx11_gfx12<0x00b>; +defm S_ASHR_I32 : SOP2_Real_gfx11_gfx12<0x00c>; +defm S_ASHR_I64 : SOP2_Real_gfx11_gfx12<0x00d>; +defm S_LSHL1_ADD_U32 : SOP2_Real_gfx11_gfx12<0x00e>; +defm S_LSHL2_ADD_U32 : SOP2_Real_gfx11_gfx12<0x00f>; +defm S_LSHL3_ADD_U32 : SOP2_Real_gfx11_gfx12<0x010>; +defm S_LSHL4_ADD_U32 : SOP2_Real_gfx11_gfx12<0x011>; +defm S_MIN_I32 : SOP2_Real_gfx11_gfx12<0x012>; +defm S_MIN_U32 : SOP2_Real_gfx11_gfx12<0x013>; +defm S_MAX_I32 : SOP2_Real_gfx11_gfx12<0x014>; +defm S_MAX_U32 : SOP2_Real_gfx11_gfx12<0x015>; +defm S_AND_B32 : SOP2_Real_gfx11_gfx12<0x016>; +defm S_AND_B64 : SOP2_Real_gfx11_gfx12<0x017>; +defm S_OR_B32 : SOP2_Real_gfx11_gfx12<0x018>; +defm S_OR_B64 : SOP2_Real_gfx11_gfx12<0x019>; +defm S_XOR_B32 : SOP2_Real_gfx11_gfx12<0x01a>; +defm S_XOR_B64 : SOP2_Real_gfx11_gfx12<0x01b>; +defm S_NAND_B32 : SOP2_Real_gfx11_gfx12<0x01c>; +defm S_NAND_B64 : SOP2_Real_gfx11_gfx12<0x01d>; +defm S_NOR_B32 : SOP2_Real_gfx11_gfx12<0x01e>; +defm S_NOR_B64 : SOP2_Real_gfx11_gfx12<0x01f>; +defm S_XNOR_B32 : SOP2_Real_gfx11_gfx12<0x020>; +defm S_XNOR_B64 : SOP2_Real_gfx11_gfx12<0x021>; +defm S_AND_NOT1_B32 : SOP2_Real_Renamed_gfx11_gfx12<0x022, S_ANDN2_B32, "s_and_not1_b32">; +defm S_AND_NOT1_B64 : SOP2_Real_Renamed_gfx11_gfx12<0x023, S_ANDN2_B64, "s_and_not1_b64">; +defm S_OR_NOT1_B32 : SOP2_Real_Renamed_gfx11_gfx12<0x024, S_ORN2_B32, "s_or_not1_b32">; +defm S_OR_NOT1_B64 : SOP2_Real_Renamed_gfx11_gfx12<0x025, S_ORN2_B64, "s_or_not1_b64">; +defm S_BFE_U32 : SOP2_Real_gfx11_gfx12<0x026>; +defm S_BFE_I32 : SOP2_Real_gfx11_gfx12<0x027>; +defm S_BFE_U64 : SOP2_Real_gfx11_gfx12<0x028>; +defm S_BFE_I64 : SOP2_Real_gfx11_gfx12<0x029>; +defm S_BFM_B32 : SOP2_Real_gfx11_gfx12<0x02a>; +defm S_BFM_B64 : SOP2_Real_gfx11_gfx12<0x02b>; +defm S_MUL_I32 : SOP2_Real_gfx11_gfx12<0x02c>; +defm S_MUL_HI_U32 : SOP2_Real_gfx11_gfx12<0x02d>; +defm S_MUL_HI_I32 : SOP2_Real_gfx11_gfx12<0x02e>; +defm S_CSELECT_B32 : SOP2_Real_gfx11_gfx12<0x030>; +defm S_CSELECT_B64 : SOP2_Real_gfx11_gfx12<0x031>; +defm S_PACK_HL_B32_B16 : SOP2_Real_gfx11_gfx12<0x035>; +defm S_ADD_NC_U64 : SOP2_Real_Renamed_gfx12<0x053, S_ADD_U64, "s_add_nc_u64">; +defm S_SUB_NC_U64 : 
SOP2_Real_Renamed_gfx12<0x054, S_SUB_U64, "s_sub_nc_u64">; +defm S_MUL_U64 : SOP2_Real_gfx12<0x055>; + +//===----------------------------------------------------------------------===// +// SOP2 - GFX1150, GFX12 +//===----------------------------------------------------------------------===// + +multiclass SOP2_Real_FMAK_gfx12<bits<7> op> { + def _gfx12 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>, + Select_gfx12<!cast<SOP2_Pseudo>(NAME).Mnemonic>; +} + +multiclass SOP2_Real_FMAK_gfx11<bits<7> op> { + def _gfx11 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>, + Select_gfx11<!cast<SOP2_Pseudo>(NAME).Mnemonic>; +} + +multiclass SOP2_Real_FMAK_gfx11_gfx12<bits<7> op> : + SOP2_Real_FMAK_gfx11<op>, SOP2_Real_FMAK_gfx12<op>; + +defm S_ADD_F32 : SOP2_Real_gfx11_gfx12<0x040>; +defm S_SUB_F32 : SOP2_Real_gfx11_gfx12<0x041>; +defm S_MUL_F32 : SOP2_Real_gfx11_gfx12<0x044>; +defm S_FMAAK_F32 : SOP2_Real_FMAK_gfx11_gfx12<0x045>; +defm S_FMAMK_F32 : SOP2_Real_FMAK_gfx11_gfx12<0x046>; +defm S_FMAC_F32 : SOP2_Real_gfx11_gfx12<0x047>; +defm S_CVT_PK_RTZ_F16_F32 : SOP2_Real_gfx11_gfx12<0x048>; +defm S_ADD_F16 : SOP2_Real_gfx11_gfx12<0x049>; +defm S_SUB_F16 : SOP2_Real_gfx11_gfx12<0x04a>; +defm S_MUL_F16 : SOP2_Real_gfx11_gfx12<0x04d>; +defm S_FMAC_F16 : SOP2_Real_gfx11_gfx12<0x04e>; + +//===----------------------------------------------------------------------===// +// SOP2 - GFX1150 +//===----------------------------------------------------------------------===// + +defm S_MIN_F32 : SOP2_Real_gfx11<0x042>; +defm S_MAX_F32 : SOP2_Real_gfx11<0x043>; +defm S_MIN_F16 : SOP2_Real_gfx11<0x04b>; +defm S_MAX_F16 : SOP2_Real_gfx11<0x04c>; //===----------------------------------------------------------------------===// // SOP2 - GFX10. @@ -1741,20 +2287,20 @@ defm S_PACK_HL_B32_B16 : SOP2_Real_gfx11<0x035>; multiclass SOP2_Real_gfx10<bits<7> op> { defvar ps = !cast<SOP2_Pseudo>(NAME); - def _gfx10 : SOP2_Real<op, ps>, + def _gfx10 : SOP2_Real32<op, ps>, Select_gfx10<ps.Mnemonic>; } -multiclass SOP2_Real_gfx10_gfx11<bits<7> op> : - SOP2_Real_gfx10<op>, SOP2_Real_gfx11<op>; +multiclass SOP2_Real_gfx10_gfx11_gfx12<bits<7> op> : + SOP2_Real_gfx10<op>, SOP2_Real_gfx11_gfx12<op>; defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>; defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10<0x02f>; defm S_LSHL3_ADD_U32 : SOP2_Real_gfx10<0x030>; defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10<0x031>; -defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10_gfx11<0x032>; -defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10_gfx11<0x033>; -defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10_gfx11<0x034>; +defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x032>; +defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x033>; +defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x034>; defm S_MUL_HI_U32 : SOP2_Real_gfx10<0x035>; defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>; @@ -1764,7 +2310,7 @@ defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>; multiclass SOP2_Real_gfx6_gfx7<bits<7> op> { defvar ps = !cast<SOP_Pseudo>(NAME); - def _gfx6_gfx7 : SOP2_Real<op, ps>, + def _gfx6_gfx7 : SOP2_Real32<op, ps>, Select_gfx6_gfx7<ps.Mnemonic>; } @@ -1772,7 +2318,10 @@ multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> : SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>; multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx11<bits<7> op> : - SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10_gfx11<op>; + SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>, SOP2_Real_gfx11<op>; + +multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<7> op> : + SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10_gfx11_gfx12<op>; defm S_CBRANCH_G_FORK : SOP2_Real_gfx6_gfx7<0x02b>; @@ 
-1820,29 +2369,52 @@ defm S_BFE_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x02a>; defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>; //===----------------------------------------------------------------------===// -// SOPK - GFX11. +// SOPK - GFX11, GFX12. //===----------------------------------------------------------------------===// +multiclass SOPK_Real32_gfx12<bits<5> op> { + def _gfx12 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>, + Select_gfx12<!cast<SOPK_Pseudo>(NAME).Mnemonic>; +} + +multiclass SOPK_Real32_Renamed_gfx12<bits<5> op, SOPK_Pseudo backing_pseudo, string real_name> { + def _gfx12 : SOPK_Real32<op, backing_pseudo, real_name>, + Select_gfx12<backing_pseudo.Mnemonic>, + MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX12Plus]>; +} + multiclass SOPK_Real32_gfx11<bits<5> op> { def _gfx11 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>, Select_gfx11<!cast<SOPK_Pseudo>(NAME).Mnemonic>; } +multiclass SOPK_Real64_gfx12<bits<5> op> { + def _gfx12 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>, + Select_gfx12<!cast<SOPK_Pseudo>(NAME).Mnemonic>; +} + multiclass SOPK_Real64_gfx11<bits<5> op> { def _gfx11 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>, Select_gfx11<!cast<SOPK_Pseudo>(NAME).Mnemonic>; } -defm S_GETREG_B32 : SOPK_Real32_gfx11<0x011>; -defm S_SETREG_B32 : SOPK_Real32_gfx11<0x012>; -defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx11<0x013>; -defm S_CALL_B64 : SOPK_Real32_gfx11<0x014>; +multiclass SOPK_Real32_gfx11_gfx12<bits<5> op> : + SOPK_Real32_gfx11<op>, SOPK_Real32_gfx12<op>; + +multiclass SOPK_Real64_gfx11_gfx12<bits<5> op> : + SOPK_Real64_gfx11<op>, SOPK_Real64_gfx12<op>; + +defm S_ADDK_CO_I32 : SOPK_Real32_Renamed_gfx12<0x00f, S_ADDK_I32, "s_addk_co_i32">; +defm S_GETREG_B32 : SOPK_Real32_gfx11_gfx12<0x011>; +defm S_SETREG_B32 : SOPK_Real32_gfx11_gfx12<0x012>; +defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx11_gfx12<0x013>; +defm S_CALL_B64 : SOPK_Real32_gfx11_gfx12<0x014>; defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>; defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx11<0x017>; -defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11<0x018>; -defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11<0x019>; -defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11<0x01a>; -defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>; +defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11_gfx12<0x018>; +defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11_gfx12<0x019>; +defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11_gfx12<0x01a>; +defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11_gfx12<0x01b>; //===----------------------------------------------------------------------===// // SOPK - GFX10. 
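A side note on S_GETREG_B32/S_SETREG_B32, whose GFX11/GFX12 encodings are listed just above: the hwreg operand is a 16-bit immediate packing a hardware register id, a bit offset, and a field width. The following is a standalone sketch, not the in-tree helper, assuming the field layout from the public ISA documentation (id in bits 5:0, offset in bits 10:6, width-1 in bits 15:11) and taking HW_REG_MODE to be hardware register id 1, its position in the operand table later in this diff:

#include <cstdint>

// Pack hwreg(id, offset, width) into the SIMM16 operand of s_getreg_b32.
constexpr uint16_t encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
  return static_cast<uint16_t>(Id | (Offset << 6) | ((Width - 1) << 11));
}

// The GFX9+ get_fpmode pattern earlier in this file reads 24 bits of
// HW_REG_MODE starting at offset 0, which corresponds to this immediate:
static_assert(encodeHwreg(/*Id=*/1, /*Offset=*/0, /*Width=*/24) == 0xb801);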
@@ -1863,7 +2435,10 @@ multiclass SOPK_Real64_gfx10<bits<5> op> { multiclass SOPK_Real32_gfx10_gfx11<bits<5> op> : SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>; -defm S_VERSION : SOPK_Real32_gfx10_gfx11<0x001>; +multiclass SOPK_Real32_gfx10_gfx11_gfx12<bits<5> op> : + SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11_gfx12<op>; + +defm S_VERSION : SOPK_Real32_gfx10_gfx11_gfx12<0x001>; defm S_CALL_B64 : SOPK_Real32_gfx10<0x016>; defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>; defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>; @@ -1897,10 +2472,13 @@ multiclass SOPK_Real64_gfx6_gfx7_gfx10<bits<5> op> : multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11<bits<5> op> : SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10_gfx11<op>; +multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<bits<5> op> : + SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10_gfx11_gfx12<op>; + defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>; -defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x000>; -defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x002>; +defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x000>; +defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x002>; defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x003>; defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x004>; defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x005>; @@ -1914,21 +2492,48 @@ defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00c>; defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00d>; defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00e>; defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00f>; -defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x010>; +defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x010>; defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x012>; defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x013>; defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>; //===----------------------------------------------------------------------===// -// SOPP - GFX11 +// SOPP - GFX12 only. //===----------------------------------------------------------------------===// +multiclass SOPP_Real_32_gfx12<bits<7> op> { + def _gfx12 : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>, + Select_gfx12<!cast<SOPP_Pseudo>(NAME).Mnemonic>, + SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx12">; +} + +multiclass SOPP_Real_32_Renamed_gfx12<bits<7> op, SOPP_Pseudo backing_pseudo, string real_name> { + def _gfx12 : SOPP_Real_32<op, backing_pseudo, real_name>, + Select_gfx12<backing_pseudo.Mnemonic>, + MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX12Plus]>; +} + +defm S_WAIT_ALU : SOPP_Real_32_Renamed_gfx12<0x008, S_WAITCNT_DEPCTR, "s_wait_alu">; +defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>; +defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX11, GFX12. 
+//===----------------------------------------------------------------------===// + + multiclass SOPP_Real_32_gfx11<bits<7> op> { def _gfx11 : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>, Select_gfx11<!cast<SOPP_Pseudo>(NAME).Mnemonic>, SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx11">; } +multiclass SOPP_Real_64_gfx12<bits<7> op> { + def _gfx12 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>, + Select_gfx12<!cast<SOPP_Pseudo>(NAME).Mnemonic>, + SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx12">; +} + multiclass SOPP_Real_64_gfx11<bits<7> op> { def _gfx11 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>, Select_gfx11<!cast<SOPP_Pseudo>(NAME).Mnemonic>, @@ -1938,7 +2543,22 @@ multiclass SOPP_Real_64_gfx11<bits<7> op> { multiclass SOPP_Real_32_Renamed_gfx11<bits<7> op, SOPP_Pseudo backing_pseudo, string real_name> { def _gfx11 : SOPP_Real_32<op, backing_pseudo, real_name>, Select_gfx11<backing_pseudo.Mnemonic>, - MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>; + MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Only]>; +} + +multiclass SOPP_Real_32_gfx11_gfx12<bits<7> op> : + SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op>; + +multiclass SOPP_Real_64_gfx11_gfx12<bits<7> op> : + SOPP_Real_64_gfx11<op>, SOPP_Real_64_gfx12<op>; + +multiclass SOPP_Real_32_Renamed_gfx11_gfx12<bits<7> op, SOPP_Pseudo backing_pseudo, string real_name> : + SOPP_Real_32_Renamed_gfx11<op, backing_pseudo, real_name>, + SOPP_Real_32_Renamed_gfx12<op, backing_pseudo, real_name>; + +multiclass SOPP_Real_With_Relaxation_gfx12<bits<7> op> { + defm "" : SOPP_Real_32_gfx12<op>; + defm _pad_s_nop : SOPP_Real_64_gfx12<op>; } multiclass SOPP_Real_With_Relaxation_gfx11<bits<7> op> { @@ -1946,42 +2566,51 @@ multiclass SOPP_Real_With_Relaxation_gfx11<bits<7> op> { defm _pad_s_nop : SOPP_Real_64_gfx11<op>; } -defm S_SETKILL : SOPP_Real_32_gfx11<0x001>; -defm S_SETHALT : SOPP_Real_32_gfx11<0x002>; -defm S_SLEEP : SOPP_Real_32_gfx11<0x003>; -defm S_SET_INST_PREFETCH_DISTANCE : SOPP_Real_32_Renamed_gfx11<0x004, S_INST_PREFETCH, "s_set_inst_prefetch_distance">; -defm S_CLAUSE : SOPP_Real_32_gfx11<0x005>; -defm S_DELAY_ALU : SOPP_Real_32_gfx11<0x007>; +multiclass SOPP_Real_With_Relaxation_gfx11_gfx12<bits<7>op> : + SOPP_Real_With_Relaxation_gfx11<op>, SOPP_Real_With_Relaxation_gfx12<op>; + +defm S_SETKILL : SOPP_Real_32_gfx11_gfx12<0x001>; +defm S_SETHALT : SOPP_Real_32_gfx11_gfx12<0x002>; +defm S_SLEEP : SOPP_Real_32_gfx11_gfx12<0x003>; +defm S_SET_INST_PREFETCH_DISTANCE : SOPP_Real_32_Renamed_gfx11_gfx12<0x004, S_INST_PREFETCH, "s_set_inst_prefetch_distance">; +defm S_CLAUSE : SOPP_Real_32_gfx11_gfx12<0x005>; +defm S_DELAY_ALU : SOPP_Real_32_gfx11_gfx12<0x007>; defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx11<0x008>; -defm S_WAITCNT : SOPP_Real_32_gfx11<0x009>; -defm S_WAIT_IDLE : SOPP_Real_32_gfx11<0x00a>; -defm S_WAIT_EVENT : SOPP_Real_32_gfx11<0x00b>; -defm S_TRAP : SOPP_Real_32_gfx11<0x010>; -defm S_ROUND_MODE : SOPP_Real_32_gfx11<0x011>; -defm S_DENORM_MODE : SOPP_Real_32_gfx11<0x012>; -defm S_BRANCH : SOPP_Real_With_Relaxation_gfx11<0x020>; -defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx11<0x021>; -defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx11<0x022>; -defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx11<0x023>; -defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx11<0x024>; -defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx11<0x025>; -defm S_CBRANCH_EXECNZ 
: SOPP_Real_With_Relaxation_gfx11<0x026>; +defm S_WAITCNT : SOPP_Real_32_gfx11_gfx12<0x009>; +defm S_WAIT_IDLE : SOPP_Real_32_gfx11_gfx12<0x00a>; +defm S_WAIT_EVENT : SOPP_Real_32_gfx11_gfx12<0x00b>; +defm S_TRAP : SOPP_Real_32_gfx11_gfx12<0x010>; +defm S_ROUND_MODE : SOPP_Real_32_gfx11_gfx12<0x011>; +defm S_DENORM_MODE : SOPP_Real_32_gfx11_gfx12<0x012>; +defm S_BRANCH : SOPP_Real_With_Relaxation_gfx11_gfx12<0x020>; +defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x021>; +defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x022>; +defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x023>; +defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x024>; +defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x025>; +defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x026>; defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx11<0x027>; defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx11<0x028>; defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx11<0x029>; defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx11<0x02a>; -defm S_ENDPGM : SOPP_Real_32_gfx11<0x030>; -defm S_ENDPGM_SAVED : SOPP_Real_32_gfx11<0x031>; -defm S_WAKEUP : SOPP_Real_32_gfx11<0x034>; -defm S_SETPRIO : SOPP_Real_32_gfx11<0x035>; -defm S_SENDMSG : SOPP_Real_32_gfx11<0x036>; -defm S_SENDMSGHALT : SOPP_Real_32_gfx11<0x037>; -defm S_INCPERFLEVEL : SOPP_Real_32_gfx11<0x038>; -defm S_DECPERFLEVEL : SOPP_Real_32_gfx11<0x039>; -defm S_TTRACEDATA : SOPP_Real_32_gfx11<0x03a>; -defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx11<0x03b>; -defm S_ICACHE_INV : SOPP_Real_32_gfx11<0x03c>; -defm S_BARRIER : SOPP_Real_32_gfx11<0x03d>; +defm S_ENDPGM : SOPP_Real_32_gfx11_gfx12<0x030>; +defm S_ENDPGM_SAVED : SOPP_Real_32_gfx11_gfx12<0x031>; +defm S_WAKEUP : SOPP_Real_32_gfx11_gfx12<0x034>; +defm S_SETPRIO : SOPP_Real_32_gfx11_gfx12<0x035>; +defm S_SENDMSG : SOPP_Real_32_gfx11_gfx12<0x036>; +defm S_SENDMSGHALT : SOPP_Real_32_gfx11_gfx12<0x037>; +defm S_INCPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x038>; +defm S_DECPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x039>; +defm S_TTRACEDATA : SOPP_Real_32_gfx11_gfx12<0x03a>; +defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx11_gfx12<0x03b>; +defm S_ICACHE_INV : SOPP_Real_32_gfx11_gfx12<0x03c>; +defm S_BARRIER : SOPP_Real_32_gfx11_gfx12<0x03d>; + +//===----------------------------------------------------------------------===// +// SOPP - GFX1150, GFX12. 
+//===----------------------------------------------------------------------===// + +defm S_SINGLEUSE_VDST : SOPP_Real_32_gfx11_gfx12<0x013>; //===----------------------------------------------------------------------===// // SOPP - GFX6, GFX7, GFX8, GFX9, GFX10 @@ -2017,11 +2646,11 @@ multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<bits<7> op> : multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_32_gfx10<op>; -multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<bits<7> op> : - SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op>, SOPP_Real_32_gfx11<op>; +multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<bits<7> op> : + SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op>, SOPP_Real_32_gfx11_gfx12<op>; -multiclass SOPP_Real_32_gfx10_gfx11<bits<7> op> : - SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx11<op>; +multiclass SOPP_Real_32_gfx10_gfx11_gfx12<bits<7> op> : + SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx11_gfx12<op>; //64 bit encodings, for Relaxation multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op> { @@ -2054,8 +2683,8 @@ multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<bits<7> op> : multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_64_gfx10<op>; -multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<bits<7> op> : - SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op>, SOPP_Real_64_gfx11<op>; +multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<bits<7> op> : + SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op>, SOPP_Real_64_gfx11_gfx12<op>; //relaxation for insts with no operands not implemented multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> { @@ -2063,7 +2692,7 @@ multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> { defm _pad_s_nop : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op>; } -defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<0x000>; +defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x000>; defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x001>; defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>; defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>; @@ -2083,7 +2712,7 @@ defm S_ENDPGM_SAVED : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x01B>; defm S_SET_GPR_IDX_OFF : SOPP_Real_32_gfx8_gfx9<0x01c>; defm S_SET_GPR_IDX_MODE : SOPP_Real_32_gfx8_gfx9<0x01d>; defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx8_gfx9_gfx10<0x01e>; -defm S_CODE_END : SOPP_Real_32_gfx10_gfx11<0x01f>; +defm S_CODE_END : SOPP_Real_32_gfx10_gfx11_gfx12<0x01f>; defm S_INST_PREFETCH : SOPP_Real_32_gfx10<0x020>; defm S_CLAUSE : SOPP_Real_32_gfx10<0x021>; defm S_WAIT_IDLE : SOPP_Real_32_gfx10<0x022>; @@ -2107,32 +2736,74 @@ defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_ } //===----------------------------------------------------------------------===// -// SOPC - GFX11 +// SOPC - GFX11, GFX12. 
//===----------------------------------------------------------------------===// +multiclass SOPC_Real_gfx12<bits<7> op> { + def _gfx12 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>, + Select_gfx12<!cast<SOPC_Pseudo>(NAME).Mnemonic>; +} + multiclass SOPC_Real_gfx11<bits<7> op> { def _gfx11 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>, Select_gfx11<!cast<SOPC_Pseudo>(NAME).Mnemonic>; } -defm S_CMP_EQ_I32 : SOPC_Real_gfx11<0x00>; -defm S_CMP_LG_I32 : SOPC_Real_gfx11<0x01>; -defm S_CMP_GT_I32 : SOPC_Real_gfx11<0x02>; -defm S_CMP_GE_I32 : SOPC_Real_gfx11<0x03>; -defm S_CMP_LT_I32 : SOPC_Real_gfx11<0x04>; -defm S_CMP_LE_I32 : SOPC_Real_gfx11<0x05>; -defm S_CMP_EQ_U32 : SOPC_Real_gfx11<0x06>; -defm S_CMP_LG_U32 : SOPC_Real_gfx11<0x07>; -defm S_CMP_GT_U32 : SOPC_Real_gfx11<0x08>; -defm S_CMP_GE_U32 : SOPC_Real_gfx11<0x09>; -defm S_CMP_LT_U32 : SOPC_Real_gfx11<0x0a>; -defm S_CMP_LE_U32 : SOPC_Real_gfx11<0x0b>; -defm S_BITCMP0_B32 : SOPC_Real_gfx11<0x0c>; -defm S_BITCMP1_B32 : SOPC_Real_gfx11<0x0d>; -defm S_BITCMP0_B64 : SOPC_Real_gfx11<0x0e>; -defm S_BITCMP1_B64 : SOPC_Real_gfx11<0x0f>; -defm S_CMP_EQ_U64 : SOPC_Real_gfx11<0x10>; -defm S_CMP_LG_U64 : SOPC_Real_gfx11<0x11>; +multiclass SOPC_Real_gfx11_gfx12<bits<7> op> : + SOPC_Real_gfx11<op>, SOPC_Real_gfx12<op>; + +defm S_CMP_EQ_I32 : SOPC_Real_gfx11_gfx12<0x00>; +defm S_CMP_LG_I32 : SOPC_Real_gfx11_gfx12<0x01>; +defm S_CMP_GT_I32 : SOPC_Real_gfx11_gfx12<0x02>; +defm S_CMP_GE_I32 : SOPC_Real_gfx11_gfx12<0x03>; +defm S_CMP_LT_I32 : SOPC_Real_gfx11_gfx12<0x04>; +defm S_CMP_LE_I32 : SOPC_Real_gfx11_gfx12<0x05>; +defm S_CMP_EQ_U32 : SOPC_Real_gfx11_gfx12<0x06>; +defm S_CMP_LG_U32 : SOPC_Real_gfx11_gfx12<0x07>; +defm S_CMP_GT_U32 : SOPC_Real_gfx11_gfx12<0x08>; +defm S_CMP_GE_U32 : SOPC_Real_gfx11_gfx12<0x09>; +defm S_CMP_LT_U32 : SOPC_Real_gfx11_gfx12<0x0a>; +defm S_CMP_LE_U32 : SOPC_Real_gfx11_gfx12<0x0b>; +defm S_BITCMP0_B32 : SOPC_Real_gfx11_gfx12<0x0c>; +defm S_BITCMP1_B32 : SOPC_Real_gfx11_gfx12<0x0d>; +defm S_BITCMP0_B64 : SOPC_Real_gfx11_gfx12<0x0e>; +defm S_BITCMP1_B64 : SOPC_Real_gfx11_gfx12<0x0f>; +defm S_CMP_EQ_U64 : SOPC_Real_gfx11_gfx12<0x10>; +defm S_CMP_LG_U64 : SOPC_Real_gfx11_gfx12<0x11>; + +//===----------------------------------------------------------------------===// +// SOPC - GFX1150, GFX12 +//===----------------------------------------------------------------------===// + +defm S_CMP_LT_F32 : SOPC_Real_gfx11_gfx12<0x41>; +defm S_CMP_EQ_F32 : SOPC_Real_gfx11_gfx12<0x42>; +defm S_CMP_LE_F32 : SOPC_Real_gfx11_gfx12<0x43>; +defm S_CMP_GT_F32 : SOPC_Real_gfx11_gfx12<0x44>; +defm S_CMP_LG_F32 : SOPC_Real_gfx11_gfx12<0x45>; +defm S_CMP_GE_F32 : SOPC_Real_gfx11_gfx12<0x46>; +defm S_CMP_O_F32 : SOPC_Real_gfx11_gfx12<0x47>; +defm S_CMP_U_F32 : SOPC_Real_gfx11_gfx12<0x48>; +defm S_CMP_NGE_F32 : SOPC_Real_gfx11_gfx12<0x49>; +defm S_CMP_NLG_F32 : SOPC_Real_gfx11_gfx12<0x4a>; +defm S_CMP_NGT_F32 : SOPC_Real_gfx11_gfx12<0x4b>; +defm S_CMP_NLE_F32 : SOPC_Real_gfx11_gfx12<0x4c>; +defm S_CMP_NEQ_F32 : SOPC_Real_gfx11_gfx12<0x4d>; +defm S_CMP_NLT_F32 : SOPC_Real_gfx11_gfx12<0x4e>; + +defm S_CMP_LT_F16 : SOPC_Real_gfx11_gfx12<0x51>; +defm S_CMP_EQ_F16 : SOPC_Real_gfx11_gfx12<0x52>; +defm S_CMP_LE_F16 : SOPC_Real_gfx11_gfx12<0x53>; +defm S_CMP_GT_F16 : SOPC_Real_gfx11_gfx12<0x54>; +defm S_CMP_LG_F16 : SOPC_Real_gfx11_gfx12<0x55>; +defm S_CMP_GE_F16 : SOPC_Real_gfx11_gfx12<0x56>; +defm S_CMP_O_F16 : SOPC_Real_gfx11_gfx12<0x57>; +defm S_CMP_U_F16 : SOPC_Real_gfx11_gfx12<0x58>; +defm S_CMP_NGE_F16 : SOPC_Real_gfx11_gfx12<0x59>; +defm S_CMP_NLG_F16 : 
SOPC_Real_gfx11_gfx12<0x5a>; +defm S_CMP_NGT_F16 : SOPC_Real_gfx11_gfx12<0x5b>; +defm S_CMP_NLE_F16 : SOPC_Real_gfx11_gfx12<0x5c>; +defm S_CMP_NEQ_F16 : SOPC_Real_gfx11_gfx12<0x5d>; +defm S_CMP_NLT_F16 : SOPC_Real_gfx11_gfx12<0x5e>; //===----------------------------------------------------------------------===// // SOPC - GFX6, GFX7, GFX8, GFX9, GFX10 @@ -2194,9 +2865,8 @@ class SOP1_Real_vi<bits<8> op, SOP1_Pseudo ps> : SOP1_Real<op, ps>, Select_vi<ps.Mnemonic>; - class SOP2_Real_vi<bits<7> op, SOP2_Pseudo ps> : - SOP2_Real<op, ps>, + SOP2_Real32<op, ps>, Select_vi<ps.Mnemonic>; class SOPK_Real_vi<bits<5> op, SOPK_Pseudo ps> : diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index ce40d82021cf..23434d2de0fc 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -36,14 +36,15 @@ namespace SendMsg { // Disable lint checking for this block since it makes the table unreadable. // NOLINTBEGIN +// clang-format off const CustomOperand<const MCSubtargetInfo &> Msg[] = { {{""}}, {{"MSG_INTERRUPT"}, ID_INTERRUPT}, {{"MSG_GS"}, ID_GS_PreGFX11, isNotGFX11Plus}, {{"MSG_GS_DONE"}, ID_GS_DONE_PreGFX11, isNotGFX11Plus}, {{"MSG_SAVEWAVE"}, ID_SAVEWAVE, isGFX8_GFX9_GFX10}, - {{"MSG_STALL_WAVE_GEN"}, ID_STALL_WAVE_GEN, isGFX9Plus}, - {{"MSG_HALT_WAVES"}, ID_HALT_WAVES, isGFX9Plus}, + {{"MSG_STALL_WAVE_GEN"}, ID_STALL_WAVE_GEN, isGFX9_GFX10_GFX11}, + {{"MSG_HALT_WAVES"}, ID_HALT_WAVES, isGFX9_GFX10_GFX11}, {{"MSG_ORDERED_PS_DONE"}, ID_ORDERED_PS_DONE, isGFX9_GFX10}, {{"MSG_EARLY_PRIM_DEALLOC"}, ID_EARLY_PRIM_DEALLOC, isGFX9_GFX10}, {{"MSG_GS_ALLOC_REQ"}, ID_GS_ALLOC_REQ, isGFX9Plus}, @@ -59,7 +60,9 @@ const CustomOperand<const MCSubtargetInfo &> Msg[] = { {{"MSG_RTN_GET_REALTIME"}, ID_RTN_GET_REALTIME, isGFX11Plus}, {{"MSG_RTN_SAVE_WAVE"}, ID_RTN_SAVE_WAVE, isGFX11Plus}, {{"MSG_RTN_GET_TBA"}, ID_RTN_GET_TBA, isGFX11Plus}, + {{"MSG_RTN_GET_SE_AID_ID"}, ID_RTN_GET_SE_AID_ID, isGFX12Plus}, }; +// clang-format on // NOLINTEND const int MSG_SIZE = static_cast<int>( @@ -87,41 +90,56 @@ namespace Hwreg { // Disable lint checking for this block since it makes the table unreadable. 
// NOLINTBEGIN +// clang-format off const CustomOperand<const MCSubtargetInfo &> Opr[] = { {{""}}, {{"HW_REG_MODE"}, ID_MODE}, {{"HW_REG_STATUS"}, ID_STATUS}, - {{"HW_REG_TRAPSTS"}, ID_TRAPSTS}, + {{"HW_REG_TRAPSTS"}, ID_TRAPSTS, isNotGFX12Plus}, {{"HW_REG_HW_ID"}, ID_HW_ID, isNotGFX10Plus}, {{"HW_REG_GPR_ALLOC"}, ID_GPR_ALLOC}, {{"HW_REG_LDS_ALLOC"}, ID_LDS_ALLOC}, {{"HW_REG_IB_STS"}, ID_IB_STS}, {{""}}, {{""}}, + {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx12, isGFX12Plus}, + {{"HW_REG_PERF_SNAPSHOT_PC_LO"}, ID_PERF_SNAPSHOT_PC_LO_gfx12, isGFX12Plus}, + {{"HW_REG_PERF_SNAPSHOT_PC_HI"}, ID_PERF_SNAPSHOT_PC_HI_gfx12, isGFX12Plus}, {{""}}, {{""}}, - {{""}}, - {{""}}, - {{""}}, - {{"HW_REG_SH_MEM_BASES"}, ID_MEM_BASES, isGFX9Plus}, + {{"HW_REG_SH_MEM_BASES"}, ID_MEM_BASES, isGFX9_GFX10_GFX11}, {{"HW_REG_TBA_LO"}, ID_TBA_LO, isGFX9_GFX10}, {{"HW_REG_TBA_HI"}, ID_TBA_HI, isGFX9_GFX10}, {{"HW_REG_TMA_LO"}, ID_TMA_LO, isGFX9_GFX10}, {{"HW_REG_TMA_HI"}, ID_TMA_HI, isGFX9_GFX10}, - {{"HW_REG_FLAT_SCR_LO"}, ID_FLAT_SCR_LO, isGFX10Plus}, - {{"HW_REG_FLAT_SCR_HI"}, ID_FLAT_SCR_HI, isGFX10Plus}, + {{"HW_REG_FLAT_SCR_LO"}, ID_FLAT_SCR_LO, isGFX10_GFX11}, + {{"HW_REG_FLAT_SCR_HI"}, ID_FLAT_SCR_HI, isGFX10_GFX11}, {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK, isGFX10Before1030}, {{"HW_REG_HW_ID1"}, ID_HW_ID1, isGFX10Plus}, {{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus}, {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10}, {{""}}, - {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA, isGFX11Plus}, + {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11}, {{""}}, - {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_BEncoding}, - - // Register numbers reused in GFX11+ - {{"HW_REG_PERF_SNAPSHOT_PC_LO"}, ID_PERF_SNAPSHOT_PC_LO, isGFX11Plus}, - {{"HW_REG_PERF_SNAPSHOT_PC_HI"}, ID_PERF_SNAPSHOT_PC_HI, isGFX11Plus}, + {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11}, + {{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus}, + {{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus}, + {{"HW_REG_DVGPR_ALLOC_HI"}, ID_DVGPR_ALLOC_HI, isGFX12Plus}, + + // Register numbers reused in GFX11 + {{"HW_REG_PERF_SNAPSHOT_PC_LO"}, ID_PERF_SNAPSHOT_PC_LO_gfx11, isGFX11}, + {{"HW_REG_PERF_SNAPSHOT_PC_HI"}, ID_PERF_SNAPSHOT_PC_HI_gfx11, isGFX11}, + + // Register numbers reused in GFX12+ + {{"HW_REG_STATE_PRIV"}, ID_STATE_PRIV, isGFX12Plus}, + {{"HW_REG_PERF_SNAPSHOT_DATA1"}, ID_PERF_SNAPSHOT_DATA1, isGFX12Plus}, + {{"HW_REG_PERF_SNAPSHOT_DATA2"}, ID_PERF_SNAPSHOT_DATA2, isGFX12Plus}, + {{"HW_REG_EXCP_FLAG_PRIV"}, ID_EXCP_FLAG_PRIV, isGFX12Plus}, + {{"HW_REG_EXCP_FLAG_USER"}, ID_EXCP_FLAG_USER, isGFX12Plus}, + {{"HW_REG_TRAP_CTRL"}, ID_TRAP_CTRL, isGFX12Plus}, + {{"HW_REG_SCRATCH_BASE_LO"}, ID_FLAT_SCR_LO, isGFX12Plus}, + {{"HW_REG_SCRATCH_BASE_HI"}, ID_FLAT_SCR_HI, isGFX12Plus}, + {{"HW_REG_SHADER_CYCLES_LO"}, ID_SHADER_CYCLES, isGFX12Plus}, // GFX940 specific registers {{"HW_REG_XCC_ID"}, ID_XCC_ID, isGFX940}, @@ -133,6 +151,7 @@ const CustomOperand<const MCSubtargetInfo &> Opr[] = { // Aliases {{"HW_REG_HW_ID"}, ID_HW_ID1, isGFX10}, }; +// clang-format on // NOLINTEND const int OPR_SIZE = static_cast<int>( diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 296ea18b2a8d..0f92a56237ac 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -12,7 +12,6 @@ #include 
"AMDKernelCodeT.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -119,15 +118,16 @@ namespace llvm { namespace AMDGPU { +/// \returns True if \p STI is AMDHSA. +bool isHsaAbi(const MCSubtargetInfo &STI) { + return STI.getTargetTriple().getOS() == Triple::AMDHSA; +} + std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) { if (STI && STI->getTargetTriple().getOS() != Triple::AMDHSA) return std::nullopt; switch (AmdhsaCodeObjectVersion) { - case 2: - return ELF::ELFABIVERSION_AMDGPU_HSA_V2; - case 3: - return ELF::ELFABIVERSION_AMDGPU_HSA_V3; case 4: return ELF::ELFABIVERSION_AMDGPU_HSA_V4; case 5: @@ -138,18 +138,6 @@ std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) { } } -bool isHsaAbiVersion2(const MCSubtargetInfo *STI) { - if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI)) - return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V2; - return false; -} - -bool isHsaAbiVersion3(const MCSubtargetInfo *STI) { - if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI)) - return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V3; - return false; -} - bool isHsaAbiVersion4(const MCSubtargetInfo *STI) { if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI)) return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V4; @@ -162,11 +150,6 @@ bool isHsaAbiVersion5(const MCSubtargetInfo *STI) { return false; } -bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI) { - return isHsaAbiVersion3(STI) || isHsaAbiVersion4(STI) || - isHsaAbiVersion5(STI); -} - unsigned getAmdhsaCodeObjectVersion() { return AmdhsaCodeObjectVersion; } @@ -183,8 +166,6 @@ unsigned getCodeObjectVersion(const Module &M) { unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) { switch (CodeObjectVersion) { - case AMDHSA_COV2: - case AMDHSA_COV3: case AMDHSA_COV4: return 48; case AMDHSA_COV5: @@ -198,8 +179,6 @@ unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) { // central TD file. unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion) { switch (CodeObjectVersion) { - case AMDHSA_COV2: - case AMDHSA_COV3: case AMDHSA_COV4: return 24; case AMDHSA_COV5: @@ -210,8 +189,6 @@ unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion) { unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion) { switch (CodeObjectVersion) { - case AMDHSA_COV2: - case AMDHSA_COV3: case AMDHSA_COV4: return 32; case AMDHSA_COV5: @@ -222,8 +199,6 @@ unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion) { unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) { switch (CodeObjectVersion) { - case AMDHSA_COV2: - case AMDHSA_COV3: case AMDHSA_COV4: return 40; case AMDHSA_COV5: @@ -334,6 +309,7 @@ struct VOPDInfo { uint16_t Opcode; uint16_t OpX; uint16_t OpY; + uint16_t Subtarget; }; struct VOPTrue16Info { @@ -468,6 +444,14 @@ bool getMAIIsGFX940XDL(unsigned Opc) { return Info ? 
Info->is_gfx940_xdl : false; } +unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST) { + if (ST.hasFeature(AMDGPU::FeatureGFX12Insts)) + return SIEncodingFamily::GFX12; + if (ST.hasFeature(AMDGPU::FeatureGFX11Insts)) + return SIEncodingFamily::GFX11; + llvm_unreachable("Subtarget generation does not support VOPD!"); +} + CanBeVOPD getCanBeVOPD(unsigned Opc) { const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc); if (Info) @@ -495,11 +479,13 @@ bool isMAC(unsigned Opc) { Opc == AMDGPU::V_FMAC_F64_e64_gfx90a || Opc == AMDGPU::V_FMAC_F32_e64_gfx10 || Opc == AMDGPU::V_FMAC_F32_e64_gfx11 || + Opc == AMDGPU::V_FMAC_F32_e64_gfx12 || Opc == AMDGPU::V_FMAC_F32_e64_vi || Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 || Opc == AMDGPU::V_FMAC_DX9_ZERO_F32_e64_gfx11 || Opc == AMDGPU::V_FMAC_F16_e64_gfx10 || Opc == AMDGPU::V_FMAC_F16_t16_e64_gfx11 || + Opc == AMDGPU::V_FMAC_F16_t16_e64_gfx12 || Opc == AMDGPU::V_DOT2C_F32_F16_e64_vi || Opc == AMDGPU::V_DOT2C_I32_I16_e64_vi || Opc == AMDGPU::V_DOT4C_I32_I8_e64_vi || @@ -510,7 +496,33 @@ bool isPermlane16(unsigned Opc) { return Opc == AMDGPU::V_PERMLANE16_B32_gfx10 || Opc == AMDGPU::V_PERMLANEX16_B32_gfx10 || Opc == AMDGPU::V_PERMLANE16_B32_e64_gfx11 || - Opc == AMDGPU::V_PERMLANEX16_B32_e64_gfx11; + Opc == AMDGPU::V_PERMLANEX16_B32_e64_gfx11 || + Opc == AMDGPU::V_PERMLANE16_B32_e64_gfx12 || + Opc == AMDGPU::V_PERMLANEX16_B32_e64_gfx12 || + Opc == AMDGPU::V_PERMLANE16_VAR_B32_e64_gfx12 || + Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12; +} + +bool isGenericAtomic(unsigned Opc) { + return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN || + Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX || + Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP || + Opc == AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG; } bool isTrue16Inst(unsigned Opc) { @@ -535,8 +547,9 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) { return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen)); } -int getVOPDFull(unsigned OpX, unsigned OpY) { - const VOPDInfo *Info = getVOPDInfoFromComponentOpcodes(OpX, OpY); +int getVOPDFull(unsigned OpX, unsigned OpY, unsigned EncodingFamily) { + const VOPDInfo *Info = + getVOPDInfoFromComponentOpcodes(OpX, OpY, EncodingFamily); return Info ? Info->Opcode : -1; } @@ -588,13 +601,15 @@ unsigned ComponentInfo::getIndexInParsedOperands(unsigned CompOprIdx) const { } std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( - std::function<unsigned(unsigned, unsigned)> GetRegIdx) const { + std::function<unsigned(unsigned, unsigned)> GetRegIdx, bool SkipSrc) const { auto OpXRegs = getRegIndices(ComponentIndex::X, GetRegIdx); auto OpYRegs = getRegIndices(ComponentIndex::Y, GetRegIdx); + const unsigned CompOprNum = + SkipSrc ? 
Component::DST_NUM : Component::MAX_OPR_NUM; unsigned CompOprIdx; - for (CompOprIdx = 0; CompOprIdx < Component::MAX_OPR_NUM; ++CompOprIdx) { + for (CompOprIdx = 0; CompOprIdx < CompOprNum; ++CompOprIdx) { unsigned BanksMasks = VOPD_VGPR_BANK_MASKS[CompOprIdx]; if (OpXRegs[CompOprIdx] && OpYRegs[CompOprIdx] && ((OpXRegs[CompOprIdx] & BanksMasks) == @@ -719,9 +734,9 @@ void AMDGPUTargetID::setTargetIDFromFeaturesString(StringRef FS) { static TargetIDSetting getTargetIDSettingFromFeatureString(StringRef FeatureString) { - if (FeatureString.endswith("-")) + if (FeatureString.ends_with("-")) return TargetIDSetting::Off; - if (FeatureString.endswith("+")) + if (FeatureString.ends_with("+")) return TargetIDSetting::On; llvm_unreachable("Malformed feature string"); @@ -732,9 +747,9 @@ void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) { TargetID.split(TargetIDSplit, ':'); for (const auto &FeatureString : TargetIDSplit) { - if (FeatureString.startswith("xnack")) + if (FeatureString.starts_with("xnack")) XnackSetting = getTargetIDSettingFromFeatureString(FeatureString); - if (FeatureString.startswith("sramecc")) + if (FeatureString.starts_with("sramecc")) SramEccSetting = getTargetIDSettingFromFeatureString(FeatureString); } } @@ -765,63 +780,6 @@ std::string AMDGPUTargetID::toString() const { std::string Features; if (STI.getTargetTriple().getOS() == Triple::AMDHSA) { switch (CodeObjectVersion) { - case AMDGPU::AMDHSA_COV2: - // Code object V2 only supported specific processors and had fixed - // settings for the XNACK. - if (Processor == "gfx600") { - } else if (Processor == "gfx601") { - } else if (Processor == "gfx602") { - } else if (Processor == "gfx700") { - } else if (Processor == "gfx701") { - } else if (Processor == "gfx702") { - } else if (Processor == "gfx703") { - } else if (Processor == "gfx704") { - } else if (Processor == "gfx705") { - } else if (Processor == "gfx801") { - if (!isXnackOnOrAny()) - report_fatal_error( - "AMD GPU code object V2 does not support processor " + - Twine(Processor) + " without XNACK"); - } else if (Processor == "gfx802") { - } else if (Processor == "gfx803") { - } else if (Processor == "gfx805") { - } else if (Processor == "gfx810") { - if (!isXnackOnOrAny()) - report_fatal_error( - "AMD GPU code object V2 does not support processor " + - Twine(Processor) + " without XNACK"); - } else if (Processor == "gfx900") { - if (isXnackOnOrAny()) - Processor = "gfx901"; - } else if (Processor == "gfx902") { - if (isXnackOnOrAny()) - Processor = "gfx903"; - } else if (Processor == "gfx904") { - if (isXnackOnOrAny()) - Processor = "gfx905"; - } else if (Processor == "gfx906") { - if (isXnackOnOrAny()) - Processor = "gfx907"; - } else if (Processor == "gfx90c") { - if (isXnackOnOrAny()) - report_fatal_error( - "AMD GPU code object V2 does not support processor " + - Twine(Processor) + " with XNACK being ON or ANY"); - } else { - report_fatal_error( - "AMD GPU code object V2 does not support processor " + - Twine(Processor)); - } - break; - case AMDGPU::AMDHSA_COV3: - // xnack. - if (isXnackOnOrAny()) - Features += "+xnack"; - // In code object v2 and v3, "sramecc" feature was spelled with a - // hyphen ("sram-ecc"). - if (isSramEccOnOrAny()) - Features += "+sram-ecc"; - break; case AMDGPU::AMDHSA_COV4: case AMDGPU::AMDHSA_COV5: // sramecc. 
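The target-ID handling above splits a string such as "gfx90a:xnack+:sramecc-" on ':' and classifies each feature component by its trailing '+' or '-'. As a minimal, self-contained sketch of that parsing pattern (using C++20 std::string_view instead of llvm::StringRef; the enum and function names below are illustrative, not the LLVM ones):

#include <cassert>
#include <string_view>

enum class TargetIDSetting { Unsupported, Any, Off, On };

// Mirrors getTargetIDSettingFromFeatureString above: "xnack+" -> On,
// "sramecc-" -> Off. Anything else is treated as malformed here.
static TargetIDSetting settingFromFeature(std::string_view Feature) {
  if (Feature.ends_with("-"))
    return TargetIDSetting::Off;
  if (Feature.ends_with("+"))
    return TargetIDSetting::On;
  return TargetIDSetting::Unsupported;
}

// Mirrors setTargetIDFromTargetIDStream above: split on ':' and match the
// components that start with "xnack" or "sramecc".
static void parseTargetID(std::string_view TargetID, TargetIDSetting &Xnack,
                          TargetIDSetting &SramEcc) {
  while (!TargetID.empty()) {
    size_t Colon = TargetID.find(':');
    std::string_view Part = TargetID.substr(0, Colon);
    if (Part.starts_with("xnack"))
      Xnack = settingFromFeature(Part);
    else if (Part.starts_with("sramecc"))
      SramEcc = settingFromFeature(Part);
    TargetID = (Colon == std::string_view::npos)
                   ? std::string_view()
                   : TargetID.substr(Colon + 1);
  }
}

int main() {
  TargetIDSetting Xnack = TargetIDSetting::Any, SramEcc = TargetIDSetting::Any;
  parseTargetID("gfx90a:xnack+:sramecc-", Xnack, SramEcc);
  assert(Xnack == TargetIDSetting::On && SramEcc == TargetIDSetting::Off);
  return 0;
}

The rename from endswith/startswith to ends_with/starts_with in the hunk above keeps this logic unchanged; only the StringRef method spelling moves to the std-style names.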
@@ -1191,10 +1149,17 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor( AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE); - AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, 1); - AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1); + if (Version.Major >= 12) { + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN, 0); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_DISABLE_PERF, 0); + } else { + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP, 1); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE, 1); + } AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1); if (Version.Major >= 10) { @@ -1202,10 +1167,10 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor( amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1 : 0); AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE, + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1); AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1); + amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, 1); } if (AMDGPU::isGFX90A(*STI)) { AMDHSA_BITS_SET(KD.compute_pgm_rsrc3, @@ -1638,7 +1603,7 @@ unsigned getTgtId(const StringRef Name) { if (Val.MaxIndex == 0 && Name == Val.Name) return Val.Tgt; - if (Val.MaxIndex > 0 && Name.startswith(Val.Name)) { + if (Val.MaxIndex > 0 && Name.starts_with(Val.Name)) { StringRef Suffix = Name.drop_front(Val.Name.size()); unsigned Id; @@ -1931,6 +1896,8 @@ bool isShader(CallingConv::ID cc) { case CallingConv::AMDGPU_ES: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS_Chain: + case CallingConv::AMDGPU_CS_ChainPreserve: case CallingConv::AMDGPU_CS: return true; default: @@ -1968,7 +1935,17 @@ bool isModuleEntryFunctionCC(CallingConv::ID CC) { case CallingConv::AMDGPU_Gfx: return true; default: - return isEntryFunctionCC(CC); + return isEntryFunctionCC(CC) || isChainCC(CC); + } +} + +bool isChainCC(CallingConv::ID CC) { + switch (CC) { + case CallingConv::AMDGPU_CS_Chain: + case CallingConv::AMDGPU_CS_ChainPreserve: + return true; + default: + return false; } } @@ -2001,15 +1978,23 @@ bool hasPackedD16(const MCSubtargetInfo &STI) { !isSI(STI); } -unsigned getNSAMaxSize(const MCSubtargetInfo &STI) { +bool hasGDS(const MCSubtargetInfo &STI) { + return STI.hasFeature(AMDGPU::FeatureGDS); +} + +unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler) { auto Version = getIsaVersion(STI.getCPU()); if (Version.Major == 10) return Version.Minor >= 3 ? 13 : 5; if (Version.Major == 11) return 5; + if (Version.Major >= 12) + return HasSampler ? 
4 : 5; return 0; } +unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { return 16; } + bool isSI(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureSouthernIslands); } @@ -2030,6 +2015,10 @@ bool isGFX9_GFX10(const MCSubtargetInfo &STI) { return isGFX9(STI) || isGFX10(STI); } +bool isGFX9_GFX10_GFX11(const MCSubtargetInfo &STI) { + return isGFX9(STI) || isGFX10(STI) || isGFX11(STI); +} + bool isGFX8_GFX9_GFX10(const MCSubtargetInfo &STI) { return isVI(STI) || isGFX9(STI) || isGFX10(STI); } @@ -2046,6 +2035,10 @@ bool isGFX10(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureGFX10); } +bool isGFX10_GFX11(const MCSubtargetInfo &STI) { + return isGFX10(STI) || isGFX11(STI); +} + bool isGFX10Plus(const MCSubtargetInfo &STI) { return isGFX10(STI) || isGFX11Plus(STI); } @@ -2055,9 +2048,17 @@ bool isGFX11(const MCSubtargetInfo &STI) { } bool isGFX11Plus(const MCSubtargetInfo &STI) { - return isGFX11(STI); + return isGFX11(STI) || isGFX12Plus(STI); } +bool isGFX12(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX12]; +} + +bool isGFX12Plus(const MCSubtargetInfo &STI) { return isGFX12(STI); } + +bool isNotGFX12Plus(const MCSubtargetInfo &STI) { return !isGFX12Plus(STI); } + bool isNotGFX11Plus(const MCSubtargetInfo &STI) { return !isGFX11Plus(STI); } @@ -2086,6 +2087,10 @@ bool hasGFX10_3Insts(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureGFX10_3Insts); } +bool isGFX10_3_GFX11(const MCSubtargetInfo &STI) { + return isGFX10_BEncoding(STI) && !isGFX12Plus(STI); +} + bool isGFX90A(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureGFX90AInsts); } @@ -2106,6 +2111,14 @@ bool hasVOPD(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureVOPD); } +bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI) { + return STI.hasFeature(AMDGPU::FeatureDPPSrc1SGPR); +} + +unsigned hasKernargPreload(const MCSubtargetInfo &STI) { + return STI.hasFeature(AMDGPU::FeatureKernargPreload); +} + int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR) { if (has90AInsts && ArgNumAGPR) @@ -2120,6 +2133,10 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { Reg == AMDGPU::SCC; } +bool isHi(unsigned Reg, const MCRegisterInfo &MRI) { + return MRI.getEncodingValue(Reg) & AMDGPU::HWEncoding::IS_HI; +} + #define MAP_REG2REG \ using namespace AMDGPU; \ switch(Reg) { \ @@ -2250,16 +2267,13 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case AMDGPU::OPERAND_REG_IMM_V2FP16: - case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: case AMDGPU::OPERAND_REG_INLINE_AC_FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: @@ -2272,8 +2286,10 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) { assert(OpNo < Desc.NumOperands); unsigned OpType = Desc.operands()[OpNo].OperandType; - return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST && - OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST; + return (OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST 
&& + OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST) || + (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && + OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST); } // Avoid using MCRegisterClass::getSize, since that function will go away @@ -2423,10 +2439,6 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) { return getRegBitWidth(RC.getID()); } -unsigned getRegBitWidth(const TargetRegisterClass &RC) { - return getRegBitWidth(RC.getID()); -} - unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, unsigned OpNo) { assert(OpNo < Desc.NumOperands); @@ -2522,6 +2534,16 @@ bool isInlinableIntLiteralV216(int32_t Literal) { return Lo16 == Hi16 && isInlinableIntLiteral(Lo16); } +bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType) { + switch (OpType) { + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + return isInlinableLiteralV216(Literal, HasInv2Pi); + default: + return isInlinableIntLiteralV216(Literal); + } +} + bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) { assert(HasInv2Pi); @@ -2535,6 +2557,13 @@ bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) { return Lo16 == Hi16; } +bool isValid32BitLiteral(uint64_t Val, bool IsFP64) { + if (IsFP64) + return !(Val & 0xffffffffu); + + return isUInt<32>(Val) || isInt<32>(Val); +} + bool isArgPassedInSGPR(const Argument *A) { const Function *F = A->getParent(); @@ -2552,13 +2581,15 @@ bool isArgPassedInSGPR(const Argument *A) { case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_CS_Chain: + case CallingConv::AMDGPU_CS_ChainPreserve: // For non-compute shaders, SGPR inputs are marked with either inreg or // byval. Everything else is in VGPRs. return A->hasAttribute(Attribute::InReg) || A->hasAttribute(Attribute::ByVal); default: - // TODO: Should calls support inreg for SGPR inputs? - return false; + // TODO: treat i1 as divergent? + return A->hasAttribute(Attribute::InReg); } } @@ -2577,13 +2608,14 @@ bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo) { case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_Gfx: + case CallingConv::AMDGPU_CS_Chain: + case CallingConv::AMDGPU_CS_ChainPreserve: // For non-compute shaders, SGPR inputs are marked with either inreg or // byval. Everything else is in VGPRs. return CB->paramHasAttr(ArgNo, Attribute::InReg) || CB->paramHasAttr(ArgNo, Attribute::ByVal); default: - // TODO: Should calls support inreg for SGPR inputs? - return false; + return CB->paramHasAttr(ArgNo, Attribute::InReg); } } @@ -2597,6 +2629,9 @@ static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) { bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, int64_t EncodedOffset) { + if (isGFX12Plus(ST)) + return isUInt<23>(EncodedOffset); + return hasSMEMByteOffset(ST) ? isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset); } @@ -2604,6 +2639,9 @@ bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, bool isLegalSMRDEncodedSignedOffset(const MCSubtargetInfo &ST, int64_t EncodedOffset, bool IsBuffer) { + if (isGFX12Plus(ST)) + return isInt<24>(EncodedOffset); + return !IsBuffer && hasSMRDSignedImmOffset(ST) && isInt<21>(EncodedOffset); @@ -2624,6 +2662,10 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer) { + if (isGFX12Plus(ST)) // 24 bit signed offsets + return isInt<24>(ByteOffset) ? 
std::optional<int64_t>(ByteOffset) + : std::nullopt; + // The signed version is always a byte offset. if (!IsBuffer && hasSMRDSignedImmOffset(ST)) { assert(hasSMEMByteOffset(ST)); @@ -2651,10 +2693,11 @@ std::optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, } unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST) { - // Address offset is 12-bit signed for GFX10, 13-bit for GFX9 and GFX11+. if (AMDGPU::isGFX10(ST)) return 12; + if (AMDGPU::isGFX12(ST)) + return 24; return 13; } @@ -2707,6 +2750,25 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, : getGfx9BufferFormatInfo(Format); } +bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) { + for (auto OpName : { OpName::vdst, OpName::src0, OpName::src1, + OpName::src2 }) { + int Idx = getNamedOperandIdx(OpDesc.getOpcode(), OpName); + if (Idx == -1) + continue; + + if (OpDesc.operands()[Idx].RegClass == AMDGPU::VReg_64RegClassID || + OpDesc.operands()[Idx].RegClass == AMDGPU::VReg_64_Align2RegClassID) + return true; + } + + return false; +} + +bool isDPALU_DPP(const MCInstrDesc &OpDesc) { + return hasAny64BitVGPROperands(OpDesc); +} + } // namespace AMDGPU raw_ostream &operator<<(raw_ostream &OS, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index bdf7ccad9c76..3c9f330cbcde 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -31,7 +31,6 @@ class MCRegisterClass; class MCRegisterInfo; class MCSubtargetInfo; class StringRef; -class TargetRegisterClass; class Triple; class raw_ostream; @@ -43,30 +42,18 @@ namespace AMDGPU { struct IsaVersion; -enum { - AMDHSA_COV2 = 2, - AMDHSA_COV3 = 3, - AMDHSA_COV4 = 4, - AMDHSA_COV5 = 5 -}; +enum { AMDHSA_COV4 = 4, AMDHSA_COV5 = 5 }; +/// \returns True if \p STI is AMDHSA. +bool isHsaAbi(const MCSubtargetInfo &STI); /// \returns HSA OS ABI Version identification. std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI); -/// \returns True if HSA OS ABI Version identification is 2, -/// false otherwise. -bool isHsaAbiVersion2(const MCSubtargetInfo *STI); -/// \returns True if HSA OS ABI Version identification is 3, -/// false otherwise. -bool isHsaAbiVersion3(const MCSubtargetInfo *STI); /// \returns True if HSA OS ABI Version identification is 4, /// false otherwise. bool isHsaAbiVersion4(const MCSubtargetInfo *STI); /// \returns True if HSA OS ABI Version identification is 5, /// false otherwise. bool isHsaAbiVersion5(const MCSubtargetInfo *STI); -/// \returns True if HSA OS ABI Version identification is 3 and above, -/// false otherwise. -bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI); /// \returns The offset of the multigrid_sync_arg argument from implicitarg_ptr unsigned getMultigridSyncArgImplicitArgPosition(unsigned COV); @@ -518,6 +505,10 @@ struct CanBeVOPD { bool Y; }; +/// \returns SIEncodingFamily used for VOPD encoding on a \p ST. 
+LLVM_READONLY +unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST); + LLVM_READONLY CanBeVOPD getCanBeVOPD(unsigned Opc); @@ -537,7 +528,7 @@ LLVM_READONLY unsigned getVOPDOpcode(unsigned Opc); LLVM_READONLY -int getVOPDFull(unsigned OpX, unsigned OpY); +int getVOPDFull(unsigned OpX, unsigned OpY, unsigned EncodingFamily); LLVM_READONLY bool isVOPD(unsigned Opc); @@ -548,6 +539,9 @@ bool isMAC(unsigned Opc); LLVM_READNONE bool isPermlane16(unsigned Opc); +LLVM_READNONE +bool isGenericAtomic(unsigned Opc); + namespace VOPD { enum Component : unsigned { @@ -757,15 +751,20 @@ public: // GetRegIdx(Component, MCOperandIdx) must return a VGPR register index // for the specified component and MC operand. The callback must return 0 // if the operand is not a register or not a VGPR. - bool hasInvalidOperand( - std::function<unsigned(unsigned, unsigned)> GetRegIdx) const { - return getInvalidCompOperandIndex(GetRegIdx).has_value(); + // If \p SkipSrc is set to true then constraints for source operands are not + // checked. + bool hasInvalidOperand(std::function<unsigned(unsigned, unsigned)> GetRegIdx, + bool SkipSrc = false) const { + return getInvalidCompOperandIndex(GetRegIdx, SkipSrc).has_value(); } // Check VOPD operands constraints. // Return the index of an invalid component operand, if any. + // If \p SkipSrc is set to true then constraints for source operands are not + // checked. std::optional<unsigned> getInvalidCompOperandIndex( - std::function<unsigned(unsigned, unsigned)> GetRegIdx) const; + std::function<unsigned(unsigned, unsigned)> GetRegIdx, + bool SkipSrc = false) const; private: RegIndices @@ -1121,6 +1120,9 @@ bool isEntryFunctionCC(CallingConv::ID CC); LLVM_READNONE bool isModuleEntryFunctionCC(CallingConv::ID CC); +LLVM_READNONE +bool isChainCC(CallingConv::ID CC); + bool isKernelCC(const Function *Func); // FIXME: Remove this when calling conventions cleaned up @@ -1141,37 +1143,51 @@ bool hasMIMG_R128(const MCSubtargetInfo &STI); bool hasA16(const MCSubtargetInfo &STI); bool hasG16(const MCSubtargetInfo &STI); bool hasPackedD16(const MCSubtargetInfo &STI); -unsigned getNSAMaxSize(const MCSubtargetInfo &STI); +bool hasGDS(const MCSubtargetInfo &STI); +unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler = false); +unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI); bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); bool isGFX9(const MCSubtargetInfo &STI); bool isGFX9_GFX10(const MCSubtargetInfo &STI); +bool isGFX9_GFX10_GFX11(const MCSubtargetInfo &STI); bool isGFX8_GFX9_GFX10(const MCSubtargetInfo &STI); bool isGFX8Plus(const MCSubtargetInfo &STI); bool isGFX9Plus(const MCSubtargetInfo &STI); bool isGFX10(const MCSubtargetInfo &STI); +bool isGFX10_GFX11(const MCSubtargetInfo &STI); bool isGFX10Plus(const MCSubtargetInfo &STI); bool isNotGFX10Plus(const MCSubtargetInfo &STI); bool isGFX10Before1030(const MCSubtargetInfo &STI); bool isGFX11(const MCSubtargetInfo &STI); bool isGFX11Plus(const MCSubtargetInfo &STI); +bool isGFX12(const MCSubtargetInfo &STI); +bool isGFX12Plus(const MCSubtargetInfo &STI); +bool isNotGFX12Plus(const MCSubtargetInfo &STI); bool isNotGFX11Plus(const MCSubtargetInfo &STI); bool isGCN3Encoding(const MCSubtargetInfo &STI); bool isGFX10_AEncoding(const MCSubtargetInfo &STI); bool isGFX10_BEncoding(const MCSubtargetInfo &STI); bool hasGFX10_3Insts(const MCSubtargetInfo &STI); +bool isGFX10_3_GFX11(const MCSubtargetInfo &STI); bool isGFX90A(const MCSubtargetInfo 
&STI); bool isGFX940(const MCSubtargetInfo &STI); bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI); bool hasMAIInsts(const MCSubtargetInfo &STI); bool hasVOPD(const MCSubtargetInfo &STI); +bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI); int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR); +unsigned hasKernargPreload(const MCSubtargetInfo &STI); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); +/// \returns if \p Reg occupies the high 16-bits of a 32-bit register. +/// The bit indicating isHi is the LSB of the encoding. +bool isHi(unsigned Reg, const MCRegisterInfo &MRI); + /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); @@ -1202,9 +1218,6 @@ unsigned getRegBitWidth(unsigned RCID); /// Get the size in bits of a register from the register class \p RC. unsigned getRegBitWidth(const MCRegisterClass &RC); -/// Get the size in bits of a register from the register class \p RC. -unsigned getRegBitWidth(const TargetRegisterClass &RC); - /// Get size of register operand unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, unsigned OpNo); @@ -1225,6 +1238,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: // mandatory literal is always size 4 + case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: return 4; case AMDGPU::OPERAND_REG_IMM_INT64: @@ -1283,8 +1297,14 @@ LLVM_READNONE bool isInlinableIntLiteralV216(int32_t Literal); LLVM_READNONE +bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType); + +LLVM_READNONE bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi); +LLVM_READNONE +bool isValid32BitLiteral(uint64_t Val, bool IsFP64); + bool isArgPassedInSGPR(const Argument *Arg); bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo); @@ -1314,7 +1334,7 @@ std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST, std::optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset); -/// For FLAT segment the offset must be positive; +/// For pre-GFX12 FLAT instructions the offset must be positive; /// MSB is ignored and forced to zero. /// /// \return The number of bits available for the signed offset field in flat @@ -1328,10 +1348,16 @@ unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST); bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); LLVM_READNONE -inline bool isLegal64BitDPPControl(unsigned DC) { +inline bool isLegalDPALU_DPPControl(unsigned DC) { return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST; } +/// \returns true if an instruction may have a 64-bit VGPR operand. +bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc); + +/// \returns true if an instruction is a DP ALU DPP. 
+bool isDPALU_DPP(const MCInstrDesc &OpDesc); + /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp index cbdbf1c16f9f..25e628e5cbc5 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp @@ -74,6 +74,16 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) { if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) { switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_s_barrier: + case Intrinsic::amdgcn_s_barrier_signal: + case Intrinsic::amdgcn_s_barrier_signal_var: + case Intrinsic::amdgcn_s_barrier_signal_isfirst: + case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: + case Intrinsic::amdgcn_s_barrier_init: + case Intrinsic::amdgcn_s_barrier_join: + case Intrinsic::amdgcn_s_barrier_wait: + case Intrinsic::amdgcn_s_barrier_leave: + case Intrinsic::amdgcn_s_get_barrier_state: + case Intrinsic::amdgcn_s_wakeup_barrier: case Intrinsic::amdgcn_wave_barrier: case Intrinsic::amdgcn_sched_barrier: case Intrinsic::amdgcn_sched_group_barrier: diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h index df37c420fa72..e42b27f8e09e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h @@ -9,19 +9,15 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H -#include <vector> - namespace llvm { struct Align; class AAResults; class DataLayout; -class Function; class GlobalVariable; class LoadInst; class MemoryDef; class MemorySSA; -class Module; class Value; namespace AMDGPU { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index a92d574b1848..0fa67c559cb2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -18,7 +18,6 @@ #include "AMDGPUPTNote.h" #include "SIDefines.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Module.h" #include "llvm/Support/AMDGPUMetadata.h" @@ -84,7 +83,6 @@ bool AMDGPUPALMetadata::setFromLegacyBlob(StringRef Blob) { // Set PAL metadata from msgpack blob. bool AMDGPUPALMetadata::setFromMsgPackBlob(StringRef Blob) { - msgpack::Reader Reader(Blob); return MsgPackDoc.readFromBlob(Blob, /*Multi=*/false); } @@ -242,30 +240,29 @@ void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) { } // Set the stack frame size of a function in the metadata. -void AMDGPUPALMetadata::setFunctionScratchSize(const MachineFunction &MF, - unsigned Val) { - auto Node = getShaderFunction(MF.getFunction().getName()); +void AMDGPUPALMetadata::setFunctionScratchSize(StringRef FnName, unsigned Val) { + auto Node = getShaderFunction(FnName); Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val); + Node[".backend_stack_size"] = MsgPackDoc.getNode(Val); } // Set the amount of LDS used in bytes in the metadata. 
-void AMDGPUPALMetadata::setFunctionLdsSize(const MachineFunction &MF, - unsigned Val) { - auto Node = getShaderFunction(MF.getFunction().getName()); +void AMDGPUPALMetadata::setFunctionLdsSize(StringRef FnName, unsigned Val) { + auto Node = getShaderFunction(FnName); Node[".lds_size"] = MsgPackDoc.getNode(Val); } // Set the number of used vgprs in the metadata. -void AMDGPUPALMetadata::setFunctionNumUsedVgprs(const MachineFunction &MF, +void AMDGPUPALMetadata::setFunctionNumUsedVgprs(StringRef FnName, unsigned Val) { - auto Node = getShaderFunction(MF.getFunction().getName()); + auto Node = getShaderFunction(FnName); Node[".vgpr_count"] = MsgPackDoc.getNode(Val); } // Set the number of used vgprs in the metadata. -void AMDGPUPALMetadata::setFunctionNumUsedSgprs(const MachineFunction &MF, +void AMDGPUPALMetadata::setFunctionNumUsedSgprs(StringRef FnName, unsigned Val) { - auto Node = getShaderFunction(MF.getFunction().getName()); + auto Node = getShaderFunction(FnName); Node[".sgpr_count"] = MsgPackDoc.getNode(Val); } @@ -726,7 +723,7 @@ void AMDGPUPALMetadata::toLegacyBlob(std::string &Blob) { if (Registers.getMap().empty()) return; raw_string_ostream OS(Blob); - support::endian::Writer EW(OS, support::endianness::little); + support::endian::Writer EW(OS, llvm::endianness::little); for (auto I : Registers.getMap()) { EW.write(uint32_t(I.first.getUInt())); EW.write(uint32_t(I.second.getUInt())); @@ -911,6 +908,7 @@ void AMDGPUPALMetadata::reset() { MsgPackDoc.clear(); Registers = MsgPackDoc.getEmptyNode(); HwStages = MsgPackDoc.getEmptyNode(); + ShaderFunctions = MsgPackDoc.getEmptyNode(); } unsigned AMDGPUPALMetadata::getPALVersion(unsigned idx) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h index e477904cb81f..158f766d0485 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -17,7 +17,6 @@ namespace llvm { -class MachineFunction; class Module; class StringRef; @@ -87,22 +86,22 @@ public: void setScratchSize(unsigned CC, unsigned Val); // Set the stack frame size of a function in the metadata. - void setFunctionScratchSize(const MachineFunction &MF, unsigned Val); + void setFunctionScratchSize(StringRef FnName, unsigned Val); // Set the amount of LDS used in bytes in the metadata. This is an optional // advisory record for logging etc; wave dispatch actually uses the rsrc1 // register for the shader stage to determine the amount of LDS to allocate. - void setFunctionLdsSize(const MachineFunction &MF, unsigned Val); + void setFunctionLdsSize(StringRef FnName, unsigned Val); // Set the number of used vgprs in the metadata. This is an optional advisory // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of vgprs to allocate. - void setFunctionNumUsedVgprs(const MachineFunction &MF, unsigned Val); + void setFunctionNumUsedVgprs(StringRef FnName, unsigned Val); // Set the number of used sgprs in the metadata. This is an optional advisory // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of sgprs to allocate. 
- void setFunctionNumUsedSgprs(const MachineFunction &MF, unsigned Val); + void setFunctionNumUsedSgprs(StringRef FnName, unsigned Val); // Set the hardware register bit in PAL metadata to enable wave32 on the // shader of the given calling convention. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VINTERPInstructions.td index 7d03150bf5b1..fc563b7493ad 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VINTERPInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VINTERPInstructions.td @@ -10,7 +10,7 @@ // VINTERP encoding //===----------------------------------------------------------------------===// -class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : Enc64 { +class VINTERPe <VOPProfile P> : Enc64 { bits<8> vdst; bits<4> src0_modifiers; bits<9> src0; @@ -31,7 +31,6 @@ class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : Enc64 { let Inst{13} = !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2) let Inst{14} = !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel(3) let Inst{15} = clamp; - let Inst{22-16} = op; let Inst{40-32} = src0; let Inst{49-41} = src1; let Inst{58-50} = src2; @@ -40,6 +39,14 @@ class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : Enc64 { let Inst{63} = src2_modifiers{0}; // neg(2) } +class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : VINTERPe<P> { + let Inst{22-16} = op; +} + +class VINTERPe_gfx12 <bits<7> op, VOPProfile P> : VINTERPe<P> { + let Inst{20-16} = op{4-0}; +} + //===----------------------------------------------------------------------===// // VOP3 VINTERP //===----------------------------------------------------------------------===// @@ -171,17 +178,28 @@ defm : VInterpF16Pat<int_amdgcn_interp_inreg_p2_f16, // VINTERP Real Instructions //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" in { - multiclass VINTERP_Real_gfx11 <bits<7> op> { +multiclass VINTERP_Real_gfx11 <bits<7> op> { + let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { def _gfx11 : VINTERP_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX11>, VINTERPe_gfx11<op, !cast<VOP3_Pseudo>(NAME).Pfl>; } } -defm V_INTERP_P10_F32_inreg : VINTERP_Real_gfx11<0x000>; -defm V_INTERP_P2_F32_inreg : VINTERP_Real_gfx11<0x001>; -defm V_INTERP_P10_F16_F32_inreg : VINTERP_Real_gfx11<0x002>; -defm V_INTERP_P2_F16_F32_inreg : VINTERP_Real_gfx11<0x003>; -defm V_INTERP_P10_RTZ_F16_F32_inreg : VINTERP_Real_gfx11<0x004>; -defm V_INTERP_P2_RTZ_F16_F32_inreg : VINTERP_Real_gfx11<0x005>; +multiclass VINTERP_Real_gfx12 <bits<7> op> { + let AssemblerPredicate = isGFX12Only, DecoderNamespace = "GFX12" in { + def _gfx12 : + VINTERP_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX12>, + VINTERPe_gfx12<op, !cast<VOP3_Pseudo>(NAME).Pfl>; + } +} + +multiclass VINTERP_Real_gfx11_gfx12 <bits<7> op> : + VINTERP_Real_gfx11<op>, VINTERP_Real_gfx12<op>; + +defm V_INTERP_P10_F32_inreg : VINTERP_Real_gfx11_gfx12<0x000>; +defm V_INTERP_P2_F32_inreg : VINTERP_Real_gfx11_gfx12<0x001>; +defm V_INTERP_P10_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x002>; +defm V_INTERP_P2_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x003>; +defm V_INTERP_P10_RTZ_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x004>; +defm V_INTERP_P2_RTZ_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x005>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 1a8efc6e3df2..27a7c29cb1ac 100644 
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -88,6 +88,12 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo let TRANS = ps.TRANS; } +class VOP1_Real_Gen <VOP1_Pseudo ps, GFXGen Gen, string real_name = ps.Mnemonic> : + VOP1_Real <ps, Gen.Subtarget, real_name> { + let AssemblerPredicate = Gen.AssemblerPredicate; + let DecoderNamespace = Gen.DecoderNamespace; +} + class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : VOP_SDWA_Pseudo <OpName, P, pattern> { let AsmMatchConverter = "cvtSdwaVOP1"; @@ -152,7 +158,7 @@ multiclass VOP1Inst_t16<string opName, defm NAME : VOP1Inst<opName, P, node>; } let OtherPredicates = [HasTrue16BitInsts] in { - defm _t16 : VOP1Inst<opName#"_t16", VOPProfile_True16<P>, node>; + defm _t16 : VOP1Inst<opName#"_t16", VOPProfile_Fake16<P>, node>; } } @@ -170,7 +176,7 @@ class VOPProfileI2F<ValueType dstVt, ValueType srcVt> : } class VOPProfileI2F_True16<ValueType dstVt, ValueType srcVt> : - VOPProfile_True16<VOPProfile<[dstVt, srcVt, untyped, untyped]>> { + VOPProfile_Fake16<VOPProfile<[dstVt, srcVt, untyped, untyped]>> { let Ins64 = (ins Src0RC64:$src0, clampmod:$clamp, omod:$omod); let InsVOP3Base = (ins Src0VOP3DPP:$src0, clampmod:$clamp, omod:$omod); @@ -199,7 +205,7 @@ class VOP_SPECIAL_OMOD_PROF<ValueType dstVt, ValueType srcVt> : def VOP_I32_F32_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i32, f32>; def VOP_I32_F64_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i32, f64>; def VOP_I16_F16_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i16, f16>; -def VOP_I16_F16_SPECIAL_OMOD_t16 : VOPProfile_True16<VOP_I16_F16> { +def VOP_I16_F16_SPECIAL_OMOD_t16 : VOPProfile_Fake16<VOP_I16_F16> { let HasOMod = 1; } @@ -221,7 +227,7 @@ def VOPProfile_MOV : VOPProfile <[i32, i32, untyped, untyped]> { let isReMaterializable = 1, isAsCheapAsAMove = 1 in { defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOPProfile_MOV, null_frag, 0x8>; -let SubtargetPredicate = isGFX940Plus in +let SubtargetPredicate = isGFX940Plus, SchedRW = [Write64Bit] in defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>; } // End isMoveImm = 1 @@ -292,13 +298,13 @@ let FPDPRounding = 1, isReMaterializable = 0 in { let OtherPredicates = [NotHasTrue16BitInsts] in defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, any_fpround>; let OtherPredicates = [HasTrue16BitInsts] in - defm V_CVT_F16_F32_t16 : VOP1Inst <"v_cvt_f16_f32_t16", VOPProfile_True16<VOP_F16_F32>, any_fpround>; + defm V_CVT_F16_F32_t16 : VOP1Inst <"v_cvt_f16_f32_t16", VOPProfile_Fake16<VOP_F16_F32>, any_fpround>; } // End FPDPRounding = 1, isReMaterializable = 0 let OtherPredicates = [NotHasTrue16BitInsts] in defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, any_fpextend>; let OtherPredicates = [HasTrue16BitInsts] in -defm V_CVT_F32_F16_t16 : VOP1Inst <"v_cvt_f32_f16_t16", VOPProfile_True16<VOP_F32_F16>, any_fpextend>; +defm V_CVT_F32_F16_t16 : VOP1Inst <"v_cvt_f32_f16_t16", VOPProfile_Fake16<VOP_F32_F16>, any_fpextend>; let ReadsModeReg = 0, mayRaiseFPException = 0 in { defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; @@ -317,7 +323,7 @@ defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f3 defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>; defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>; defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>; -defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, 
frint>; +defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, froundeven>; defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>; let TRANS = 1, SchedRW = [WriteTrans32] in { @@ -326,7 +332,7 @@ defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, AMDGPUlog>; defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>; defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>; defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>; -defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, any_amdgcn_sqrt>; +defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, int_amdgcn_sqrt>; } // End TRANS = 1, SchedRW = [WriteTrans32] let TRANS = 1, SchedRW = [WriteTrans64] in { @@ -458,7 +464,7 @@ let SubtargetPredicate = isGFX7Plus in { let SchedRW = [WriteDoubleAdd] in { defm V_TRUNC_F64 : VOP1Inst<"v_trunc_f64", VOP_F64_F64, ftrunc>; defm V_CEIL_F64 : VOP1Inst<"v_ceil_f64", VOP_F64_F64, fceil>; - defm V_RNDNE_F64 : VOP1Inst<"v_rndne_f64", VOP_F64_F64, frint>; + defm V_RNDNE_F64 : VOP1Inst<"v_rndne_f64", VOP_F64_F64, froundeven>; defm V_FLOOR_F64 : VOP1Inst<"v_floor_f64", VOP_F64_F64, ffloor>; } // End SchedRW = [WriteDoubleAdd] } // End SubtargetPredicate = isGFX7Plus @@ -502,7 +508,7 @@ defm V_FREXP_EXP_I16_F16_t16 : VOP1Inst <"v_frexp_exp_i16_f16_t16", VOP_I16_F16_ defm V_FLOOR_F16 : VOP1Inst_t16 <"v_floor_f16", VOP_F16_F16, ffloor>; defm V_CEIL_F16 : VOP1Inst_t16 <"v_ceil_f16", VOP_F16_F16, fceil>; defm V_TRUNC_F16 : VOP1Inst_t16 <"v_trunc_f16", VOP_F16_F16, ftrunc>; -defm V_RNDNE_F16 : VOP1Inst_t16 <"v_rndne_f16", VOP_F16_F16, frint>; +defm V_RNDNE_F16 : VOP1Inst_t16 <"v_rndne_f16", VOP_F16_F16, froundeven>; let FPDPRounding = 1 in { defm V_FRACT_F16 : VOP1Inst_t16 <"v_fract_f16", VOP_F16_F16, AMDGPUfract>; } // End FPDPRounding = 1 @@ -584,18 +590,28 @@ let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0, } class Cvt_F32_F8_Pat<SDPatternOperator node, int index, - VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat< + VOP1_SDWA_Pseudo inst_sdwa> : GCNPat< (f32 (node i32:$src, index)), - !if (index, - (inst_sdwa 0, $src, 0, 0, index), - (inst_e32 $src)) + (inst_sdwa 0, $src, 0, 0, index) >; -foreach Index = [0, 1, 2, 3] in { - def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_fp8, Index, - V_CVT_F32_FP8_e32, V_CVT_F32_FP8_sdwa>; - def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_bf8, Index, - V_CVT_F32_BF8_e32, V_CVT_F32_BF8_sdwa>; +let OtherPredicates = [HasCvtFP8VOP1Bug] in { + def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)), + (V_CVT_F32_FP8_sdwa 0, $src, 0, 0, 0)>; + def : GCNPat<(f32 (int_amdgcn_cvt_f32_bf8 i32:$src, 0)), + (V_CVT_F32_BF8_sdwa 0, $src, 0, 0, 0)>; +} + +let OtherPredicates = [HasNoCvtFP8VOP1Bug] in { + def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)), + (V_CVT_F32_FP8_e32 $src)>; + def : GCNPat<(f32 (int_amdgcn_cvt_f32_bf8 i32:$src, 0)), + (V_CVT_F32_BF8_e32 $src)>; +} + +foreach Index = [1, 2, 3] in { + def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_fp8, Index, V_CVT_F32_FP8_sdwa>; + def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_bf8, Index, V_CVT_F32_BF8_sdwa>; } class Cvt_PK_F32_F8_Pat<SDPatternOperator node, int index, @@ -646,6 +662,7 @@ let SubtargetPredicate = isGFX11Plus in { getVOP1Pat64<int_amdgcn_permlane64, VOP_MOVRELS>.ret, /*VOP1Only=*/ 1>; + defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>; @@ 
-677,6 +694,13 @@ class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, int subtarget, VOPProfile p = p let SubtargetPredicate = HasDPP16; } +class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> : + VOP1_DPP16 <op, ps, Gen.Subtarget, p> { + let AssemblerPredicate = Gen.AssemblerPredicate; + let DecoderNamespace = "DPP"#Gen.DecoderNamespace; +} + + class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : VOP_DPP8<ps.OpName, p> { let hasSideEffects = ps.hasSideEffects; @@ -691,137 +715,173 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : let Inst{31-25} = 0x3f; } +class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> : + VOP1_DPP8<op, ps, p> { + let AssemblerPredicate = Gen.AssemblerPredicate; + let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; +} + //===----------------------------------------------------------------------===// -// GFX11. +// GFX11, GFX12 //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { - multiclass VOP1Only_Real_gfx11<bits<9> op> { - let IsSingle = 1 in - def _gfx11 : - VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.GFX11>, - VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>; - } - multiclass VOP1_Real_e32_gfx11<bits<9> op, string opName = NAME> { - defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); - def _e32_gfx11 : - VOP1_Real<ps, SIEncodingFamily.GFX11>, - VOP1e<op{7-0}, ps.Pfl>; - } - multiclass VOP1_Real_e32_with_name_gfx11<bits<9> op, string opName, - string asmName> { - defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); - let AsmString = asmName # ps.AsmOperands in { - defm NAME : VOP1_Real_e32_gfx11<op, opName>; - } - } - multiclass VOP1_Real_e64_gfx11<bits<9> op> { - def _e64_gfx11 : - VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX11>, - VOP3e_gfx11<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; - } - multiclass VOP1_Real_dpp_gfx11<bits<9> op, string opName = NAME> { - defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); - def _dpp_gfx11 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX11> { - let DecoderNamespace = "DPPGFX11"; - } - } - multiclass VOP1_Real_dpp_with_name_gfx11<bits<9> op, string opName, - string asmName> { - defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); - let AsmString = asmName # ps.Pfl.AsmDPP16, DecoderNamespace = "DPPGFX11" in { - defm NAME : VOP1_Real_dpp_gfx11<op, opName>; - } +multiclass VOP1Only_Real<GFXGen Gen, bits<9> op> { + let IsSingle = 1 in + def Gen.Suffix : + VOP1_Real_Gen<!cast<VOP1_Pseudo>(NAME), Gen>, + VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>; +} + +multiclass VOP1_Real_e32<GFXGen Gen, bits<9> op, string opName = NAME> { + defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); + def _e32#Gen.Suffix : + VOP1_Real_Gen<ps, Gen>, + VOP1e<op{7-0}, ps.Pfl>; +} + +multiclass VOP1_Real_e32_with_name<GFXGen Gen, bits<9> op, string opName, + string asmName> { + defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); + let AsmString = asmName # ps.AsmOperands in { + defm NAME : VOP1_Real_e32<Gen, op, opName>; } - multiclass VOP1_Real_dpp8_gfx11<bits<9> op, string opName = NAME> { - defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); - def _dpp8_gfx11 : VOP1_DPP8<op{7-0}, ps> { - let DecoderNamespace = "DPP8GFX11"; - } +} + +multiclass VOP1_Real_e64<GFXGen Gen, bits<9> op> { + def _e64#Gen.Suffix : + VOP3_Real_Gen<!cast<VOP3_Pseudo>(NAME#"_e64"), Gen>, + VOP3e_gfx11_gfx12<{0, 1, 1, op{6-0}}, 
!cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; +} + +multiclass VOP1_Real_dpp<GFXGen Gen, bits<9> op, string opName = NAME> { + defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); + def _dpp#Gen.Suffix : VOP1_DPP16_Gen<op{7-0}, !cast<VOP1_DPP_Pseudo>(opName#"_dpp"), Gen>; +} + +multiclass VOP1_Real_dpp_with_name<GFXGen Gen, bits<9> op, string opName, + string asmName> { + defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); + let AsmString = asmName # ps.Pfl.AsmDPP16 in { + defm NAME : VOP1_Real_dpp<Gen, op, opName>; } - multiclass VOP1_Real_dpp8_with_name_gfx11<bits<9> op, string opName, - string asmName> { - defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); - let AsmString = asmName # ps.Pfl.AsmDPP8, DecoderNamespace = "DPP8GFX11" in { - defm NAME : VOP1_Real_dpp8_gfx11<op, opName>; - } +} + +multiclass VOP1_Real_dpp8<GFXGen Gen, bits<9> op, string opName = NAME> { + defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); + def _dpp8#Gen.Suffix : VOP1_DPP8_Gen<op{7-0}, ps, Gen>; +} + +multiclass VOP1_Real_dpp8_with_name<GFXGen Gen, bits<9> op, string opName, + string asmName> { + defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); + let AsmString = asmName # ps.Pfl.AsmDPP8 in { + defm NAME : VOP1_Real_dpp8<Gen, op, opName>; } -} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" +} -multiclass VOP1_Realtriple_e64_gfx11<bits<9> op> { - defm NAME : VOP3_Realtriple_gfx11<{0, 1, 1, op{6-0}}, /*isSingle=*/ 0, NAME>; +multiclass VOP1_Realtriple_e64<GFXGen Gen, bits<9> op> { + defm NAME : VOP3_Realtriple<Gen, {0, 1, 1, op{6-0}}, /*isSingle=*/ 0, NAME>; } -multiclass VOP1_Realtriple_e64_with_name_gfx11<bits<9> op, string opName, + +multiclass VOP1_Realtriple_e64_with_name<GFXGen Gen, bits<9> op, string opName, string asmName> { - defm NAME : VOP3_Realtriple_with_name_gfx11<{0, 1, 1, op{6-0}}, opName, + defm NAME : VOP3_Realtriple_with_name<Gen, {0, 1, 1, op{6-0}}, opName, asmName>; } -multiclass VOP1_Real_FULL_gfx11<bits<9> op> : - VOP1_Real_e32_gfx11<op>, VOP1_Realtriple_e64_gfx11<op>, - VOP1_Real_dpp_gfx11<op>, VOP1_Real_dpp8_gfx11<op>; +multiclass VOP1_Real_FULL<GFXGen Gen, bits<9> op> : + VOP1_Real_e32<Gen, op>, VOP1_Realtriple_e64<Gen, op>, + VOP1_Real_dpp<Gen, op>, VOP1_Real_dpp8<Gen, op>; multiclass VOP1_Real_NO_VOP3_with_name_gfx11<bits<9> op, string opName, - string asmName> { - defm NAME : VOP1_Real_e32_with_name_gfx11<op, opName, asmName>, - VOP1_Real_dpp_with_name_gfx11<op, opName, asmName>, - VOP1_Real_dpp8_with_name_gfx11<op, opName, asmName>; + string asmName> { + defm NAME : VOP1_Real_e32_with_name<GFX11Gen, op, opName, asmName>, + VOP1_Real_dpp_with_name<GFX11Gen, op, opName, asmName>, + VOP1_Real_dpp8_with_name<GFX11Gen, op, opName, asmName>; defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); def gfx11_alias : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>; } -multiclass VOP1_Real_FULL_with_name_gfx11<bits<9> op, string opName, +multiclass VOP1_Real_NO_VOP3_with_name_gfx12<bits<9> op, string opName, + string asmName> { + defm NAME : VOP1_Real_e32_with_name<GFX12Gen, op, opName, asmName>, + VOP1_Real_dpp_with_name<GFX12Gen, op, opName, asmName>, + VOP1_Real_dpp8_with_name<GFX12Gen, op, opName, asmName>; +} + +multiclass VOP1_Real_FULL_with_name<GFXGen Gen, bits<9> op, string opName, string asmName> : - VOP1_Real_NO_VOP3_with_name_gfx11<op, opName, asmName>, - VOP1_Realtriple_e64_with_name_gfx11<op, opName, asmName>; + VOP1_Real_e32_with_name<Gen, op, opName, asmName>, + VOP1_Real_dpp_with_name<Gen, op, opName, asmName>, + VOP1_Real_dpp8_with_name<Gen, op, opName, asmName>, + 
VOP1_Realtriple_e64_with_name<Gen, op, opName, asmName>; -multiclass VOP1_Real_FULL_t16_gfx11<bits<9> op, string asmName, - string opName = NAME> : - VOP1_Real_FULL_with_name_gfx11<op, opName, asmName>; +multiclass VOP1_Real_NO_DPP<GFXGen Gen, bits<9> op> : + VOP1_Real_e32<Gen, op>, VOP1_Real_e64<Gen, op>; -multiclass VOP1_Real_NO_DPP_gfx11<bits<9> op> : - VOP1_Real_e32_gfx11<op>, VOP1_Real_e64_gfx11<op>; +multiclass VOP1_Real_FULL_t16_gfx11_gfx12<bits<9> op, string asmName, + string opName = NAME> : + VOP1_Real_FULL_with_name<GFX11Gen, op, opName, asmName>, + VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>; -defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11<0x00c, +multiclass VOP1_Real_FULL_with_name_gfx11_gfx12<bits<9> op, string opName, + string asmName> : + VOP1_Real_FULL_with_name<GFX11Gen, op, opName, asmName>, + VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>; + +multiclass VOP1Only_Real_gfx11_gfx12<bits<9> op> : + VOP1Only_Real<GFX11Gen, op>, VOP1Only_Real<GFX12Gen, op>; + +multiclass VOP1_Real_FULL_gfx11_gfx12<bits<9> op> : + VOP1_Real_FULL<GFX11Gen, op>, VOP1_Real_FULL<GFX12Gen, op>; + +multiclass VOP1_Real_NO_DPP_OP_SEL_with_name<GFXGen Gen, bits<9> op, + string opName, string asmName> : + VOP1_Real_e32_with_name<Gen, op, opName, asmName>, + VOP3_Real_with_name<Gen, {0, 1, 1, op{6-0}}, opName, asmName>; + + +defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00c, "V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">; -defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11<0x00d, +defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00d, "V_CVT_FLR_I32_F32", "v_cvt_floor_i32_f32">; -defm V_CLZ_I32_U32 : VOP1_Real_FULL_with_name_gfx11<0x039, +defm V_CLZ_I32_U32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x039, "V_FFBH_U32", "v_clz_i32_u32">; -defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11<0x03a, +defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03a, "V_FFBL_B32", "v_ctz_i32_b32">; -defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11<0x03b, +defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03b, "V_FFBH_I32", "v_cls_i32">; -defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11<0x067>; -defm V_NOT_B16_t16 : VOP1_Real_FULL_t16_gfx11<0x069, "v_not_b16">; -defm V_CVT_I32_I16_t16 : VOP1_Real_FULL_t16_gfx11<0x06a, "v_cvt_i32_i16">; -defm V_CVT_U32_U16_t16 : VOP1_Real_FULL_t16_gfx11<0x06b, "v_cvt_u32_u16">; - -defm V_CVT_F16_U16_t16 : VOP1_Real_FULL_t16_gfx11<0x050, "v_cvt_f16_u16">; -defm V_CVT_F16_I16_t16 : VOP1_Real_FULL_t16_gfx11<0x051, "v_cvt_f16_i16">; -defm V_CVT_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x052, "v_cvt_u16_f16">; -defm V_CVT_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x053, "v_cvt_i16_f16">; -defm V_RCP_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x054, "v_rcp_f16">; -defm V_SQRT_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x055, "v_sqrt_f16">; -defm V_RSQ_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x056, "v_rsq_f16">; -defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x057, "v_log_f16">; -defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x058, "v_exp_f16">; -defm V_FREXP_MANT_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x059, "v_frexp_mant_f16">; -defm V_FREXP_EXP_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05a, "v_frexp_exp_i16_f16">; -defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05b, "v_floor_f16">; -defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05c, "v_ceil_f16">; -defm V_TRUNC_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05d, "v_trunc_f16">; -defm V_RNDNE_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05e, "v_rndne_f16">; -defm V_FRACT_F16_t16 : 
VOP1_Real_FULL_t16_gfx11<0x05f, "v_fract_f16">;
-defm V_SIN_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x060, "v_sin_f16">;
-defm V_COS_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x061, "v_cos_f16">;
-defm V_SAT_PK_U8_I16_t16 : VOP1_Real_FULL_t16_gfx11<0x062, "v_sat_pk_u8_i16">;
-defm V_CVT_NORM_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x063, "v_cvt_norm_i16_f16">;
-defm V_CVT_NORM_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x064, "v_cvt_norm_u16_f16">;
-
-defm V_CVT_F16_F32_t16 : VOP1_Real_FULL_t16_gfx11<0x00a, "v_cvt_f16_f32">;
-defm V_CVT_F32_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x00b, "v_cvt_f32_f16">;
+defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>;
+defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">;
+defm V_NOT_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x069, "v_not_b16">;
+defm V_CVT_I32_I16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">;
+defm V_CVT_U32_U16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">;
+
+defm V_CVT_F16_U16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x050, "v_cvt_f16_u16">;
+defm V_CVT_F16_I16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x051, "v_cvt_f16_i16">;
+defm V_CVT_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x052, "v_cvt_u16_f16">;
+defm V_CVT_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x053, "v_cvt_i16_f16">;
+defm V_RCP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">;
+defm V_SQRT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">;
+defm V_RSQ_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x056, "v_rsq_f16">;
+defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">;
+defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
+defm V_FREXP_MANT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">;
+defm V_FREXP_EXP_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05a, "v_frexp_exp_i16_f16">;
+defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
+defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
+defm V_TRUNC_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05d, "v_trunc_f16">;
+defm V_RNDNE_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">;
+defm V_FRACT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">;
+defm V_SIN_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, "v_sin_f16">;
+defm V_COS_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">;
+defm V_SAT_PK_U8_I16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">;
+defm V_CVT_NORM_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x063, "v_cvt_norm_i16_f16">;
+defm V_CVT_NORM_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x064, "v_cvt_norm_u16_f16">;
+
+defm V_CVT_F16_F32_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x00a, "v_cvt_f16_f32">;
+defm V_CVT_F32_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x00b, "v_cvt_f32_f16">;

//===----------------------------------------------------------------------===//
// GFX10.
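The v_frexp_mant_f16 / v_frexp_exp_i16_f16 pair realized above (and their f32 counterparts) expose the usual frexp-style split of a value into a mantissa in [0.5, 1.0) and a power-of-two exponent. A minimal host-side C++ sketch of that assumed reference semantics, using the standard std::frexp/std::ldexp; this is illustrative only and not code from this commit:

// Illustrative sketch, not part of this commit: assumed reference semantics
// for the frexp_mant / frexp_exp instruction pair, modeled with <cmath>.
#include <cmath>
#include <cstdio>

int main() {
  float x = 12.5f;
  int e = 0;
  // std::frexp returns a mantissa in [0.5, 1.0) and writes the exponent,
  // so that mant * 2^exp reconstructs the original finite, nonzero value.
  float m = std::frexp(x, &e);
  std::printf("x=%g mant=%g exp=%d recombined=%g\n", x, m, e, std::ldexp(m, e));
  return 0;
}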
@@ -870,17 +930,23 @@ multiclass VOP1_Real_gfx10<bits<9> op> : VOP1_Real_sdwa_gfx10<op>, VOP1_Real_dpp_gfx10<op>, VOP1_Real_dpp8_gfx10<op>; -multiclass VOP1_Real_gfx10_FULL_gfx11<bits<9> op> : - VOP1_Real_gfx10<op>, VOP1_Real_FULL_gfx11<op>; +multiclass VOP1_Real_gfx10_FULL_gfx11_gfx12<bits<9> op> : + VOP1_Real_gfx10<op>, + VOP1_Real_FULL<GFX11Gen, op>, + VOP1_Real_FULL<GFX12Gen, op>; -multiclass VOP1_Real_gfx10_NO_DPP_gfx11<bits<9> op> : - VOP1_Real_gfx10<op>, VOP1_Real_NO_DPP_gfx11<op>; +multiclass VOP1_Real_gfx10_NO_DPP_gfx11_gfx12<bits<9> op> : + VOP1_Real_gfx10<op>, + VOP1_Real_NO_DPP<GFX11Gen, op>, + VOP1_Real_NO_DPP<GFX12Gen, op>; -multiclass VOP1Only_Real_gfx10_gfx11<bits<9> op> : - VOP1Only_Real_gfx10<op>, VOP1Only_Real_gfx11<op>; +multiclass VOP1Only_Real_gfx10_gfx11_gfx12<bits<9> op> : + VOP1Only_Real_gfx10<op>, + VOP1Only_Real<GFX11Gen, op>, + VOP1Only_Real<GFX12Gen, op>; -defm V_PIPEFLUSH : VOP1_Real_gfx10_NO_DPP_gfx11<0x01b>; -defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10_FULL_gfx11<0x048>; +defm V_PIPEFLUSH : VOP1_Real_gfx10_NO_DPP_gfx11_gfx12<0x01b>; +defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10_FULL_gfx11_gfx12<0x048>; defm V_CVT_F16_U16 : VOP1_Real_gfx10<0x050>; defm V_CVT_F16_I16 : VOP1_Real_gfx10<0x051>; defm V_CVT_U16_F16 : VOP1_Real_gfx10<0x052>; @@ -903,11 +969,11 @@ defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10<0x062>; defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10<0x063>; defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10<0x064>; -defm V_SWAP_B32 : VOP1Only_Real_gfx10_gfx11<0x065>; -defm V_SWAPREL_B32 : VOP1Only_Real_gfx10_gfx11<0x068>; +defm V_SWAP_B32 : VOP1Only_Real_gfx10_gfx11_gfx12<0x065>; +defm V_SWAPREL_B32 : VOP1Only_Real_gfx10_gfx11_gfx12<0x068>; //===----------------------------------------------------------------------===// -// GFX7, GFX10. +// GFX7, GFX10, GFX11, GFX12 //===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { @@ -926,22 +992,20 @@ let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { multiclass VOP1_Real_gfx7<bits<9> op> : VOP1_Real_e32_gfx7<op>, VOP1_Real_e64_gfx7<op>; -multiclass VOP1_Real_gfx7_gfx10<bits<9> op> : - VOP1_Real_gfx7<op>, VOP1_Real_gfx10<op>; - -multiclass VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<bits<9> op> : - VOP1_Real_gfx7_gfx10<op>, VOP1_Real_NO_DPP_gfx11<op>; +multiclass VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx12<bits<9> op> : + VOP1_Real_gfx7<op>, VOP1_Real_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>, + VOP1_Real_NO_DPP<GFX12Gen, op>; defm V_LOG_LEGACY_F32 : VOP1_Real_gfx7<0x045>; defm V_EXP_LEGACY_F32 : VOP1_Real_gfx7<0x046>; -defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x017>; -defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x018>; -defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x019>; -defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x01a>; +defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x017>; +defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x018>; +defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x019>; +defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x01a>; //===----------------------------------------------------------------------===// -// GFX6, GFX7, GFX10, GFX11. 
+// GFX6, GFX7, GFX10, GFX11, GFX12 //===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { @@ -963,11 +1027,13 @@ multiclass VOP1_Real_gfx6_gfx7<bits<9> op> : multiclass VOP1_Real_gfx6_gfx7_gfx10<bits<9> op> : VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10<op>; -multiclass VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<bits<9> op> : - VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_FULL_gfx11<op>; +multiclass VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<bits<9> op> : + VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_FULL<GFX11Gen, op>, + VOP1_Real_FULL<GFX12Gen, op>; -multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<bits<9> op> : - VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_NO_DPP_gfx11<op>; +multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<bits<9> op> : + VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>, + VOP1_Real_NO_DPP<GFX12Gen, op>; defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>; defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>; @@ -977,57 +1043,57 @@ defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>; defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>; defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>; -defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x000>; -defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x001>; -defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x003>; -defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x004>; -defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x005>; -defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x006>; -defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x007>; -defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x008>; +defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x000>; +defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x001>; +defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x003>; +defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x004>; +defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x005>; +defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x006>; +defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x007>; +defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x008>; defm V_CVT_F16_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00a>; defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10<0x00b>; defm V_CVT_RPI_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00c>; defm V_CVT_FLR_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00d>; -defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x00e>; -defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x00f>; -defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x010>; -defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x011>; -defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x012>; -defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x013>; -defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x014>; -defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x015>; -defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x016>; -defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x020>; -defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x021>; -defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x022>; -defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x023>; -defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x024>; -defm V_EXP_F32 : 
VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x025>; -defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x027>; -defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02a>; -defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02b>; -defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02e>; -defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x02f>; -defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x031>; -defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x033>; -defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x034>; -defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x035>; -defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x036>; -defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x037>; -defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x038>; +defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x00e>; +defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x00f>; +defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x010>; +defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x011>; +defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x012>; +defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x013>; +defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x014>; +defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x015>; +defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x016>; +defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x020>; +defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x021>; +defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x022>; +defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x023>; +defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x024>; +defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x025>; +defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x027>; +defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02a>; +defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02b>; +defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02e>; +defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x02f>; +defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x031>; +defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x033>; +defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x034>; +defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x035>; +defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x036>; +defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x037>; +defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x038>; defm V_FFBH_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x039>; defm V_FFBL_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x03a>; defm V_FFBH_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x03b>; -defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03c>; -defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03d>; -defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03e>; -defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x03f>; -defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x040>; +defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x03c>; +defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x03d>; +defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x03e>; +defm V_FREXP_EXP_I32_F32 : 
VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x03f>; +defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x040>; defm V_CLREXCP : VOP1_Real_gfx6_gfx7_gfx10<0x041>; -defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x042>; -defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x043>; -defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x044>; +defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x042>; +defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x043>; +defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x044>; //===----------------------------------------------------------------------===// // GFX8, GFX9 (VI). @@ -1163,7 +1229,7 @@ defm V_CVT_NORM_U16_F16 : VOP1_Real_vi<0x4e>; defm V_ACCVGPR_MOV_B32 : VOP1Only_Real_vi<0x52>; -let VOP1 = 1, SubtargetPredicate = isGFX8GFX9, Uses = [EXEC, M0] in { +let VOP1 = 1, SubtargetPredicate = isGFX8GFX9, Uses = [EXEC, M0], Size = V_MOV_B32_e32.Size in { // Copy of v_mov_b32 with $vdst as a use operand for use with VGPR // indexing mode. vdst can't be treated as a def for codegen purposes, @@ -1193,8 +1259,8 @@ def : GCNPat < (as_i1timm $bound_ctrl)) >; -def : GCNPat < - (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl, +class UpdateDPPPat<ValueType vt> : GCNPat < + (vt (int_amdgcn_update_dpp vt:$old, vt:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, timm:$bound_ctrl)), (V_MOV_B32_dpp VGPR_32:$old, VGPR_32:$src, (as_i32timm $dpp_ctrl), @@ -1202,6 +1268,11 @@ def : GCNPat < (as_i1timm $bound_ctrl)) >; +def : UpdateDPPPat<i32>; +def : UpdateDPPPat<f32>; +def : UpdateDPPPat<v2i16>; +def : UpdateDPPPat<v2f16>; + } // End OtherPredicates = [isGFX8Plus] let OtherPredicates = [isGFX8Plus] in { @@ -1303,3 +1374,15 @@ def : GCNPat < (as_i32timm $dpp8), (i32 DPP8Mode.FI_0)) >; } // End OtherPredicates = [isGFX11Only] + +//===----------------------------------------------------------------------===// +// GFX12 +//===----------------------------------------------------------------------===// + +let OtherPredicates = [isGFX12Only] in { +def : GCNPat < + (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)), + (V_MOV_B32_dpp8_gfx12 VGPR_32:$src, VGPR_32:$src, + (as_i32timm $dpp8), (i32 DPP8Mode.FI_0)) +>; +} // End OtherPredicates = [isGFX12Only] diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 481a162748e6..0aa62ea77b11 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -109,6 +109,14 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo let mayStore = ps.mayStore; } +class VOP2_Real_Gen <VOP2_Pseudo ps, GFXGen Gen, string real_name = ps.Mnemonic> : + VOP2_Real <ps, Gen.Subtarget, real_name> { + let AssemblerPredicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, + Gen.AssemblerPredicate); + let DecoderNamespace = Gen.DecoderNamespace# + !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); +} + class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : VOP_SDWA_Pseudo <OpName, P, pattern> { let AsmMatchConverter = "cvtSdwaVOP2"; @@ -194,9 +202,12 @@ multiclass VOP2Inst_t16<string opName, let SubtargetPredicate = NotHasTrue16BitInsts, OtherPredicates = [Has16BitInsts] in { defm NAME : VOP2Inst<opName, P, node, revOp, GFX9Renamed>; } - let SubtargetPredicate = HasTrue16BitInsts in { + let SubtargetPredicate = UseRealTrue16Insts in { defm 
_t16 : VOP2Inst<opName#"_t16", VOPProfile_True16<P>, node, revOp#"_t16", GFX9Renamed>; } + let SubtargetPredicate = UseFakeTrue16Insts in { + defm _fake16 : VOP2Inst<opName#"_fake16", VOPProfile_Fake16<P>, node, revOp#"_fake16", GFX9Renamed>; + } } // Creating a _t16_e32 pseudo when there is no corresponding real instruction on @@ -212,7 +223,7 @@ multiclass VOP2Inst_e64_t16<string opName, defm NAME : VOP2Inst<opName, P, node, revOp, GFX9Renamed>; } let SubtargetPredicate = HasTrue16BitInsts in { - defm _t16 : VOP2Inst_e64<opName#"_t16", VOPProfile_True16<P>, node, revOp#"_t16", GFX9Renamed>; + defm _t16 : VOP2Inst_e64<opName#"_t16", VOPProfile_Fake16<P>, node, revOp#"_t16", GFX9Renamed>; } } @@ -378,7 +389,7 @@ def VOP_MADAK_F16 : VOP_MADAK <f16>; def VOP_MADAK_F16_t16 : VOP_MADAK <f16> { let IsTrue16 = 1; let DstRC = VOPDstOperand<VGPR_32_Lo128>; - let Ins32 = (ins VSrcT_f16_Lo128_Deferred:$src0, VGPR_32_Lo128:$src1, ImmOpType:$imm); + let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, VGPR_32_Lo128:$src1, ImmOpType:$imm); } def VOP_MADAK_F32 : VOP_MADAK <f32>; @@ -403,7 +414,7 @@ def VOP_MADMK_F16 : VOP_MADMK <f16>; def VOP_MADMK_F16_t16 : VOP_MADMK <f16> { let IsTrue16 = 1; let DstRC = VOPDstOperand<VGPR_32_Lo128>; - let Ins32 = (ins VSrcT_f16_Lo128_Deferred:$src0, ImmOpType:$imm, VGPR_32_Lo128:$src1); + let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, ImmOpType:$imm, VGPR_32_Lo128:$src1); } def VOP_MADMK_F32 : VOP_MADMK <f32>; @@ -859,6 +870,17 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>; def : divergent_i64_BinOp <or, V_OR_B32_e64>; def : divergent_i64_BinOp <xor, V_XOR_B32_e64>; +// mul24 w/ 64 bit output. +class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat< + (i64 (Op i32:$src0, i32:$src1)), + (REG_SEQUENCE VReg_64, + (InstLo $src0, $src1), sub0, + (InstHi $src0, $src1), sub1) +>; + +def : mul24_64_Pat<AMDGPUmul_i24, V_MUL_I32_I24_e64, V_MUL_HI_I32_I24_e64>; +def : mul24_64_Pat<AMDGPUmul_u24, V_MUL_U32_U24_e64, V_MUL_HI_U32_U24_e64>; + //===----------------------------------------------------------------------===// // 16-Bit Operand Instructions //===----------------------------------------------------------------------===// @@ -874,7 +896,7 @@ def LDEXP_F16_VOPProfile : VOPProfile <[f16, f16, f16, untyped]> { let HasSrc1FloatMods = 0; let Src1ModSDWA = Int16SDWAInputMods; } -def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16<VOP_F16_F16_F16> { +def LDEXP_F16_VOPProfile_True16 : VOPProfile_Fake16<VOP_F16_F16_F16> { let Src1RC32 = RegisterOperand<VGPR_32_Lo128>; let Src1DPP = VGPR_32_Lo128; let Src1ModDPP = IntT16VRegInputMods; @@ -925,13 +947,13 @@ def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_t16_e64>; let SubtargetPredicate = isGFX11Plus in { let isCommutable = 1 in { - defm V_AND_B16_t16 : VOP2Inst_e64 <"v_and_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, and>; - defm V_OR_B16_t16 : VOP2Inst_e64 <"v_or_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, or>; - defm V_XOR_B16_t16 : VOP2Inst_e64 <"v_xor_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, xor>; + defm V_AND_B16_t16 : VOP2Inst_e64 <"v_and_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, and>; + defm V_OR_B16_t16 : VOP2Inst_e64 <"v_or_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, or>; + defm V_XOR_B16_t16 : VOP2Inst_e64 <"v_xor_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, xor>; } // End isCommutable = 1 } // End SubtargetPredicate = isGFX11Plus -let FPDPRounding = 1, isReMaterializable = 1 in { +let FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1 in { let 
SubtargetPredicate = isGFX10Plus, OtherPredicates = [NotHasTrue16BitInsts] in { def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">; } @@ -947,7 +969,7 @@ let SubtargetPredicate = HasTrue16BitInsts in { def V_FMAAK_F16_t16 : VOP2_Pseudo <"v_fmaak_f16_t16", VOP_MADAK_F16_t16, [], "">; } } // End isCommutable = 1 -} // End FPDPRounding = 1, isReMaterializable = 1 +} // End FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1 let Constraints = "$vdst = $src2", DisableEncoding="$src2", @@ -1089,12 +1111,12 @@ let AddedComplexity = 30 in { } } // End AddedComplexity = 30 -let SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1 in { +let SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1, FixedSize = 1 in { def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">, VOPD_Component<0x2, "v_fmamk_f32">; let isCommutable = 1 in def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">, VOPD_Component<0x1, "v_fmaak_f32">; -} +} // End SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1, FixedSize = 1 let SubtargetPredicate = HasPkFmacF16Inst in { defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>; @@ -1201,6 +1223,20 @@ def : VOPBinOpClampPat<uaddsat, V_ADD_U16_e64, i16>; def : VOPBinOpClampPat<usubsat, V_SUB_U16_e64, i16>; } +let SubtargetPredicate = isGFX12Plus, isReMaterializable = 1 in { + let SchedRW = [WriteDoubleAdd], isCommutable = 1 in { + let FPDPRounding = 1 in { + defm V_ADD_F64_pseudo : VOP2Inst <"v_add_f64_pseudo", VOP_F64_F64_F64, any_fadd>; + defm V_MUL_F64_pseudo : VOP2Inst <"v_mul_f64_pseudo", VOP_F64_F64_F64, fmul>; + } // End FPDPRounding = 1 + defm V_MIN_NUM_F64 : VOP2Inst <"v_min_num_f64", VOP_F64_F64_F64, fminnum_like>; + defm V_MAX_NUM_F64 : VOP2Inst <"v_max_num_f64", VOP_F64_F64_F64, fmaxnum_like>; + } // End SchedRW = [WriteDoubleAdd], isCommutable = 1 + let SchedRW = [Write64Bit] in { + defm V_LSHLREV_B64_pseudo : VOP2Inst <"v_lshlrev_b64_pseudo", VOP_I64_I32_I64, clshl_rev_64>; + } // End SchedRW = [Write64Bit] +} // End SubtargetPredicate = isGFX12Plus, isReMaterializable = 1 + //===----------------------------------------------------------------------===// // DPP Encodings //===----------------------------------------------------------------------===// @@ -1236,6 +1272,15 @@ class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, int subtarget, Base_VOP2_DPP16<op, ps, opName, p>, SIMCInstr <ps.PseudoInstr, subtarget>; +class VOP2_DPP16_Gen<bits<6> op, VOP2_DPP_Pseudo ps, GFXGen Gen, + string opName = ps.OpName, VOPProfile p = ps.Pfl> : + VOP2_DPP16<op, ps, Gen.Subtarget, opName, p> { + let AssemblerPredicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, + Gen.AssemblerPredicate); + let DecoderNamespace = "DPP"#Gen.DecoderNamespace# + !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); +} + class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps, VOPProfile p = ps.Pfl> : VOP_DPP8<ps.OpName, p> { @@ -1255,230 +1300,362 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps, let OtherPredicates = ps.OtherPredicates; } + +class VOP2_DPP8_Gen<bits<6> op, VOP2_Pseudo ps, GFXGen Gen, + VOPProfile p = ps.Pfl> : + VOP2_DPP8<op, ps, p> { + let AssemblerPredicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, + Gen.AssemblerPredicate); + let DecoderNamespace = "DPP8"#Gen.DecoderNamespace# + !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); +} //===----------------------------------------------------------------------===// -// GFX11. 
+// GFX11, GFX12 //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { - //===------------------------------- VOP2 -------------------------------===// - multiclass VOP2Only_Real_MADK_gfx11<bits<6> op> { - def _gfx11 : - VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.GFX11>, - VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>; +//===------------------------------- VOP2 -------------------------------===// +multiclass VOP2Only_Real_MADK<GFXGen Gen, bits<6> op> { + def Gen.Suffix : + VOP2_Real_Gen<!cast<VOP2_Pseudo>(NAME), Gen>, + VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>; +} + +multiclass VOP2Only_Real_MADK_with_name<GFXGen Gen, bits<6> op, string asmName, + string opName = NAME> { + def Gen.Suffix : + VOP2_Real_Gen<!cast<VOP2_Pseudo>(opName), Gen>, + VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(opName).Pfl> { + VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName); + let AsmString = asmName # ps.AsmOperands; } - multiclass VOP2Only_Real_MADK_gfx11_with_name<bits<6> op, string asmName, - string opName = NAME> { - def _gfx11 : - VOP2_Real<!cast<VOP2_Pseudo>(opName), SIEncodingFamily.GFX11>, - VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(opName).Pfl> { - VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName); +} + +multiclass VOP2_Real_e32<GFXGen Gen, bits<6> op> { + def _e32#Gen.Suffix : + VOP2_Real_Gen<!cast<VOP2_Pseudo>(NAME#"_e32"), Gen>, + VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>; +} + +multiclass VOP2Only_Real_e32<GFXGen Gen, bits<6> op> { + let IsSingle = 1 in + defm NAME: VOP2_Real_e32<Gen, op>; +} + +multiclass VOP2_Real_e64<GFXGen Gen, bits<6> op> { + def _e64#Gen.Suffix : + VOP3_Real_Gen<!cast<VOP3_Pseudo>(NAME#"_e64"), Gen>, + VOP3e_gfx11_gfx12<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; +} + +multiclass VOP2_Real_dpp<GFXGen Gen, bits<6> op> { + if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then + def _dpp#Gen.Suffix : VOP2_DPP16_Gen<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), Gen>; +} + +multiclass VOP2_Real_dpp8<GFXGen Gen, bits<6> op> { + if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then + def _dpp8#Gen.Suffix : VOP2_DPP8_Gen<op, !cast<VOP2_Pseudo>(NAME#"_e32"), Gen>; +} + +//===------------------------- VOP2 (with name) -------------------------===// +multiclass VOP2_Real_e32_with_name<GFXGen Gen, bits<6> op, string opName, + string asmName, bit single = 0> { + defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); + def _e32#Gen.Suffix : + VOP2_Real_Gen<ps, Gen, asmName>, + VOP2e<op{5-0}, ps.Pfl> { let AsmString = asmName # ps.AsmOperands; + let IsSingle = single; } - } - multiclass VOP2_Real_e32_gfx11<bits<6> op> { - def _e32_gfx11 : - VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX11>, - VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>; - } - multiclass VOP2Only_Real_e32_gfx11<bits<6> op> { - let IsSingle = 1 in - defm NAME: VOP2_Real_e32_gfx11<op>; - } - multiclass VOP2_Real_e64_gfx11<bits<6> op> { - def _e64_gfx11 : - VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX11>, - VOP3e_gfx11<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; - } - multiclass VOP2_Real_dpp_gfx11<bits<6> op> { - if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then - def _dpp_gfx11 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX11> { - let DecoderNamespace = "DPPGFX11"; - } - } - multiclass VOP2_Real_dpp8_gfx11<bits<6> op> { - if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then - def _dpp8_gfx11 : VOP2_DPP8<op, 
!cast<VOP2_Pseudo>(NAME#"_e32")> { - let DecoderNamespace = "DPP8GFX11"; +} +multiclass VOP2_Real_e64_with_name<GFXGen Gen, bits<6> op, string opName, + string asmName> { + defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); + def _e64#Gen.Suffix : + VOP3_Real_Gen<ps, Gen>, + VOP3e_gfx11_gfx12<{0, 1, 0, 0, op{5-0}}, ps.Pfl> { + let AsmString = asmName # ps.AsmOperands; } - } +} - //===------------------------- VOP2 (with name) -------------------------===// - multiclass VOP2_Real_e32_with_name_gfx11<bits<6> op, string opName, - string asmName, bit single = 0> { - defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); - def _e32_gfx11 : - VOP2_Real<ps, SIEncodingFamily.GFX11, asmName>, - VOP2e<op{5-0}, ps.Pfl> { - let AsmString = asmName # ps.AsmOperands; - let IsSingle = single; - } +multiclass VOP2_Real_dpp_with_name<GFXGen Gen, bits<6> op, string opName, + string asmName> { + defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); + if ps.Pfl.HasExtDPP then + def _dpp#Gen.Suffix : VOP2_DPP16_Gen<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), Gen> { + let AsmString = asmName # ps.Pfl.AsmDPP16; } - multiclass VOP2_Real_e64_with_name_gfx11<bits<6> op, string opName, - string asmName> { - defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); - def _e64_gfx11 : - VOP3_Real<ps, SIEncodingFamily.GFX11>, - VOP3e_gfx11<{0, 1, 0, 0, op{5-0}}, ps.Pfl> { - let AsmString = asmName # ps.AsmOperands; - } +} +multiclass VOP2_Real_dpp8_with_name<GFXGen Gen, bits<6> op, string opName, + string asmName> { + defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); + if ps.Pfl.HasExtDPP then + def _dpp8#Gen.Suffix : VOP2_DPP8_Gen<op, ps, Gen> { + let AsmString = asmName # ps.Pfl.AsmDPP8; } +} - multiclass VOP2_Real_dpp_with_name_gfx11<bits<6> op, string opName, - string asmName> { - defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); - if ps.Pfl.HasExtDPP then - def _dpp_gfx11 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), - SIEncodingFamily.GFX11> { - let AsmString = asmName # ps.Pfl.AsmDPP16; - let DecoderNamespace = "DPPGFX11"; +//===------------------------------ VOP2be ------------------------------===// +multiclass VOP2be_Real_e32<GFXGen Gen, bits<6> op, string opName, string asmName> { + defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); + def _e32#Gen.Suffix : + VOP2_Real_Gen<ps, Gen>, + VOP2e<op{5-0}, ps.Pfl> { + let AsmString = asmName # !subst(", vcc", "", ps.AsmOperands); } - } - multiclass VOP2_Real_dpp8_with_name_gfx11<bits<6> op, string opName, - string asmName> { - defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); - if ps.Pfl.HasExtDPP then - def _dpp8_gfx11 : VOP2_DPP8<op, ps> { - let AsmString = asmName # ps.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8GFX11"; +} +multiclass VOP2be_Real_dpp<GFXGen Gen, bits<6> op, string opName, string asmName> { + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then + def _dpp#Gen.Suffix : + VOP2_DPP16_Gen<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), Gen, asmName> { + string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst(", vcc", "", AsmDPP); } - } - - //===------------------------------ VOP2be ------------------------------===// - multiclass VOP2be_Real_e32_gfx11<bits<6> op, string opName, string asmName> { - defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); - def _e32_gfx11 : - VOP2_Real<ps, SIEncodingFamily.GFX11>, - VOP2e<op{5-0}, ps.Pfl> { - let AsmString = asmName # !subst(", vcc", "", ps.AsmOperands); - } - } - multiclass VOP2be_Real_dpp_gfx11<bits<6> op, string opName, string asmName> { - if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then - def 
_dpp_gfx11 : - VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX11, asmName> { - string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; - let AsmString = asmName # !subst(", vcc", "", AsmDPP); - let DecoderNamespace = "DPPGFX11"; - } - if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then - def _dpp_w32_gfx11 : - Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { - string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; - let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP); - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then - def _dpp_w64_gfx11 : - Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { - string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; - let AsmString = asmName # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } - } - multiclass VOP2be_Real_dpp8_gfx11<bits<6> op, string opName, string asmName> { - if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then - def _dpp8_gfx11 : - VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { - string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; - let AsmString = asmName # !subst(", vcc", "", AsmDPP8); - let DecoderNamespace = "DPP8GFX11"; - } - if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then - def _dpp8_w32_gfx11 : - VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { - string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; - let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8); - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then - def _dpp8_w64_gfx11 : - VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { - string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; - let AsmString = asmName # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } - } - -} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then + def _dpp_w32#Gen.Suffix : + Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { + string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP); + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + let AssemblerPredicate = Gen.AssemblerPredicate; + let DecoderNamespace = Gen.DecoderNamespace; + } + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then + def _dpp_w64#Gen.Suffix : + Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { + string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + let AssemblerPredicate = Gen.AssemblerPredicate; + let DecoderNamespace = Gen.DecoderNamespace; + } +} +multiclass VOP2be_Real_dpp8<GFXGen Gen, bits<6> op, string opName, string asmName> { + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then + def _dpp8#Gen.Suffix : + VOP2_DPP8_Gen<op, !cast<VOP2_Pseudo>(opName#"_e32"), Gen> { + string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst(", vcc", "", AsmDPP8); + } + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then + def _dpp8_w32#Gen.Suffix : + VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { + string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8); + let 
isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + let AssemblerPredicate = Gen.AssemblerPredicate; + let DecoderNamespace = Gen.DecoderNamespace; + } + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then + def _dpp8_w64#Gen.Suffix : + VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { + string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + let AssemblerPredicate = Gen.AssemblerPredicate; + let DecoderNamespace = Gen.DecoderNamespace; + } +} // We don't want to override separate decoderNamespaces within these -multiclass VOP2_Realtriple_e64_gfx11<bits<6> op> { - defm NAME : VOP3_Realtriple_gfx11<{0, 1, 0, 0, op{5-0}}, /*isSingle=*/ 0, NAME> ; +multiclass VOP2_Realtriple_e64<GFXGen Gen, bits<6> op> { + defm NAME : VOP3_Realtriple<Gen, {0, 1, 0, 0, op{5-0}}, /*isSingle=*/ 0, NAME> ; } -multiclass VOP2_Realtriple_e64_with_name_gfx11<bits<6> op, string opName, + +multiclass VOP2_Realtriple_e64_with_name<GFXGen Gen, bits<6> op, string opName, string asmName> { - defm NAME : VOP3_Realtriple_with_name_gfx11<{0, 1, 0, 0, op{5-0}}, opName, asmName> ; + defm NAME : VOP3_Realtriple_with_name<Gen, {0, 1, 0, 0, op{5-0}}, opName, asmName> ; } -multiclass VOP2be_Real_gfx11<bits<6> op, string opName, string asmName> : - VOP2be_Real_e32_gfx11<op, opName, asmName>, - VOP3be_Realtriple_gfx11<{0, 1, 0, 0, op{5-0}}, /*isSingle=*/ 0, opName, asmName>, - VOP2be_Real_dpp_gfx11<op, opName, asmName>, - VOP2be_Real_dpp8_gfx11<op, opName, asmName>; +multiclass VOP2be_Real<GFXGen Gen, bits<6> op, string opName, string asmName> : + VOP2be_Real_e32<Gen, op, opName, asmName>, + VOP3be_Realtriple<Gen, {0, 1, 0, 0, op{5-0}}, /*isSingle=*/ 0, opName, asmName>, + VOP2be_Real_dpp<Gen, op, opName, asmName>, + VOP2be_Real_dpp8<Gen, op, opName, asmName>; // Only for CNDMASK -multiclass VOP2e_Real_gfx11<bits<6> op, string opName, string asmName> : - VOP2_Real_e32_gfx11<op>, - VOP2_Realtriple_e64_gfx11<op>, - VOP2be_Real_dpp_gfx11<op, opName, asmName>, - VOP2be_Real_dpp8_gfx11<op, opName, asmName>; +multiclass VOP2e_Real<GFXGen Gen, bits<6> op, string opName, string asmName> : + VOP2_Real_e32<Gen, op>, + VOP2_Realtriple_e64<Gen, op>, + VOP2be_Real_dpp<Gen, op, opName, asmName>, + VOP2be_Real_dpp8<Gen, op, opName, asmName>; + +multiclass VOP2Only_Real<GFXGen Gen, bits<6> op> : + VOP2Only_Real_e32<Gen, op>, + VOP2_Real_dpp<Gen, op>, + VOP2_Real_dpp8<Gen, op>; + +multiclass VOP2_Real_FULL<GFXGen Gen, bits<6> op> : + VOP2_Realtriple_e64<Gen, op>, + VOP2_Real_e32<Gen, op>, + VOP2_Real_dpp<Gen, op>, + VOP2_Real_dpp8<Gen, op>; + +multiclass VOP2_Real_NO_VOP3_with_name<GFXGen Gen, bits<6> op, string opName, + string asmName, bit isSingle = 0> { + defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName, isSingle>, + VOP2_Real_dpp_with_name<Gen, op, opName, asmName>, + VOP2_Real_dpp8_with_name<Gen, op, opName, asmName>; + defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); + def Gen.Suffix#"_alias" : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[Gen.AssemblerPredicate]>; +} -multiclass VOP2Only_Real_gfx11<bits<6> op> : - VOP2Only_Real_e32_gfx11<op>, - VOP2_Real_dpp_gfx11<op>, - VOP2_Real_dpp8_gfx11<op>; +multiclass VOP2_Real_FULL_with_name<GFXGen Gen, bits<6> op, string opName, + string asmName> : + VOP2_Realtriple_e64_with_name<Gen, op, opName, asmName>, + VOP2_Real_NO_VOP3_with_name<Gen, op, opName, asmName>; -multiclass VOP2_Real_NO_VOP3_gfx11<bits<6> op> : - VOP2_Real_e32_gfx11<op>, VOP2_Real_dpp_gfx11<op>, 
VOP2_Real_dpp8_gfx11<op>; +multiclass VOP2_Real_NO_DPP_with_name<GFXGen Gen, bits<6> op, string opName, + string asmName> { + defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName>, + VOP2_Real_e64_with_name<Gen, op, opName, asmName>; + defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); + def Gen.Suffix#"_alias" : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[Gen.AssemblerPredicate]>; +} -multiclass VOP2_Real_FULL_gfx11<bits<6> op> : - VOP2_Realtriple_e64_gfx11<op>, VOP2_Real_NO_VOP3_gfx11<op>; +multiclass VOP2_Real_NO_DPP_with_alias<GFXGen Gen, bits<6> op, string alias> { + defm NAME : VOP2_Real_e32<Gen, op>, + VOP2_Real_e64<Gen, op>; + def Gen.Suffix#"_alias" : MnemonicAlias<alias, NAME>, Requires<[Gen.AssemblerPredicate]>; +} -multiclass VOP2_Real_NO_VOP3_with_name_gfx11<bits<6> op, string opName, - string asmName, bit isSingle = 0> { +//===----------------------------------------------------------------------===// +// GFX12. +//===----------------------------------------------------------------------===// - defm NAME : VOP2_Real_e32_with_name_gfx11<op, opName, asmName, isSingle>, - VOP2_Real_dpp_with_name_gfx11<op, opName, asmName>, - VOP2_Real_dpp8_with_name_gfx11<op, opName, asmName>; - defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); - def _gfx11_alias : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>; +multiclass VOP2be_Real_gfx12<bits<6> op, string opName, string asmName> : + VOP2be_Real<GFX12Gen, op, opName, asmName>; + +// Only for CNDMASK +multiclass VOP2e_Real_gfx12<bits<6> op, string opName, string asmName> : + VOP2e_Real<GFX12Gen, op, opName, asmName>; + +multiclass VOP2_Real_FULL_with_name_gfx12<bits<6> op, string opName, + string asmName> : + VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>; + +multiclass VOP2_Real_FULL_t16_with_name_gfx12<bits<6> op, string opName, + string asmName, string alias> { + defm NAME : VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>; + def _gfx12_2nd_alias : MnemonicAlias<alias, asmName>, Requires<[isGFX12Only]>; } -multiclass VOP2_Real_FULL_with_name_gfx11<bits<6> op, string opName, - string asmName> : - VOP2_Realtriple_e64_with_name_gfx11<op, opName, asmName>, - VOP2_Real_NO_VOP3_with_name_gfx11<op, opName, asmName>; +multiclass VOP2_Real_NO_DPP_with_name_gfx12<bits<6> op, string opName, + string asmName> : + VOP2_Real_NO_DPP_with_name<GFX12Gen, op, opName, asmName>; -multiclass VOP2_Real_FULL_t16_gfx11<bits<6> op, string asmName, string opName = NAME> - : VOP2_Real_FULL_with_name_gfx11<op, opName, asmName>; +multiclass VOP2_Real_NO_DPP_with_alias_gfx12<bits<6> op, string alias> : + VOP2_Real_NO_DPP_with_alias<GFX12Gen, op, alias>; -multiclass VOP2_Real_NO_DPP_gfx11<bits<6> op> : - VOP2_Real_e32_gfx11<op>, VOP2_Real_e64_gfx11<op>; +defm V_ADD_F64 : VOP2_Real_NO_DPP_with_name_gfx12<0x002, "V_ADD_F64_pseudo", "v_add_f64">; +defm V_MUL_F64 : VOP2_Real_NO_DPP_with_name_gfx12<0x006, "V_MUL_F64_pseudo", "v_mul_f64">; +defm V_LSHLREV_B64 : VOP2_Real_NO_DPP_with_name_gfx12<0x01f, "V_LSHLREV_B64_pseudo", "v_lshlrev_b64">; +defm V_MIN_NUM_F64 : VOP2_Real_NO_DPP_with_alias_gfx12<0x00d, "v_min_f64">; +defm V_MAX_NUM_F64 : VOP2_Real_NO_DPP_with_alias_gfx12<0x00e, "v_max_f64">; -multiclass VOP2_Real_NO_DPP_with_name_gfx11<bits<6> op, string opName, - string asmName> { - defm NAME : VOP2_Real_e32_with_name_gfx11<op, opName, asmName>, - VOP2_Real_e64_with_name_gfx11<op, opName, asmName>; +defm V_CNDMASK_B32 : VOP2e_Real_gfx12<0x001, "V_CNDMASK_B32", "v_cndmask_b32">; +defm V_ADD_CO_CI_U32 : + VOP2be_Real_gfx12<0x020, 
"V_ADDC_U32", "v_add_co_ci_u32">; +defm V_SUB_CO_CI_U32 : + VOP2be_Real_gfx12<0x021, "V_SUBB_U32", "v_sub_co_ci_u32">; +defm V_SUBREV_CO_CI_U32 : + VOP2be_Real_gfx12<0x022, "V_SUBBREV_U32", "v_subrev_co_ci_u32">; + +defm V_MIN_NUM_F32 : VOP2_Real_FULL_with_name_gfx12<0x015, "V_MIN_F32", "v_min_num_f32">; +defm V_MAX_NUM_F32 : VOP2_Real_FULL_with_name_gfx12<0x016, "V_MAX_F32", "v_max_num_f32">; +defm V_MIN_NUM_F16 : VOP2_Real_FULL_t16_with_name_gfx12<0x030, "V_MIN_F16_t16", "v_min_num_f16", "v_min_f16">; +defm V_MIN_NUM_F16_fake16 : VOP2_Real_FULL_t16_with_name_gfx12<0x030, "V_MIN_F16_fake16", "v_min_num_f16", "v_min_f16">; +defm V_MAX_NUM_F16 : VOP2_Real_FULL_t16_with_name_gfx12<0x031, "V_MAX_F16_t16", "v_max_num_f16", "v_max_f16">; +defm V_MAX_NUM_F16_fake16 : VOP2_Real_FULL_t16_with_name_gfx12<0x031, "V_MAX_F16_fake16", "v_max_num_f16", "v_max_f16">; + +let SubtargetPredicate = isGFX12Plus in { + defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx12>; + + defm : VOP2bInstAliases< + V_ADDC_U32_e32, V_ADD_CO_CI_U32_e32_gfx12, "v_add_co_ci_u32">; + defm : VOP2bInstAliases< + V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx12, "v_sub_co_ci_u32">; + defm : VOP2bInstAliases< + V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx12, "v_subrev_co_ci_u32">; +} // End SubtargetPredicate = isGFX12Plus + +//===----------------------------------------------------------------------===// +// GFX11. +//===----------------------------------------------------------------------===// + +multiclass VOP2be_Real_gfx11<bits<6> op, string opName, string asmName> : + VOP2be_Real<GFX11Gen, op, opName, asmName>; + +// Only for CNDMASK +multiclass VOP2e_Real_gfx11<bits<6> op, string opName, string asmName> : + VOP2e_Real<GFX11Gen, op, opName, asmName>; + +multiclass VOP2_Real_NO_VOP3_with_name_gfx11<bits<6> op, string opName, + string asmName, bit isSingle = 0> { + defm NAME : VOP2_Real_e32_with_name<GFX11Gen, op, opName, asmName, isSingle>, + VOP2_Real_dpp_with_name<GFX11Gen, op, opName, asmName>, + VOP2_Real_dpp8_with_name<GFX11Gen, op, opName, asmName>; defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); - def _gfx11_alias : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>; + def _gfx11_alias : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Only]>; } +multiclass VOP2_Real_NO_DPP_with_name_gfx11<bits<6> op, string opName, + string asmName> : + VOP2_Real_NO_DPP_with_name<GFX11Gen, op, opName, asmName>; + +multiclass VOP2_Real_FULL_gfx11_gfx12<bits<6> op> : + VOP2_Real_FULL<GFX11Gen, op>, VOP2_Real_FULL<GFX12Gen, op>; + +multiclass VOP2_Real_FULL_with_name_gfx11_gfx12<bits<6> op, string opName, + string asmName> : + VOP2_Real_FULL_with_name<GFX11Gen, op, opName, asmName>, + VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>; + +multiclass VOP2_Real_e32_gfx11_gfx12<bits<6> op> : + VOP2Only_Real<GFX11Gen, op>, VOP2Only_Real<GFX12Gen, op>; + +multiclass VOP3Only_Realtriple_gfx11_gfx12<bits<10> op> : + VOP3Only_Realtriple<GFX11Gen, op>, VOP3Only_Realtriple<GFX12Gen, op>; + +multiclass VOP3Only_Realtriple_t16_gfx11_gfx12<bits<10> op, string asmName> : + VOP3Only_Realtriple_t16<GFX11Gen, op, asmName>, + VOP3Only_Realtriple_t16<GFX12Gen, op, asmName>; + +multiclass VOP3beOnly_Realtriple_gfx11_gfx12<bits<10> op> : + VOP3beOnly_Realtriple<GFX11Gen, op>, VOP3beOnly_Realtriple<GFX12Gen, op>; + +multiclass VOP2Only_Real_MADK_with_name_gfx11_gfx12<bits<6> op, string asmName, + string opName = NAME> : + VOP2Only_Real_MADK_with_name<GFX11Gen, op, asmName, opName>, + VOP2Only_Real_MADK_with_name<GFX12Gen, op, 
asmName, opName>; + +multiclass VOP2_Real_FULL_t16_gfx11<bits<6> op, string asmName, + string opName = NAME> : + VOP2_Real_FULL_with_name<GFX11Gen, op, opName, asmName>; + +multiclass VOP2_Real_FULL_t16_gfx11_gfx12<bits<6> op, string asmName, + string opName = NAME> : + VOP2_Real_FULL_with_name_gfx11_gfx12<op, opName, asmName>; + +multiclass VOP2_Real_FULL_gfx11<bits<6> op> : + VOP2_Real_FULL<GFX11Gen, op>; + defm V_CNDMASK_B32 : VOP2e_Real_gfx11<0x001, "V_CNDMASK_B32", "v_cndmask_b32">; defm V_DOT2ACC_F32_F16 : VOP2_Real_NO_VOP3_with_name_gfx11<0x002, "V_DOT2C_F32_F16", "v_dot2acc_f32_f16", 1>; defm V_FMAC_DX9_ZERO_F32 : VOP2_Real_NO_DPP_with_name_gfx11<0x006, "V_FMAC_LEGACY_F32", "v_fmac_dx9_zero_f32">; -defm V_MUL_DX9_ZERO_F32 : VOP2_Real_FULL_with_name_gfx11<0x007, +defm V_MUL_DX9_ZERO_F32 : VOP2_Real_FULL_with_name_gfx11_gfx12<0x007, "V_MUL_LEGACY_F32", "v_mul_dx9_zero_f32">; -defm V_LSHLREV_B32 : VOP2_Real_FULL_gfx11<0x018>; -defm V_LSHRREV_B32 : VOP2_Real_FULL_gfx11<0x019>; -defm V_ASHRREV_I32 : VOP2_Real_FULL_gfx11<0x01a>; +defm V_LSHLREV_B32 : VOP2_Real_FULL_gfx11_gfx12<0x018>; +defm V_LSHRREV_B32 : VOP2_Real_FULL_gfx11_gfx12<0x019>; +defm V_ASHRREV_I32 : VOP2_Real_FULL_gfx11_gfx12<0x01a>; defm V_ADD_CO_CI_U32 : VOP2be_Real_gfx11<0x020, "V_ADDC_U32", "v_add_co_ci_u32">; defm V_SUB_CO_CI_U32 : @@ -1486,37 +1663,43 @@ defm V_SUB_CO_CI_U32 : defm V_SUBREV_CO_CI_U32 : VOP2be_Real_gfx11<0x022, "V_SUBBREV_U32", "v_subrev_co_ci_u32">; -defm V_CVT_PK_RTZ_F16_F32 : VOP2_Real_FULL_with_name_gfx11<0x02f, +defm V_CVT_PK_RTZ_F16_F32 : VOP2_Real_FULL_with_name_gfx11_gfx12<0x02f, "V_CVT_PKRTZ_F16_F32", "v_cvt_pk_rtz_f16_f32">; -defm V_PK_FMAC_F16 : VOP2Only_Real_gfx11<0x03c>; - -defm V_ADD_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x032, "v_add_f16">; -defm V_SUB_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x033, "v_sub_f16">; -defm V_SUBREV_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x034, "v_subrev_f16">; -defm V_MUL_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x035, "v_mul_f16">; -defm V_FMAC_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x036, "v_fmac_f16">; -defm V_LDEXP_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x03b, "v_ldexp_f16">; +defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx11_gfx12<0x03c>; + +defm V_ADD_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x032, "v_add_f16">; +defm V_ADD_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x032, "v_add_f16">; +defm V_SUB_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x033, "v_sub_f16">; +defm V_SUB_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x033, "v_sub_f16">; +defm V_SUBREV_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x034, "v_subrev_f16">; +defm V_SUBREV_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x034, "v_subrev_f16">; +defm V_MUL_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">; +defm V_MUL_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">; +defm V_FMAC_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x036, "v_fmac_f16">; +defm V_LDEXP_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">; defm V_MAX_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">; +defm V_MAX_F16_fake16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">; defm V_MIN_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">; -defm V_FMAMK_F16_t16 : VOP2Only_Real_MADK_gfx11_with_name<0x037, "v_fmamk_f16">; -defm V_FMAAK_F16_t16 : VOP2Only_Real_MADK_gfx11_with_name<0x038, "v_fmaak_f16">; +defm V_MIN_F16_fake16 : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">; +defm V_FMAMK_F16_t16 : VOP2Only_Real_MADK_with_name_gfx11_gfx12<0x037, "v_fmamk_f16">; +defm V_FMAAK_F16_t16 : VOP2Only_Real_MADK_with_name_gfx11_gfx12<0x038, 
"v_fmaak_f16">; // VOP3 only. -defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11<0x25d>; -defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11<0x31c>; -defm V_BFM_B32 : VOP3Only_Realtriple_gfx11<0x31d>; -defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11<0x31e>; -defm V_MBCNT_LO_U32_B32 : VOP3Only_Realtriple_gfx11<0x31f>; -defm V_MBCNT_HI_U32_B32 : VOP3Only_Realtriple_gfx11<0x320>; -defm V_CVT_PK_NORM_I16_F32 : VOP3Only_Realtriple_with_name_gfx11<0x321, "V_CVT_PKNORM_I16_F32", "v_cvt_pk_norm_i16_f32">; -defm V_CVT_PK_NORM_U16_F32 : VOP3Only_Realtriple_with_name_gfx11<0x322, "V_CVT_PKNORM_U16_F32", "v_cvt_pk_norm_u16_f32">; -defm V_CVT_PK_U16_U32 : VOP3Only_Realtriple_gfx11<0x323>; -defm V_CVT_PK_I16_I32 : VOP3Only_Realtriple_gfx11<0x324>; -defm V_ADD_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x300>; -defm V_SUB_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x301>; -defm V_SUBREV_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x302>; - -let SubtargetPredicate = isGFX11Plus in { +defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11_gfx12<0x25d>; +defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11_gfx12<0x31c>; +defm V_BFM_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31d>; +defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31e>; +defm V_MBCNT_LO_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31f>; +defm V_MBCNT_HI_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x320>; +defm V_CVT_PK_NORM_I16_F32 : VOP3Only_Realtriple_with_name_gfx11_gfx12<0x321, "V_CVT_PKNORM_I16_F32", "v_cvt_pk_norm_i16_f32">; +defm V_CVT_PK_NORM_U16_F32 : VOP3Only_Realtriple_with_name_gfx11_gfx12<0x322, "V_CVT_PKNORM_U16_F32", "v_cvt_pk_norm_u16_f32">; +defm V_CVT_PK_U16_U32 : VOP3Only_Realtriple_gfx11_gfx12<0x323>; +defm V_CVT_PK_I16_I32 : VOP3Only_Realtriple_gfx11_gfx12<0x324>; +defm V_ADD_CO_U32 : VOP3beOnly_Realtriple_gfx11_gfx12<0x300>; +defm V_SUB_CO_U32 : VOP3beOnly_Realtriple_gfx11_gfx12<0x301>; +defm V_SUBREV_CO_U32 : VOP3beOnly_Realtriple_gfx11_gfx12<0x302>; + +let SubtargetPredicate = isGFX11Only in { defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx11>; defm : VOP2bInstAliases< @@ -1525,7 +1708,7 @@ let SubtargetPredicate = isGFX11Plus in { V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx11, "v_sub_co_ci_u32">; defm : VOP2bInstAliases< V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx11, "v_subrev_co_ci_u32">; -} // End SubtargetPredicate = isGFX11Plus +} // End SubtargetPredicate = isGFX11Only //===----------------------------------------------------------------------===// // GFX10. 
@@ -1747,7 +1930,10 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" multiclass VOP2Only_Real_MADK_gfx10_gfx11<bits<6> op> : - VOP2Only_Real_MADK_gfx10<op>, VOP2Only_Real_MADK_gfx11<op>; + VOP2Only_Real_MADK_gfx10<op>, VOP2Only_Real_MADK<GFX11Gen, op>; + +multiclass VOP2Only_Real_MADK_gfx10_gfx11_gfx12<bits<6> op> : + VOP2Only_Real_MADK_gfx10_gfx11<op>, VOP2Only_Real_MADK<GFX12Gen, op>; multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> : VOP2be_Real_e32_gfx10<op, opName, asmName>, @@ -1768,7 +1954,10 @@ multiclass VOP2_Real_gfx10<bits<6> op> : VOP2_Real_sdwa_gfx10<op>, VOP2_Real_dpp_gfx10<op>, VOP2_Real_dpp8_gfx10<op>; multiclass VOP2_Real_gfx10_gfx11<bits<6> op> : - VOP2_Real_gfx10<op>, VOP2_Real_FULL_gfx11<op>; + VOP2_Real_gfx10<op>, VOP2_Real_FULL<GFX11Gen, op>; + +multiclass VOP2_Real_gfx10_gfx11_gfx12<bits<6> op> : + VOP2_Real_gfx10_gfx11<op>, VOP2_Real_FULL<GFX12Gen, op>; multiclass VOP2_Real_with_name_gfx10<bits<6> op, string opName, string asmName> : @@ -1778,19 +1967,20 @@ multiclass VOP2_Real_with_name_gfx10<bits<6> op, string opName, VOP2_Real_dpp_gfx10_with_name<op, opName, asmName>, VOP2_Real_dpp8_gfx10_with_name<op, opName, asmName>; -multiclass VOP2_Real_with_name_gfx10_gfx11<bits<6> op, string opName, - string asmName> : +multiclass VOP2_Real_with_name_gfx10_gfx11_gfx12<bits<6> op, string opName, + string asmName> : VOP2_Real_with_name_gfx10<op, opName, asmName>, - VOP2_Real_FULL_with_name_gfx11<op, opName, asmName>; + VOP2_Real_FULL_with_name<GFX11Gen, op, opName, asmName>, + VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>; // NB: Same opcode as v_mac_legacy_f32 let DecoderNamespace = "GFX10_B" in defm V_FMAC_LEGACY_F32 : VOP2_Real_gfx10<0x006>; -defm V_XNOR_B32 : VOP2_Real_gfx10_gfx11<0x01e>; -defm V_FMAC_F32 : VOP2_Real_gfx10_gfx11<0x02b>; -defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10_gfx11<0x02c>; -defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10_gfx11<0x02d>; +defm V_XNOR_B32 : VOP2_Real_gfx10_gfx11_gfx12<0x01e>; +defm V_FMAC_F32 : VOP2_Real_gfx10_gfx11_gfx12<0x02b>; +defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10_gfx11_gfx12<0x02c>; +defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10_gfx11_gfx12<0x02d>; defm V_ADD_F16 : VOP2_Real_gfx10<0x032>; defm V_SUB_F16 : VOP2_Real_gfx10<0x033>; defm V_SUBREV_F16 : VOP2_Real_gfx10<0x034>; @@ -1808,11 +1998,11 @@ let IsSingle = 1 in { // VOP2 no carry-in, carry-out. defm V_ADD_NC_U32 : - VOP2_Real_with_name_gfx10_gfx11<0x025, "V_ADD_U32", "v_add_nc_u32">; + VOP2_Real_with_name_gfx10_gfx11_gfx12<0x025, "V_ADD_U32", "v_add_nc_u32">; defm V_SUB_NC_U32 : - VOP2_Real_with_name_gfx10_gfx11<0x026, "V_SUB_U32", "v_sub_nc_u32">; + VOP2_Real_with_name_gfx10_gfx11_gfx12<0x026, "V_SUB_U32", "v_sub_nc_u32">; defm V_SUBREV_NC_U32 : - VOP2_Real_with_name_gfx10_gfx11<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">; + VOP2_Real_with_name_gfx10_gfx11_gfx12<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">; // VOP2 carry-in, carry-out. 
defm V_ADD_CO_CI_U32 : @@ -1905,7 +2095,10 @@ multiclass VOP2_Real_gfx6_gfx7_gfx10<bits<6> op> : VOP2_Real_gfx6_gfx7<op>, VOP2_Real_gfx10<op>; multiclass VOP2_Real_gfx6_gfx7_gfx10_gfx11<bits<6> op> : - VOP2_Real_gfx6_gfx7_gfx10<op>, VOP2_Real_FULL_gfx11<op>; + VOP2_Real_gfx6_gfx7_gfx10<op>, VOP2_Real_FULL<GFX11Gen, op>; + +multiclass VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<6> op> : + VOP2_Real_gfx6_gfx7_gfx10_gfx11<op>, VOP2_Real_FULL<GFX12Gen, op>; multiclass VOP2be_Real_gfx6_gfx7<bits<6> op> : VOP2_Real_e32_gfx6_gfx7<op>, VOP2be_Real_e64_gfx6_gfx7<op>; @@ -1967,28 +2160,28 @@ let SubtargetPredicate = isGFX6GFX7 in { def : VOP2e64InstAlias<V_SUBREV_CO_U32_e64, V_SUBREV_I32_e64_gfx6_gfx7>; } // End SubtargetPredicate = isGFX6GFX7 -defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x003>; -defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x004>; -defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x005>; +defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x003>; +defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x004>; +defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x005>; defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>; defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>; -defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x008>; -defm V_MUL_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x009>; -defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00a>; -defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00b>; -defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00c>; +defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x008>; +defm V_MUL_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x009>; +defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x00a>; +defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x00b>; +defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x00c>; defm V_MIN_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00f>; defm V_MAX_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x010>; -defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x011>; -defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x012>; -defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x013>; -defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x014>; +defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x011>; +defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x012>; +defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x013>; +defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x014>; defm V_LSHRREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x016>; defm V_ASHRREV_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x018>; defm V_LSHLREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01a>; -defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01b>; -defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01c>; -defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01d>; +defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x01b>; +defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x01c>; +defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x01d>; defm V_MAC_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x01f>; defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x02f>; defm V_MADMK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x020>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td index c0e0ac1b4ec8..eebd323210f9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -144,11 +144,15 @@ defm V_LERP_U8 : 
VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_a let SchedRW = [WriteDoubleAdd] in { let FPDPRounding = 1 in { defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, any_fma>; +let SubtargetPredicate = isNotGFX12Plus in { defm V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fadd>; defm V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fmul>; +} // End SubtargetPredicate = isNotGFX12Plus } // End FPDPRounding = 1 +let SubtargetPredicate = isNotGFX12Plus in { defm V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like>; defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like>; +} // End SubtargetPredicate = isNotGFX12Plus } // End SchedRW = [WriteDoubleAdd] let SchedRW = [WriteIntMul] in { @@ -157,6 +161,19 @@ defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF<VOP_I32_I32_I32>, mulhu defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF<VOP_I32_I32_I32>>; defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs>; } // End SchedRW = [WriteIntMul] + +let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in { +defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fminimum>>; +defm V_MAXIMUM_F32 : VOP3Inst <"v_maximum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fmaximum>>; +defm V_MINIMUM_F16 : VOP3Inst <"v_minimum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fminimum>>; +defm V_MAXIMUM_F16 : VOP3Inst <"v_maximum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fmaximum>>; + +let SchedRW = [WriteDoubleAdd] in { +defm V_MINIMUM_F64 : VOP3Inst <"v_minimum_f64", VOP3_Profile<VOP_F64_F64_F64>, fminimum>; +defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaximum>; +} // End SchedRW = [WriteDoubleAdd] +} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 + } // End isReMaterializable = 1 let Uses = [MODE, VCC, EXEC] in { @@ -207,6 +224,11 @@ let mayRaiseFPException = 0 in { defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>; } // End mayRaiseFPException = 0 +let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in { + defm V_MINIMUM3_F32 : VOP3Inst <"v_minimum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfminimum3>; + defm V_MAXIMUM3_F32 : VOP3Inst <"v_maximum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmaximum3>; +} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 + let isCommutable = 1 in { defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; @@ -254,10 +276,13 @@ let SchedRW = [Write64Bit] in { } // End SubtargetPredicate = isGFX6GFX7 let SubtargetPredicate = isGFX8Plus in { - defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshl_rev_64>; defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshr_rev_64>; defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, cashr_rev_64>; } // End SubtargetPredicate = isGFX8Plus + + let SubtargetPredicate = isGFX8GFX9GFX10GFX11 in { + defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshl_rev_64>; + } // End SubtargetPredicate = isGFX8GFX9GFX10GFX11 } // End SchedRW = [Write64Bit] } // End isReMaterializable = 1 @@ -515,6 +540,16 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>, 
let HasExtVOP3DPP = 0; } +def IsPow2Plus1: PatLeaf<(i32 imm), [{ + uint32_t V = N->getZExtValue(); + return isPowerOf2_32(V - 1); +}]>; + +def Log2_32: SDNodeXForm<imm, [{ + uint32_t V = N->getZExtValue(); + return CurDAG->getTargetConstant(Log2_32(V - 1), SDLoc(N), MVT::i32); +}]>; + let SubtargetPredicate = isGFX9Plus in { let isCommutable = 1, isReMaterializable = 1 in { defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; @@ -538,6 +573,11 @@ defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3 defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmax3>; defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumax3>; +let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in { + defm V_MINIMUM3_F16 : VOP3Inst <"v_minimum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfminimum3>; + defm V_MAXIMUM3_F16 : VOP3Inst <"v_maximum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmaximum3>; +} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 + defm V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>; defm V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>; @@ -612,6 +652,10 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>; def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>; def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>; +def : GCNPat< + (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1), + (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; + let SubtargetPredicate = isGFX940Plus in def : GCNPat< (ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2), @@ -664,11 +708,22 @@ multiclass IMAD32_Pats <VOP3_Pseudo inst> { >; } +// Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul. +// We need to separate this because otherwise OtherPredicates would be overriden. +class IMAD32_Mul24_Pat<VOP3_Pseudo inst>: GCNPat < + (i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)), + (inst $src0, $src1, $src2, 0 /* clamp */) + >; + // exclude pre-GFX9 where it was slow -let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in +let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in { defm : IMAD32_Pats<V_MAD_U64_U32_e64>; -let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in + def : IMAD32_Mul24_Pat<V_MAD_U64_U32_e64>; +} +let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in { defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>; + def : IMAD32_Mul24_Pat<V_MAD_U64_U32_gfx11_e64>; +} def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> { let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0, @@ -680,6 +735,15 @@ def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3 let HasExtDPP = 0; } +def VOP3_PERMLANE_VAR_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, untyped]>, VOP3_OPSEL> { + let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0, + IntOpSelMods:$src1_modifiers, VRegSrc_32:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel); + let HasClamp = 0; + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; +} + def opsel_i1timm : SDNodeXForm<timm, [{ return CurDAG->getTargetConstant( N->getZExtValue() ? 
SISrcMods::OP_SEL_0 : SISrcMods::NONE, @@ -696,6 +760,13 @@ class PermlanePat<SDPatternOperator permlane, SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in) >; +class PermlaneVarPat<SDPatternOperator permlane, + Instruction inst> : GCNPat< + (permlane i32:$vdst_in, i32:$src0, i32:$src1, + timm:$fi, timm:$bc), + (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc), + VGPR_32:$src1, VGPR_32:$vdst_in) +>; let SubtargetPredicate = isGFX10Plus in { let isCommutable = 1, isReMaterializable = 1 in { @@ -726,6 +797,17 @@ let SubtargetPredicate = isGFX10Plus in { } // End SubtargetPredicate = isGFX10Plus +let SubtargetPredicate = isGFX12Plus in { + let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { + defm V_PERMLANE16_VAR_B32 : VOP3Inst<"v_permlane16_var_b32", VOP3_PERMLANE_VAR_Profile>; + defm V_PERMLANEX16_VAR_B32 : VOP3Inst<"v_permlanex16_var_b32", VOP3_PERMLANE_VAR_Profile>; + } // End $vdst = $vdst_in, DisableEncoding $vdst_in + + def : PermlaneVarPat<int_amdgcn_permlane16_var, V_PERMLANE16_VAR_B32_e64>; + def : PermlaneVarPat<int_amdgcn_permlanex16_var, V_PERMLANEX16_VAR_B32_e64>; + +} // End SubtargetPredicate = isGFX12Plus + class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat< (AMDGPUdiv_fmas (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), @@ -773,11 +855,61 @@ let SubtargetPredicate = isGFX11Plus in { defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>; } // End SubtargetPredicate = isGFX11Plus +let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in { + defm V_MAXIMUMMINIMUM_F32 : VOP3Inst<"v_maximumminimum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; + defm V_MINIMUMMAXIMUM_F32 : VOP3Inst<"v_minimummaximum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; + defm V_MAXIMUMMINIMUM_F16 : VOP3Inst<"v_maximumminimum_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>; + defm V_MINIMUMMAXIMUM_F16 : VOP3Inst<"v_minimummaximum_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>; +} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 + let SubtargetPredicate = HasDot9Insts, IsDOT=1 in { defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>, int_amdgcn_fdot2_f16_f16>; defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>, int_amdgcn_fdot2_bf16_bf16>; } +class VOP_Pseudo_Scalar<RegisterClass Dst, RegisterOperand SrcOp, + ValueType dstVt, ValueType srcVt = dstVt> + : VOPProfile<[dstVt, srcVt, untyped, untyped]> { + let DstRC = VOPDstOperand<Dst>; + let Src0RC64 = SrcOp; + + let HasOMod = 1; + let HasModifiers = 1; +} + +def VOP_Pseudo_Scalar_F32 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f32, f32>; +def VOP_Pseudo_Scalar_F16 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f16, f32, f16>; + +let SubtargetPredicate = HasPseudoScalarTrans, TRANS = 1, + isReMaterializable = 1, SchedRW = [WritePseudoScalarTrans] in { + defm V_S_EXP_F32 : VOP3PseudoScalarInst<"v_s_exp_f32", VOP_Pseudo_Scalar_F32, AMDGPUexp>; + defm V_S_EXP_F16 : VOP3PseudoScalarInst<"v_s_exp_f16", VOP_Pseudo_Scalar_F16>; + defm V_S_LOG_F32 : VOP3PseudoScalarInst<"v_s_log_f32", VOP_Pseudo_Scalar_F32, AMDGPUlog>; + defm V_S_LOG_F16 : VOP3PseudoScalarInst<"v_s_log_f16", VOP_Pseudo_Scalar_F16>; + defm V_S_RCP_F32 : VOP3PseudoScalarInst<"v_s_rcp_f32", VOP_Pseudo_Scalar_F32, AMDGPUrcp>; + defm V_S_RCP_F16 : VOP3PseudoScalarInst<"v_s_rcp_f16", VOP_Pseudo_Scalar_F16>; + defm V_S_RSQ_F32 : VOP3PseudoScalarInst<"v_s_rsq_f32", VOP_Pseudo_Scalar_F32, 
AMDGPUrsq>; + defm V_S_RSQ_F16 : VOP3PseudoScalarInst<"v_s_rsq_f16", VOP_Pseudo_Scalar_F16>; + defm V_S_SQRT_F32 : VOP3PseudoScalarInst<"v_s_sqrt_f32", VOP_Pseudo_Scalar_F32, any_amdgcn_sqrt>; + defm V_S_SQRT_F16 : VOP3PseudoScalarInst<"v_s_sqrt_f16", VOP_Pseudo_Scalar_F16>; +} + +class PseudoScalarPatF16<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat < + (f16 (UniformUnaryFrag<node> (f16 (VOP3Mods0 f16:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)))), + (f16 (COPY_TO_REGCLASS (f32 (inst i32:$src0_modifiers, f16:$src0, i1:$clamp, + i32:$omod)), + SReg_32_XEXEC)) +>; + +let SubtargetPredicate = HasPseudoScalarTrans in { + def : PseudoScalarPatF16<AMDGPUexpf16, V_S_EXP_F16_e64>; + def : PseudoScalarPatF16<AMDGPUlogf16, V_S_LOG_F16_e64>; + def : PseudoScalarPatF16<AMDGPUrcp, V_S_RCP_F16_e64>; + def : PseudoScalarPatF16<AMDGPUrsq, V_S_RSQ_F16_e64>; + def : PseudoScalarPatF16<any_amdgcn_sqrt, V_S_SQRT_F16_e64>; +} + //===----------------------------------------------------------------------===// // Integer Clamp Patterns //===----------------------------------------------------------------------===// @@ -823,125 +955,195 @@ def : IntClampPat<V_MQSAD_U32_U8_e64, int_amdgcn_mqsad_u32_u8>; //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// GFX11. +// GFX12. +//===----------------------------------------------------------------------===// + +defm V_MIN3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x229, "V_MIN3_F32", "v_min3_num_f32">; +defm V_MAX3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x22a, "V_MAX3_F32", "v_max3_num_f32">; +defm V_MIN3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22b, "V_MIN3_F16", "v_min3_num_f16">; +defm V_MAX3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22c, "V_MAX3_F16", "v_max3_num_f16">; +defm V_MINIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22d>; +defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22e>; +defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x22f>; +defm V_MAXIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x230>; +defm V_MED3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x231, "V_MED3_F32", "v_med3_num_f32">; +defm V_MED3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x232, "V_MED3_F16", "v_med3_num_f16">; +defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32", "v_minmax_num_f32">; +defm V_MAXMIN_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x269, "V_MAXMIN_F32", "v_maxmin_num_f32">; +defm V_MINMAX_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26a, "V_MINMAX_F16", "v_minmax_num_f16">; +defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26b, "V_MAXMIN_F16", "v_maxmin_num_f16">; +defm V_MINIMUMMAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26c>; +defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26d>; +defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26e>; +defm V_MAXIMUMMINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26f>; +defm V_S_EXP_F32 : VOP3Only_Real_Base_gfx12<0x280>; +defm V_S_EXP_F16 : VOP3Only_Real_Base_gfx12<0x281>; +defm V_S_LOG_F32 : VOP3Only_Real_Base_gfx12<0x282>; +defm V_S_LOG_F16 : VOP3Only_Real_Base_gfx12<0x283>; +defm V_S_RCP_F32 : VOP3Only_Real_Base_gfx12<0x284>; +defm V_S_RCP_F16 : VOP3Only_Real_Base_gfx12<0x285>; +defm V_S_RSQ_F32 : VOP3Only_Real_Base_gfx12<0x286>; +defm V_S_RSQ_F16 : VOP3Only_Real_Base_gfx12<0x287>; +defm V_S_SQRT_F32 : VOP3Only_Real_Base_gfx12<0x288>; +defm V_S_SQRT_F16 : VOP3Only_Real_Base_gfx12<0x289>; +defm V_MAD_CO_U64_U32 : VOP3be_Real_with_name_gfx12<0x2fe, 
"V_MAD_U64_U32", "v_mad_co_u64_u32">; +defm V_MAD_CO_I64_I32 : VOP3be_Real_with_name_gfx12<0x2ff, "V_MAD_I64_I32", "v_mad_co_i64_i32">; +defm V_MINIMUM_F64 : VOP3Only_Real_Base_gfx12<0x341>; +defm V_MAXIMUM_F64 : VOP3Only_Real_Base_gfx12<0x342>; +defm V_MINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x365>; +defm V_MAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x366>; +defm V_MINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x367>; +defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x368>; + +defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>; +defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>; + +//===----------------------------------------------------------------------===// +// GFX11, GFX12 //===----------------------------------------------------------------------===// -defm V_FMA_DX9_ZERO_F32 : VOP3_Real_with_name_gfx11<0x209, "V_FMA_LEGACY_F32", "v_fma_dx9_zero_f32">; -defm V_MAD_I32_I24 : VOP3_Realtriple_gfx11<0x20a>; -defm V_MAD_U32_U24 : VOP3_Realtriple_gfx11<0x20b>; -defm V_CUBEID_F32 : VOP3_Realtriple_gfx11<0x20c>; -defm V_CUBESC_F32 : VOP3_Realtriple_gfx11<0x20d>; -defm V_CUBETC_F32 : VOP3_Realtriple_gfx11<0x20e>; -defm V_CUBEMA_F32 : VOP3_Realtriple_gfx11<0x20f>; -defm V_BFE_U32 : VOP3_Realtriple_gfx11<0x210>; -defm V_BFE_I32 : VOP3_Realtriple_gfx11<0x211>; -defm V_BFI_B32 : VOP3_Realtriple_gfx11<0x212>; -defm V_FMA_F32 : VOP3_Realtriple_gfx11<0x213>; -defm V_FMA_F64 : VOP3_Real_Base_gfx11<0x214>; -defm V_LERP_U8 : VOP3_Realtriple_gfx11<0x215>; -defm V_ALIGNBIT_B32 : VOP3_Realtriple_gfx11<0x216>; -defm V_ALIGNBYTE_B32 : VOP3_Realtriple_gfx11<0x217>; -defm V_MULLIT_F32 : VOP3_Realtriple_gfx11<0x218>; +multiclass VOP3_Real_with_name_gfx11_gfx12<bits<10> op, string opName, + string asmName> : + VOP3_Real_with_name<GFX11Gen, op, opName, asmName>, + VOP3_Real_with_name<GFX12Gen, op, opName, asmName>; + +multiclass VOP3_Realtriple_gfx11_gfx12<bits<10> op> : + VOP3_Realtriple<GFX11Gen, op>, VOP3_Realtriple<GFX12Gen, op>; + +multiclass VOP3_Real_Base_gfx11_gfx12<bits<10> op> : + VOP3_Real_Base<GFX11Gen, op>, VOP3_Real_Base<GFX12Gen, op>; + +multiclass VOP3_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName, + string asmName> : + VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName>, + VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName>; + +multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op> : + VOP3Dot_Realtriple<GFX11Gen, op>, VOP3Dot_Realtriple<GFX12Gen, op>; + +multiclass VOP3be_Real_gfx11_gfx12<bits<10> op, string opName, string asmName> : + VOP3be_Real<GFX11Gen, op, opName, asmName>, + VOP3be_Real<GFX12Gen, op, opName, asmName>; + +multiclass VOP3_Real_No_Suffix_gfx11_gfx12<bits<10> op> : + VOP3_Real_No_Suffix<GFX11Gen, op>, VOP3_Real_No_Suffix<GFX12Gen, op>; + +defm V_FMA_DX9_ZERO_F32 : VOP3_Real_with_name_gfx11_gfx12<0x209, "V_FMA_LEGACY_F32", "v_fma_dx9_zero_f32">; +defm V_MAD_I32_I24 : VOP3_Realtriple_gfx11_gfx12<0x20a>; +defm V_MAD_U32_U24 : VOP3_Realtriple_gfx11_gfx12<0x20b>; +defm V_CUBEID_F32 : VOP3_Realtriple_gfx11_gfx12<0x20c>; +defm V_CUBESC_F32 : VOP3_Realtriple_gfx11_gfx12<0x20d>; +defm V_CUBETC_F32 : VOP3_Realtriple_gfx11_gfx12<0x20e>; +defm V_CUBEMA_F32 : VOP3_Realtriple_gfx11_gfx12<0x20f>; +defm V_BFE_U32 : VOP3_Realtriple_gfx11_gfx12<0x210>; +defm V_BFE_I32 : VOP3_Realtriple_gfx11_gfx12<0x211>; +defm V_BFI_B32 : VOP3_Realtriple_gfx11_gfx12<0x212>; +defm V_FMA_F32 : VOP3_Realtriple_gfx11_gfx12<0x213>; +defm V_FMA_F64 : VOP3_Real_Base_gfx11_gfx12<0x214>; +defm V_LERP_U8 : VOP3_Realtriple_gfx11_gfx12<0x215>; +defm V_ALIGNBIT_B32 : 
VOP3_Realtriple_gfx11_gfx12<0x216>; +defm V_ALIGNBYTE_B32 : VOP3_Realtriple_gfx11_gfx12<0x217>; +defm V_MULLIT_F32 : VOP3_Realtriple_gfx11_gfx12<0x218>; defm V_MIN3_F32 : VOP3_Realtriple_gfx11<0x219>; -defm V_MIN3_I32 : VOP3_Realtriple_gfx11<0x21a>; -defm V_MIN3_U32 : VOP3_Realtriple_gfx11<0x21b>; +defm V_MIN3_I32 : VOP3_Realtriple_gfx11_gfx12<0x21a>; +defm V_MIN3_U32 : VOP3_Realtriple_gfx11_gfx12<0x21b>; defm V_MAX3_F32 : VOP3_Realtriple_gfx11<0x21c>; -defm V_MAX3_I32 : VOP3_Realtriple_gfx11<0x21d>; -defm V_MAX3_U32 : VOP3_Realtriple_gfx11<0x21e>; +defm V_MAX3_I32 : VOP3_Realtriple_gfx11_gfx12<0x21d>; +defm V_MAX3_U32 : VOP3_Realtriple_gfx11_gfx12<0x21e>; defm V_MED3_F32 : VOP3_Realtriple_gfx11<0x21f>; -defm V_MED3_I32 : VOP3_Realtriple_gfx11<0x220>; -defm V_MED3_U32 : VOP3_Realtriple_gfx11<0x221>; -defm V_SAD_U8 : VOP3_Realtriple_gfx11<0x222>; -defm V_SAD_HI_U8 : VOP3_Realtriple_gfx11<0x223>; -defm V_SAD_U16 : VOP3_Realtriple_gfx11<0x224>; -defm V_SAD_U32 : VOP3_Realtriple_gfx11<0x225>; -defm V_CVT_PK_U8_F32 : VOP3_Realtriple_gfx11<0x226>; -defm V_DIV_FIXUP_F32 : VOP3_Real_Base_gfx11<0x227>; -defm V_DIV_FIXUP_F64 : VOP3_Real_Base_gfx11<0x228>; -defm V_DIV_FMAS_F32 : VOP3_Real_Base_gfx11<0x237>; -defm V_DIV_FMAS_F64 : VOP3_Real_Base_gfx11<0x238>; -defm V_MSAD_U8 : VOP3_Realtriple_gfx11<0x239>; -defm V_QSAD_PK_U16_U8 : VOP3_Real_Base_gfx11<0x23a>; -defm V_MQSAD_PK_U16_U8 : VOP3_Real_Base_gfx11<0x23b>; -defm V_MQSAD_U32_U8 : VOP3_Real_Base_gfx11<0x23d>; -defm V_XOR3_B32 : VOP3_Realtriple_gfx11<0x240>; -defm V_MAD_U16 : VOP3_Realtriple_with_name_gfx11<0x241, "V_MAD_U16_gfx9", "v_mad_u16">; -defm V_PERM_B32 : VOP3_Realtriple_gfx11<0x244>; -defm V_XAD_U32 : VOP3_Realtriple_gfx11<0x245>; -defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11<0x246>; -defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11<0x247>; -defm V_FMA_F16 : VOP3_Realtriple_with_name_gfx11<0x248, "V_FMA_F16_gfx9", "v_fma_f16">; +defm V_MED3_I32 : VOP3_Realtriple_gfx11_gfx12<0x220>; +defm V_MED3_U32 : VOP3_Realtriple_gfx11_gfx12<0x221>; +defm V_SAD_U8 : VOP3_Realtriple_gfx11_gfx12<0x222>; +defm V_SAD_HI_U8 : VOP3_Realtriple_gfx11_gfx12<0x223>; +defm V_SAD_U16 : VOP3_Realtriple_gfx11_gfx12<0x224>; +defm V_SAD_U32 : VOP3_Realtriple_gfx11_gfx12<0x225>; +defm V_CVT_PK_U8_F32 : VOP3_Realtriple_gfx11_gfx12<0x226>; +defm V_DIV_FIXUP_F32 : VOP3_Real_Base_gfx11_gfx12<0x227>; +defm V_DIV_FIXUP_F64 : VOP3_Real_Base_gfx11_gfx12<0x228>; +defm V_DIV_FMAS_F32 : VOP3_Real_Base_gfx11_gfx12<0x237>; +defm V_DIV_FMAS_F64 : VOP3_Real_Base_gfx11_gfx12<0x238>; +defm V_MSAD_U8 : VOP3_Realtriple_gfx11_gfx12<0x239>; +defm V_QSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12<0x23a>; +defm V_MQSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12<0x23b>; +defm V_MQSAD_U32_U8 : VOP3_Real_Base_gfx11_gfx12<0x23d>; +defm V_XOR3_B32 : VOP3_Realtriple_gfx11_gfx12<0x240>; +defm V_MAD_U16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x241, "V_MAD_U16_gfx9", "v_mad_u16">; +defm V_PERM_B32 : VOP3_Realtriple_gfx11_gfx12<0x244>; +defm V_XAD_U32 : VOP3_Realtriple_gfx11_gfx12<0x245>; +defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11_gfx12<0x246>; +defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11_gfx12<0x247>; +defm V_FMA_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x248, "V_FMA_F16_gfx9", "v_fma_f16">; defm V_MIN3_F16 : VOP3_Realtriple_gfx11<0x249>; -defm V_MIN3_I16 : VOP3_Realtriple_gfx11<0x24a>; -defm V_MIN3_U16 : VOP3_Realtriple_gfx11<0x24b>; +defm V_MIN3_I16 : VOP3_Realtriple_gfx11_gfx12<0x24a>; +defm V_MIN3_U16 : VOP3_Realtriple_gfx11_gfx12<0x24b>; defm V_MAX3_F16 : VOP3_Realtriple_gfx11<0x24c>; -defm V_MAX3_I16 
: VOP3_Realtriple_gfx11<0x24d>; -defm V_MAX3_U16 : VOP3_Realtriple_gfx11<0x24e>; +defm V_MAX3_I16 : VOP3_Realtriple_gfx11_gfx12<0x24d>; +defm V_MAX3_U16 : VOP3_Realtriple_gfx11_gfx12<0x24e>; defm V_MED3_F16 : VOP3_Realtriple_gfx11<0x24f>; -defm V_MED3_I16 : VOP3_Realtriple_gfx11<0x250>; -defm V_MED3_U16 : VOP3_Realtriple_gfx11<0x251>; -defm V_MAD_I16 : VOP3_Realtriple_with_name_gfx11<0x253, "V_MAD_I16_gfx9", "v_mad_i16">; -defm V_DIV_FIXUP_F16 : VOP3_Realtriple_with_name_gfx11<0x254, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">; -defm V_ADD3_U32 : VOP3_Realtriple_gfx11<0x255>; -defm V_LSHL_OR_B32 : VOP3_Realtriple_gfx11<0x256>; -defm V_AND_OR_B32 : VOP3_Realtriple_gfx11<0x257>; -defm V_OR3_B32 : VOP3_Realtriple_gfx11<0x258>; -defm V_MAD_U32_U16 : VOP3_Realtriple_gfx11<0x259>; -defm V_MAD_I32_I16 : VOP3_Realtriple_gfx11<0x25a>; -defm V_PERMLANE16_B32 : VOP3_Real_Base_gfx11<0x25b>; -defm V_PERMLANEX16_B32 : VOP3_Real_Base_gfx11<0x25c>; +defm V_MED3_I16 : VOP3_Realtriple_gfx11_gfx12<0x250>; +defm V_MED3_U16 : VOP3_Realtriple_gfx11_gfx12<0x251>; +defm V_MAD_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x253, "V_MAD_I16_gfx9", "v_mad_i16">; +defm V_DIV_FIXUP_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x254, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">; +defm V_ADD3_U32 : VOP3_Realtriple_gfx11_gfx12<0x255>; +defm V_LSHL_OR_B32 : VOP3_Realtriple_gfx11_gfx12<0x256>; +defm V_AND_OR_B32 : VOP3_Realtriple_gfx11_gfx12<0x257>; +defm V_OR3_B32 : VOP3_Realtriple_gfx11_gfx12<0x258>; +defm V_MAD_U32_U16 : VOP3_Realtriple_gfx11_gfx12<0x259>; +defm V_MAD_I32_I16 : VOP3_Realtriple_gfx11_gfx12<0x25a>; +defm V_PERMLANE16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25b>; +defm V_PERMLANEX16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25c>; defm V_MAXMIN_F32 : VOP3_Realtriple_gfx11<0x25e>; defm V_MINMAX_F32 : VOP3_Realtriple_gfx11<0x25f>; defm V_MAXMIN_F16 : VOP3_Realtriple_gfx11<0x260>; defm V_MINMAX_F16 : VOP3_Realtriple_gfx11<0x261>; -defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11<0x262>; -defm V_MINMAX_U32 : VOP3_Realtriple_gfx11<0x263>; -defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11<0x264>; -defm V_MINMAX_I32 : VOP3_Realtriple_gfx11<0x265>; -defm V_DOT2_F16_F16 : VOP3Dot_Realtriple_gfx11<0x266>; -defm V_DOT2_BF16_BF16 : VOP3Dot_Realtriple_gfx11<0x267>; -defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">; -defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">; +defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11_gfx12<0x262>; +defm V_MINMAX_U32 : VOP3_Realtriple_gfx11_gfx12<0x263>; +defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11_gfx12<0x264>; +defm V_MINMAX_I32 : VOP3_Realtriple_gfx11_gfx12<0x265>; +defm V_DOT2_F16_F16 : VOP3Dot_Realtriple_gfx11_gfx12<0x266>; +defm V_DOT2_BF16_BF16 : VOP3Dot_Realtriple_gfx11_gfx12<0x267>; +defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11_gfx12<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">; +defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11_gfx12<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">; defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">; defm V_MAD_I64_I32_gfx11 : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_gfx11", "v_mad_i64_i32">; -defm V_ADD_NC_U16 : VOP3Only_Realtriple_gfx11<0x303>; -defm V_SUB_NC_U16 : VOP3Only_Realtriple_gfx11<0x304>; -defm V_MUL_LO_U16_t16 : VOP3Only_Realtriple_t16_gfx11<0x305, "v_mul_lo_u16">; -defm V_CVT_PK_I16_F32 : VOP3_Realtriple_gfx11<0x306>; -defm V_CVT_PK_U16_F32 : VOP3_Realtriple_gfx11<0x307>; -defm V_MAX_U16_t16 : VOP3Only_Realtriple_t16_gfx11<0x309, "v_max_u16">; -defm V_MAX_I16_t16 : 
VOP3Only_Realtriple_t16_gfx11<0x30a, "v_max_i16">; -defm V_MIN_U16_t16 : VOP3Only_Realtriple_t16_gfx11<0x30b, "v_min_u16">; -defm V_MIN_I16_t16 : VOP3Only_Realtriple_t16_gfx11<0x30c, "v_min_i16">; -defm V_ADD_NC_I16 : VOP3_Realtriple_with_name_gfx11<0x30d, "V_ADD_I16", "v_add_nc_i16">; -defm V_SUB_NC_I16 : VOP3_Realtriple_with_name_gfx11<0x30e, "V_SUB_I16", "v_sub_nc_i16">; -defm V_PACK_B32_F16 : VOP3_Realtriple_gfx11<0x311>; -defm V_CVT_PK_NORM_I16_F16 : VOP3_Realtriple_with_name_gfx11<0x312, "V_CVT_PKNORM_I16_F16" , "v_cvt_pk_norm_i16_f16" >; -defm V_CVT_PK_NORM_U16_F16 : VOP3_Realtriple_with_name_gfx11<0x313, "V_CVT_PKNORM_U16_F16" , "v_cvt_pk_norm_u16_f16" >; -defm V_SUB_NC_I32 : VOP3_Realtriple_with_name_gfx11<0x325, "V_SUB_I32", "v_sub_nc_i32">; -defm V_ADD_NC_I32 : VOP3_Realtriple_with_name_gfx11<0x326, "V_ADD_I32", "v_add_nc_i32">; +defm V_ADD_NC_U16 : VOP3Only_Realtriple_gfx11_gfx12<0x303>; +defm V_SUB_NC_U16 : VOP3Only_Realtriple_gfx11_gfx12<0x304>; +defm V_MUL_LO_U16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x305, "v_mul_lo_u16">; +defm V_CVT_PK_I16_F32 : VOP3_Realtriple_gfx11_gfx12<0x306>; +defm V_CVT_PK_U16_F32 : VOP3_Realtriple_gfx11_gfx12<0x307>; +defm V_MAX_U16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x309, "v_max_u16">; +defm V_MAX_I16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x30a, "v_max_i16">; +defm V_MIN_U16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x30b, "v_min_u16">; +defm V_MIN_I16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x30c, "v_min_i16">; +defm V_ADD_NC_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x30d, "V_ADD_I16", "v_add_nc_i16">; +defm V_SUB_NC_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x30e, "V_SUB_I16", "v_sub_nc_i16">; +defm V_PACK_B32_F16 : VOP3_Realtriple_gfx11_gfx12<0x311>; +defm V_CVT_PK_NORM_I16_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x312, "V_CVT_PKNORM_I16_F16" , "v_cvt_pk_norm_i16_f16" >; +defm V_CVT_PK_NORM_U16_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x313, "V_CVT_PKNORM_U16_F16" , "v_cvt_pk_norm_u16_f16" >; +defm V_SUB_NC_I32 : VOP3_Realtriple_with_name_gfx11_gfx12<0x325, "V_SUB_I32", "v_sub_nc_i32">; +defm V_ADD_NC_I32 : VOP3_Realtriple_with_name_gfx11_gfx12<0x326, "V_ADD_I32", "v_add_nc_i32">; defm V_ADD_F64 : VOP3_Real_Base_gfx11<0x327>; defm V_MUL_F64 : VOP3_Real_Base_gfx11<0x328>; defm V_MIN_F64 : VOP3_Real_Base_gfx11<0x329>; defm V_MAX_F64 : VOP3_Real_Base_gfx11<0x32a>; -defm V_LDEXP_F64 : VOP3_Real_Base_gfx11<0x32b>; -defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11<0x32c>; -defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11<0x32d>; -defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11<0x32e>; -defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11<0x32f>; -defm V_LSHLREV_B16_t16 : VOP3Only_Realtriple_t16_gfx11<0x338, "v_lshlrev_b16">; -defm V_LSHRREV_B16_t16 : VOP3Only_Realtriple_t16_gfx11<0x339, "v_lshrrev_b16">; -defm V_ASHRREV_I16_t16 : VOP3Only_Realtriple_t16_gfx11<0x33a, "v_ashrrev_i16">; +defm V_LDEXP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32b>; +defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12<0x32c>; +defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12<0x32d>; +defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12<0x32e>; +defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32f>; +defm V_LSHLREV_B16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x338, "v_lshlrev_b16">; +defm V_LSHRREV_B16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x339, "v_lshrrev_b16">; +defm V_ASHRREV_I16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x33a, "v_ashrrev_i16">; defm V_LSHLREV_B64 : VOP3_Real_Base_gfx11<0x33c>; -defm V_LSHRREV_B64 : VOP3_Real_Base_gfx11<0x33d>; -defm V_ASHRREV_I64 : 
VOP3_Real_Base_gfx11<0x33e>; -defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx11<0x360>; // Pseudo in VOP2 +defm V_LSHRREV_B64 : VOP3_Real_Base_gfx11_gfx12<0x33d>; +defm V_ASHRREV_I64 : VOP3_Real_Base_gfx11_gfx12<0x33e>; +defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx11_gfx12<0x360>; // Pseudo in VOP2 let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { - defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx11<0x361>; // Pseudo in VOP2 + defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx11_gfx12<0x361>; // Pseudo in VOP2 } // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) -defm V_AND_B16_t16 : VOP3Only_Realtriple_t16_gfx11<0x362, "v_and_b16">; -defm V_OR_B16_t16 : VOP3Only_Realtriple_t16_gfx11<0x363, "v_or_b16">; -defm V_XOR_B16_t16 : VOP3Only_Realtriple_t16_gfx11<0x364, "v_xor_b16">; +defm V_AND_B16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x362, "v_and_b16">; +defm V_OR_B16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x363, "v_or_b16">; +defm V_XOR_B16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x364, "v_xor_b16">; //===----------------------------------------------------------------------===// // GFX10. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 71e09611e74e..d3cefb339d9e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -108,6 +108,11 @@ defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I1 defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umin>; defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, smax>; defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umax>; + +let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in { +defm V_PK_MAXIMUM_F16 : VOP3PInst<"v_pk_maximum_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fmaximum>; +defm V_PK_MINIMUM_F16 : VOP3PInst<"v_pk_minimum_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fminimum>; +} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 } defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>>; @@ -353,56 +358,51 @@ foreach Type = ["I", "U"] in (!cast<Extract>(Type#Index#"_4bit") node:$src1))>; } -class UDot2Pat<Instruction Inst> : GCNPat < +class UDot2Pat<VOP_Pseudo Inst> : GCNPat < (add (add_oneuse (AMDGPUmul_u24_oneuse (srl i32:$src0, (i32 16)), (srl i32:$src1, (i32 16))), i32:$src2), (AMDGPUmul_u24_oneuse (and i32:$src0, (i32 65535)), (and i32:$src1, (i32 65535))) ), (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> { - let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate; + let Predicates = Inst.Predicates; } -class SDot2Pat<Instruction Inst> : GCNPat < +class SDot2Pat<VOP_Pseudo Inst> : GCNPat < (add (add_oneuse (AMDGPUmul_i24_oneuse (sra i32:$src0, (i32 16)), (sra i32:$src1, (i32 16))), i32:$src2), (AMDGPUmul_i24_oneuse (sext_inreg i32:$src0, i16), (sext_inreg i32:$src1, i16))), (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> { - let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate; + let Predicates = Inst.Predicates; } let IsDOT = 1 in { -let SubtargetPredicate = HasDot2Insts in { - +let OtherPredicates = [HasDot2Insts] in { defm V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>; defm V_DOT2_U32_U16 : 
VOP3PInst<"v_dot2_u32_u16", VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>; +} // End OtherPredicates = [HasDot2Insts] -} // End SubtargetPredicate = HasDot2Insts - -let SubtargetPredicate = HasDot10Insts in +let OtherPredicates = [HasDot10Insts] in defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR, /*HasDPP*/ 1>, AMDGPUfdot2, 1/*ExplicitClamp*/>; -let SubtargetPredicate = HasDot7Insts in { +let OtherPredicates = [HasDot7Insts] in { defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>; defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>; +} // End OtherPredicates = [HasDot7Insts] -} // End SubtargetPredicate = HasDot7Insts - -let SubtargetPredicate = HasDot1Insts in { - +let OtherPredicates = [HasDot1Insts] in { defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>; defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>; - -} // End SubtargetPredicate = HasDot1Insts +} // End OtherPredicates = [HasDot1Insts] def DOT2_BF16_Profile : VOP3P_Profile<VOP_F32_V2I16_V2I16_F32, VOP3_REGULAR, /*HasDPP*/ 1> { @@ -436,20 +436,34 @@ multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> { let SubtargetPredicate = HasDot8Insts in { defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>; defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>; + +def : GCNPat < (int_amdgcn_sdot8 i32:$src0, + i32:$src1, + i32:$src2, (i1 timm:$clamp)), + (V_DOT8_I32_IU4 (i32 9), i32:$src0, + (i32 9), i32:$src1, (i32 8), i32:$src2, i1:$clamp) +>; + +def : GCNPat < (int_amdgcn_sdot4 i32:$src0, + i32:$src1, + i32:$src2, (i1 timm:$clamp)), + (V_DOT4_I32_IU8 (i32 9), i32:$src0, + (i32 9), i32:$src1, (i32 8), i32:$src2, i1:$clamp) +>; } // End SubtargetPredicate = HasDot8Insts def : UDot2Pat<V_DOT2_U32_U16>; def : SDot2Pat<V_DOT2_I32_I16>; foreach Type = ["U", "I"] in - let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT4_"#Type#"32_"#Type#8).SubtargetPredicate in + let Predicates = !cast<VOP_Pseudo>("V_DOT4_"#Type#"32_"#Type#8).Predicates in def : GCNPat < !cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y, (add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))), (!cast<VOP3P_Pseudo>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; foreach Type = ["U", "I"] in - let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in + let Predicates = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).Predicates in def : GCNPat < !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)), [1, 2, 3, 4, 5, 6, 7], lhs, y, @@ -459,7 +473,7 @@ foreach Type = ["U", "I"] in // Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase // in the compile time. Directly handle the pattern generated by the FE here. 
foreach Type = ["U", "I"] in - let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in + let Predicates = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).Predicates in def : GCNPat < !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)), [7, 1, 2, 3, 4, 5, 6], lhs, y, @@ -596,7 +610,7 @@ let GISelPredicateCode = [{ return !MF.getInfo<SIMachineFunctionInfo>()->mayNeed class VgprMAIFrag<SDPatternOperator Op> : MAIFrag<Op, [{ return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }]>; -let Predicates = [HasMAIInsts] in { +let SubtargetPredicate = HasMAIInsts in { let isAsCheapAsAMove = 1, isReMaterializable = 1 in { defm V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>; @@ -687,7 +701,7 @@ let Predicates = [isGFX90APlus] in { } } // End Predicates = [isGFX90APlus] -let Predicates = [isGFX940Plus], is_gfx940_xdl = 1 in { +let SubtargetPredicate = isGFX940Plus, is_gfx940_xdl = 1 in { defm V_MFMA_I32_32X32X16I8 : MAIInst<"v_mfma_i32_32x32x16i8", "I32_I64_X32", int_amdgcn_mfma_i32_32x32x16_i8>; defm V_MFMA_I32_16X16X32I8 : MAIInst<"v_mfma_i32_16x16x32i8", "I32_I64_X16", int_amdgcn_mfma_i32_16x16x32_i8>; defm V_MFMA_F32_16X16X8XF32 : MAIInst<"v_mfma_f32_16x16x8xf32", "F32_V2F32_X16", int_amdgcn_mfma_f32_16x16x8_xf32>; @@ -700,7 +714,7 @@ let Predicates = [isGFX940Plus], is_gfx940_xdl = 1 in { defm V_MFMA_F32_32X32X16_BF8_FP8 : MAIInst<"v_mfma_f32_32x32x16_bf8_fp8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_bf8_fp8>; defm V_MFMA_F32_32X32X16_FP8_BF8 : MAIInst<"v_mfma_f32_32x32x16_fp8_bf8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_fp8_bf8>; defm V_MFMA_F32_32X32X16_FP8_FP8 : MAIInst<"v_mfma_f32_32x32x16_fp8_fp8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_fp8_fp8>; -} // End Predicates = [isGFX940Plus], is_gfx940_xdl = 1 +} // End SubtargetPredicate = isGFX940Plus, is_gfx940_xdl = 1 multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> { let Constraints = "$vdst = $src2", DisableEncoding = "$src2", @@ -737,12 +751,16 @@ def MAIInstInfoTable : GenericTable { let PrimaryKeyName = "getMAIInstInfoHelper"; } -let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1, isReMaterializable = 1 in { - defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>; - defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>; - defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>; +let isCommutable = 1, isReMaterializable = 1 in { + let SubtargetPredicate = HasPackedFP32Ops in { + defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>; + defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>; + defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>; + } // End SubtargetPredicate = HasPackedFP32Ops + + let SubtargetPredicate = HasPkMovB32 in defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>; -} // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 +} // End isCommutable = 1, isReMaterializable = 1 def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">; def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">; @@ -847,34 +865,25 @@ def WMMAOpcode3AddrMappingTable : WMMAMappingTable { // it converts the 
default pseudo to the pseudo where src2 is not the same as vdst. // 3) @earlyclobber on the destination satisfies the constraint during RA. -multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node = null_frag, RegisterOperand _Src01RC64 = VRegSrc_256, WMMAType Type> { +multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node = null_frag, RegisterOperand _Src01RC64 = VRegSrc_256, WMMAType Type, bit convertibleTo3Addr> { defvar WMMAConstraints2Addr = "@earlyclobber $vdst,$vdst = $src2"; defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>; - if !eq(Suffix, "_w32") then { - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { - let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in { - def _twoaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>; - } - let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in { - def _threeaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>; - } + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in { + def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>; } - def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w32), - !cast<Instruction>(NAME # _threeaddr_w32)>; - } else if !eq(Suffix, "_w64") then { + } + if convertibleTo3Addr then { let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { - let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in { - def _twoaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>; - } let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in { - def _threeaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>; + def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>; } } - def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w64), - !cast<Instruction>(NAME # _threeaddr_w64)>; + def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix), + !cast<Instruction>(NAME # _threeaddr # Suffix)>; } if !eq(Type, WMMAOpSel) then { @@ -888,21 +897,25 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator let WaveSizePredicate = isWave32 in { - defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>; - defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>; - defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>; - defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>; - defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>; - defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>; + defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 
1>; + defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>; + defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>; + defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>; + defm V_WMMA_F16_16X16X16_F16_TIED : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16_tied, VRegSrc_256, WMMAOpSel, 0>; + defm V_WMMA_BF16_16X16X16_BF16_TIED : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16_tied, VRegSrc_256, WMMAOpSel, 0>; + defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>; + defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp, 1>; } let WaveSizePredicate = isWave64 in { - defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>; - defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>; - defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>; - defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>; - defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>; - defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>; + defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>; + defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>; + defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>; + defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>; + defm V_WMMA_F16_16X16X16_F16_TIED : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16_tied, VRegSrc_256, WMMAOpSel, 0>; + defm V_WMMA_BF16_16X16X16_BF16_TIED : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16_tied, VRegSrc_256, WMMAOpSel, 0>; + defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>; + defm V_WMMA_I32_16X16X16_IU4 : 
WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp, 1>; } @@ -932,56 +945,89 @@ class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName> } //===----------------------------------------------------------------------===// -// GFX11. +// GFX11, GFX12 //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX11Plus, - DecoderNamespace = "GFX11" in { +multiclass VOP3P_Real_Base<GFXGen Gen, bits<7> op, string backing_ps_name = NAME, + string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> { + def Gen.Suffix : + VOP3P_Real_Gen<!cast<VOP3P_Pseudo>(backing_ps_name), Gen, asmName>, + VOP3Pe_gfx11_gfx12<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>; +} - multiclass VOP3P_Real_gfx11<bits<7> op, string backing_ps_name = NAME, - string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> { - def _gfx11 : VOP3P_Real<!cast<VOP3P_Pseudo>(backing_ps_name), - SIEncodingFamily.GFX11, asmName>, - VOP3Pe_gfx11<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>; - } +multiclass VOP3P_Real_with_name<GFXGen Gen, bits<7> op, + string backing_ps_name = NAME, + string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> { + defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name); + let AsmString = asmName # ps.AsmOperands in + def Gen.Suffix : + VOP3P_Real_Gen<!cast<VOP3P_Pseudo>(backing_ps_name), Gen, asmName>, + VOP3Pe_gfx11_gfx12<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>, + MnemonicAlias<ps.Mnemonic, asmName>, Requires<[Gen.AssemblerPredicate]>; +} - multiclass VOP3P_Real_dpp_gfx11<bits<7> op, string backing_ps_name = NAME, - string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> { - defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name); - def _dpp_gfx11 - : VOP3P_DPP16<op, !cast<VOP_DPP_Pseudo>(backing_ps_name #"_dpp"), - SIEncodingFamily.GFX11> { - let AsmString = asmName #ps.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPPGFX11"; - } +multiclass VOP3P_Real_dpp<GFXGen Gen, bits<7> op, string backing_ps_name = NAME, + string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> { + defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name); + def _dpp#Gen.Suffix + : VOP3P_DPP16<op, !cast<VOP_DPP_Pseudo>(backing_ps_name #"_dpp"), + Gen.Subtarget> { + let AsmString = asmName #ps.Pfl.AsmVOP3DPP16; + let DecoderNamespace = "DPP"#Gen.DecoderNamespace; + let AssemblerPredicate = Gen.AssemblerPredicate; } +} - multiclass VOP3P_Real_dpp8_gfx11<bits<7> op, string backing_ps_name = NAME, - string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> { - defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name); - def _dpp8_gfx11 : VOP3P_DPP8_Base<op, ps> { - let AsmString = asmName #ps.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8GFX11"; - } +multiclass VOP3P_Real_dpp8<GFXGen Gen, bits<7> op, string backing_ps_name = NAME, + string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> { + defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name); + def _dpp8#Gen.Suffix : VOP3P_DPP8_Base<op, ps> { + let AsmString = asmName #ps.Pfl.AsmVOP3DPP8; + let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let AssemblerPredicate = Gen.AssemblerPredicate; } +} - multiclass VOP3P_Realtriple_gfx11<bits<7> op, string backing_ps_name = NAME, - string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> - : VOP3P_Real_gfx11<op, backing_ps_name, asmName>, - VOP3P_Real_dpp_gfx11<op, backing_ps_name, asmName>, - VOP3P_Real_dpp8_gfx11<op, backing_ps_name, asmName>; -} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" +multiclass VOP3P_Realtriple<GFXGen Gen, 
bits<7> op, string backing_ps_name = NAME, + string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> + : VOP3P_Real_Base<Gen, op, backing_ps_name, asmName>, + VOP3P_Real_dpp<Gen, op, backing_ps_name, asmName>, + VOP3P_Real_dpp8<Gen, op, backing_ps_name, asmName>; -defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11 <0x16>; -defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11 <0x18>; -defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11 <0x1a>; +//===----------------------------------------------------------------------===// +// GFX12 +//===----------------------------------------------------------------------===// + +multiclass VOP3P_Real_gfx12<bits<7> op> : VOP3P_Real_Base<GFX12Gen, op>; + +multiclass VOP3P_Real_with_name_gfx12<bits<7> op, + string backing_ps_name = NAME, + string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> : + VOP3P_Real_with_name<GFX12Gen, op, backing_ps_name, asmName>; + +defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">; +defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">; + +defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>; +defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>; + +//===----------------------------------------------------------------------===// +// GFX11 +//===----------------------------------------------------------------------===// + +multiclass VOP3P_Real_gfx11_gfx12<bits<7> op> : + VOP3P_Real_Base<GFX11Gen, op>, VOP3P_Real_Base<GFX12Gen, op>; + +defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11_gfx12<0x16>; +defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11_gfx12<0x18>; +defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11_gfx12<0x1a>; multiclass VOP3P_Real_WMMA <bits<7> op> { let WaveSizePredicate = isWave32, DecoderNamespace = "GFX11" in { - defm _twoaddr_w32 : VOP3P_Real_gfx11 <op>; + defm _twoaddr_w32 : VOP3P_Real_Base <GFX11Gen, op>; } let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX11" in { - defm _twoaddr_w64 : VOP3P_Real_gfx11 <op>; + defm _twoaddr_w64 : VOP3P_Real_Base <GFX11Gen, op>; } } @@ -1034,25 +1080,23 @@ multiclass VOP3P_Real_MFMA_gfx940_aliases<string NameFrom, string NameTo, string VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(Op # "_vgprcd" # "_e64"), VOPProfile Pfl_ACD = PS_ACD.Pfl, VOPProfile Pfl_VCD = PS_VCD.Pfl> { - let Predicates = [isGFX940Plus] in { - if !ne(NameFrom, NameTo) then { - def : InstAlias <NameTo # " " # PS_ACD.AsmOperands, - (!cast<VOP3P_Real>(Op # "_gfx940_acd") Pfl_ACD.DstRC:$vdst, - Pfl_ACD.Src0RC64:$src0, Pfl_ACD.Src1RC64:$src1, Pfl_ACD.Src2RC64:$src2, - cbsz:$cbsz, abid:$abid, blgp:$blgp)>, PredicateControl; - def : InstAlias <NameTo # " " # PS_VCD.AsmOperands, - (!cast<VOP3P_Real>(Op # "_gfx940_vcd") Pfl_VCD.DstRC:$vdst, - Pfl_VCD.Src0RC64:$src0, Pfl_VCD.Src1RC64:$src1, Pfl_VCD.Src2RC64:$src2, - cbsz:$cbsz, abid:$abid, blgp:$blgp)>, PredicateControl; - } - } // End Predicates = [isGFX940Plus] + if !ne(NameFrom, NameTo) then { + def : InstAlias <NameTo # " " # PS_ACD.AsmOperands, + (!cast<VOP3P_Real>(Op # "_gfx940_acd") Pfl_ACD.DstRC:$vdst, + Pfl_ACD.Src0RC64:$src0, Pfl_ACD.Src1RC64:$src1, Pfl_ACD.Src2RC64:$src2, + cbsz:$cbsz, abid:$abid, blgp:$blgp)>, PredicateControl; + def : InstAlias <NameTo # " " # PS_VCD.AsmOperands, + (!cast<VOP3P_Real>(Op # "_gfx940_vcd") Pfl_VCD.DstRC:$vdst, + Pfl_VCD.Src0RC64:$src0, Pfl_VCD.Src1RC64:$src1, Pfl_VCD.Src2RC64:$src2, + cbsz:$cbsz, abid:$abid, blgp:$blgp)>, PredicateControl; + } } multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic, VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"), VOP3_Pseudo PS_VCD 
= !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> { let SubtargetPredicate = isGFX940Plus, - AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX940", + DecoderNamespace = "GFX940", AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in { def _gfx940_acd : VOP3P_Real<PS_ACD, SIEncodingFamily.GFX940>, VOP3Pe_MAI <op, PS_ACD.Pfl, 1>; @@ -1061,23 +1105,32 @@ multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(N VOP3Pe_MAI <op, PS_VCD.Pfl, 0>; } // End AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX940" - defm : VOP3P_Real_MFMA_gfx940_aliases<Name, PS_ACD.Mnemonic, NAME>; + let SubtargetPredicate = isGFX940Plus in { + defm : VOP3P_Real_MFMA_gfx940_aliases<Name, PS_ACD.Mnemonic, NAME>; - if !ne(!subst("_1k", "", PS_ACD.Mnemonic), PS_ACD.Mnemonic) then - defm : VOP3P_Real_MFMA_gfx940_aliases<Name, !subst("_1k", "", PS_ACD.Mnemonic), NAME>; + if !ne(!subst("_1k", "", PS_ACD.Mnemonic), PS_ACD.Mnemonic) then + defm : VOP3P_Real_MFMA_gfx940_aliases<Name, !subst("_1k", "", PS_ACD.Mnemonic), NAME>; + } } -multiclass VOP3P_Real_MFMA<bits<7> op, string GFX940Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> : - VOP3P_Real_MFMA_gfx90a <op>, - VOP3P_Real_MFMA_gfx940 <op, GFX940Name> { +multiclass VOP3P_Real_MFMA_vi<bits<7> op> { def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> { + let SubtargetPredicate = isGFX8GFX9NotGFX90A; let AssemblerPredicate = HasMAIInsts; let DecoderNamespace = "GFX8"; let Constraints = ""; } } +multiclass VOP3P_Real_MFMA_vi_gfx90a<bits<7> op> : + VOP3P_Real_MFMA_gfx90a <op>, + VOP3P_Real_MFMA_vi <op>; + +multiclass VOP3P_Real_MFMA<bits<7> op, string GFX940Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> : + VOP3P_Real_MFMA_vi_gfx90a <op>, + VOP3P_Real_MFMA_gfx940 <op, GFX940Name>; + multiclass VOP3P_Real_SMFMAC<bits<7> op, string alias> { def _gfx940 : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, VOP3Pe_SMFMAC <op> { @@ -1087,6 +1140,7 @@ multiclass VOP3P_Real_SMFMAC<bits<7> op, string alias> { def : MnemonicAlias<alias, !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic>; } +let SubtargetPredicate = isGFX8GFX9 in { defm V_PK_MAD_I16 : VOP3P_Real_vi <0x00>; defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x01>; defm V_PK_ADD_I16 : VOP3P_Real_vi <0x02>; @@ -1108,15 +1162,14 @@ defm V_PK_MUL_F16 : VOP3P_Real_vi <0x10>; defm V_PK_MIN_F16 : VOP3P_Real_vi <0x11>; defm V_PK_MAX_F16 : VOP3P_Real_vi <0x12>; - -let SubtargetPredicate = HasMadMixInsts in { +let OtherPredicates = [HasMadMixInsts] in { defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x20>; defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x21>; defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x22>; } -let SubtargetPredicate = HasFmaMixInsts in { -let DecoderNamespace = "GFX9_DL" in { +let OtherPredicates = [HasFmaMixInsts], + DecoderNamespace = "GFX9_DL" in { // The mad_mix instructions were renamed and their behaviors changed, // but the opcode stayed the same so we need to put these in a // different DecoderNamespace to avoid the ambiguity. 
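The refactor in the hunks above replaces the per-generation _gfx11 real-encoding multiclasses with ones parameterized by a GFXGen descriptor (VOP3P_Real_Base<GFXGen Gen, ...> with def Gen.Suffix) and then stacks them, so a single defm line emits encodings for every generation that shares an opcode. A minimal, self-contained sketch of the idiom follows, with hypothetical RealEnc / V_EXAMPLE names standing in for the actual LLVM classes:

    // Generation descriptor: supplies the per-target suffix (the real GFXGen
    // also carries the assembler predicate, decoder namespace and subtarget).
    class GFXGen<string suffix> { string Suffix = suffix; }
    def GFX11Gen : GFXGen<"_gfx11">;
    def GFX12Gen : GFXGen<"_gfx12">;

    // Stand-in for a real-encoding class.
    class RealEnc<bits<7> op> { bits<7> Opcode = op; }

    // One multiclass, parameterized by the generation; the def name is built
    // from the generation's suffix, so the defm prefix is prepended to it.
    multiclass Real_Base<GFXGen Gen, bits<7> op> {
      def Gen.Suffix : RealEnc<op>;
    }

    // Stacking the parameterized multiclass covers several generations at once.
    multiclass Real_gfx11_gfx12<bits<7> op> :
      Real_Base<GFX11Gen, op>, Real_Base<GFX12Gen, op>;

    defm V_EXAMPLE : Real_gfx11_gfx12<0x13>; // -> V_EXAMPLE_gfx11, V_EXAMPLE_gfx12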
@@ -1124,8 +1177,6 @@ defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x20>; defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x21>; defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x22>; } -} - defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x26>; defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x27>; @@ -1136,8 +1187,9 @@ defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x2b>; defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x28>; defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x2a>; +} // End SubtargetPredicate = isGFX8GFX9 -let SubtargetPredicate = HasMAIInsts in { +let OtherPredicates = [HasMAIInsts] in { defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x58>; defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x59>; @@ -1155,17 +1207,15 @@ defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MFMA <0x50, "v_mfma_i32_32x32x4_2b_i8"> defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MFMA <0x51, "v_mfma_i32_16x16x4_4b_i8">; defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MFMA <0x52, "v_mfma_i32_4x4x4_16b_i8">; -let SubtargetPredicate = isGFX908orGFX90A in { -defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MFMA <0x55>; -defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MFMA <0x54>; -defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MFMA <0x68>; -defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MFMA <0x69>; -defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MFMA <0x6b>; -defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MFMA <0x6c>; -defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MFMA <0x6d>; -} +defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MFMA_vi_gfx90a <0x55>; +defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MFMA_vi_gfx90a <0x54>; +defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x68>; +defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x69>; +defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x6b>; +defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x6c>; +defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x6d>; -} // End SubtargetPredicate = HasMAIInsts +} // End OtherPredicates = [HasMAIInsts] defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x63>; defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x64>; @@ -1212,12 +1262,10 @@ defm V_SMFMAC_F32_32X32X32_BF8_FP8 : VOP3P_Real_SMFMAC <0x7d, "v_smfmac_f32_32x3 defm V_SMFMAC_F32_32X32X32_FP8_BF8 : VOP3P_Real_SMFMAC <0x7e, "v_smfmac_f32_32x32x32fp8bf8">; defm V_SMFMAC_F32_32X32X32_FP8_FP8 : VOP3P_Real_SMFMAC <0x7f, "v_smfmac_f32_32x32x32fp8fp8">; -let SubtargetPredicate = HasPackedFP32Ops in { - defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>; - defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>; - defm V_PK_ADD_F32 : VOP3P_Real_vi <0x32>; - defm V_PK_MOV_B32 : VOP3P_Real_vi <0x33>; -} // End SubtargetPredicate = HasPackedFP32Ops +defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>; +defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>; +defm V_PK_ADD_F32 : VOP3P_Real_vi <0x32>; +defm V_PK_MOV_B32 : VOP3P_Real_vi <0x33>; //===----------------------------------------------------------------------===// // GFX10. 
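Several of the hunks above also hoist predicates into top-level let blocks (for example, let SubtargetPredicate = isGFX8GFX9 in { ... } around the VOP3P real definitions, and let OtherPredicates = [HasMAIInsts] in { ... } for the MFMA ones). A top-level let stamps its binding onto every record defined inside the braces, which is why the patch can move a predicate without editing each individual defm. A small sketch under a hypothetical InstBase class, using a plain string field in place of the real Predicate type:

    // Simplified stand-in for an instruction record with a predicate field.
    class InstBase { string SubtargetPredicate = ""; }

    let SubtargetPredicate = "isGFX8GFX9" in {
      def A : InstBase;   // SubtargetPredicate = "isGFX8GFX9"
      def B : InstBase;   // SubtargetPredicate = "isGFX8GFX9"
    }
    def C : InstBase;     // outside the block, keeps the default ""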
@@ -1230,41 +1278,45 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 in { } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 -multiclass VOP3P_Real_gfx10_gfx11<bits<7> op> - : VOP3P_Real_gfx10<op>, VOP3P_Real_gfx11<op>; - -multiclass VOP3P_Real_gfx10_gfx11_Triple<bits<7> op> - : VOP3P_Real_gfx10<op>, VOP3P_Realtriple_gfx11<op>; - -defm V_PK_MAD_I16 : VOP3P_Real_gfx10_gfx11<0x00>; -defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10_gfx11<0x01>; -defm V_PK_ADD_I16 : VOP3P_Real_gfx10_gfx11<0x02>; -defm V_PK_SUB_I16 : VOP3P_Real_gfx10_gfx11<0x03>; -defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10_gfx11<0x04>; -defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10_gfx11<0x05>; -defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10_gfx11<0x06>; -defm V_PK_MAX_I16 : VOP3P_Real_gfx10_gfx11<0x07>; -defm V_PK_MIN_I16 : VOP3P_Real_gfx10_gfx11<0x08>; -defm V_PK_MAD_U16 : VOP3P_Real_gfx10_gfx11<0x09>; -defm V_PK_ADD_U16 : VOP3P_Real_gfx10_gfx11<0x0a>; -defm V_PK_SUB_U16 : VOP3P_Real_gfx10_gfx11<0x0b>; -defm V_PK_MAX_U16 : VOP3P_Real_gfx10_gfx11<0x0c>; -defm V_PK_MIN_U16 : VOP3P_Real_gfx10_gfx11<0x0d>; -defm V_PK_FMA_F16 : VOP3P_Real_gfx10_gfx11<0x0e>; -defm V_PK_ADD_F16 : VOP3P_Real_gfx10_gfx11<0x0f>; -defm V_PK_MUL_F16 : VOP3P_Real_gfx10_gfx11<0x10>; +multiclass VOP3P_Real_gfx10_gfx11<bits<7> op> : + VOP3P_Real_gfx10<op>, VOP3P_Real_Base<GFX11Gen, op>; + +multiclass VOP3P_Real_gfx10_gfx11_gfx12<bits<7> op> : + VOP3P_Real_gfx10_gfx11<op>, VOP3P_Real_Base<GFX12Gen, op>; + +multiclass VOP3P_Real_gfx10_gfx11_gfx12_Triple<bits<7> op> : + VOP3P_Real_gfx10<op>, VOP3P_Realtriple<GFX11Gen, op>, + VOP3P_Realtriple<GFX12Gen, op>; + +defm V_PK_MAD_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x00>; +defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x01>; +defm V_PK_ADD_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x02>; +defm V_PK_SUB_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x03>; +defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10_gfx11_gfx12<0x04>; +defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10_gfx11_gfx12<0x05>; +defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x06>; +defm V_PK_MAX_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x07>; +defm V_PK_MIN_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x08>; +defm V_PK_MAD_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x09>; +defm V_PK_ADD_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0a>; +defm V_PK_SUB_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0b>; +defm V_PK_MAX_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0c>; +defm V_PK_MIN_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0d>; +defm V_PK_FMA_F16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0e>; +defm V_PK_ADD_F16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0f>; +defm V_PK_MUL_F16 : VOP3P_Real_gfx10_gfx11_gfx12<0x10>; defm V_PK_MIN_F16 : VOP3P_Real_gfx10_gfx11<0x11>; defm V_PK_MAX_F16 : VOP3P_Real_gfx10_gfx11<0x12>; -defm V_FMA_MIX_F32 : VOP3P_Real_gfx10_gfx11_Triple <0x20>; -defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x21>; -defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x22>; +defm V_FMA_MIX_F32 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x20>; +defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x21>; +defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x22>; defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x14>; defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>; -defm V_DOT2_F32_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x13>; -defm V_DOT4_U32_U8 : VOP3P_Real_gfx10_gfx11 <0x17>; -defm V_DOT8_U32_U4 : VOP3P_Real_gfx10_gfx11 <0x19>; +defm V_DOT2_F32_F16 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x13>; +defm V_DOT4_U32_U8 : VOP3P_Real_gfx10_gfx11_gfx12<0x17>; +defm V_DOT8_U32_U4 : 
VOP3P_Real_gfx10_gfx11_gfx12<0x19>; defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x16>; defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x18>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 6fc3d0957dce..e5b801048e6d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1081,6 +1081,8 @@ multiclass FCMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> { } } +defm : FCMP_Pattern <COND_O, V_CMP_O_F32_e64, f32>; +defm : FCMP_Pattern <COND_UO, V_CMP_U_F32_e64, f32>; defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>; defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F32_e64, f32>; defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F32_e64, f32>; @@ -1088,6 +1090,8 @@ defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F32_e64, f32>; defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F32_e64, f32>; defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F32_e64, f32>; +defm : FCMP_Pattern <COND_O, V_CMP_O_F64_e64, f64>; +defm : FCMP_Pattern <COND_UO, V_CMP_U_F64_e64, f64>; defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F64_e64, f64>; defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F64_e64, f64>; defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F64_e64, f64>; @@ -1110,6 +1114,8 @@ defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>; defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>; let OtherPredicates = [HasTrue16BitInsts] in { +defm : FCMP_Pattern <COND_O, V_CMP_O_F16_t16_e64, f16>; +defm : FCMP_Pattern <COND_UO, V_CMP_U_F16_t16_e64, f16>; defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_t16_e64, f16>; defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_t16_e64, f16>; defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_t16_e64, f16>; @@ -1126,6 +1132,8 @@ defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_t16_e64, f16>; } // End OtherPredicates = [HasTrue16BitInsts] let OtherPredicates = [NotHasTrue16BitInsts] in { +defm : FCMP_Pattern <COND_O, V_CMP_O_F16_e64, f16>; +defm : FCMP_Pattern <COND_UO, V_CMP_U_F16_e64, f16>; defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_e64, f16>; defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_e64, f16>; defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_e64, f16>; @@ -1315,53 +1323,52 @@ class VOPC64_DPP8_NoDst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// GFX11. +// GFX11, GFX12 //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX11Only in { - multiclass VOPC_Real_gfx11<bits<9> op> { +multiclass VOPC_Real_Base<GFXGen Gen, bits<9> op> { + let AssemblerPredicate = Gen.AssemblerPredicate in { defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_e32"); defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_e64"); - let DecoderNamespace = "GFX11" in { - def _e32_gfx11 : VOPC_Real<ps32, SIEncodingFamily.GFX11>, - VOPCe<op{7-0}>; - def _e64_gfx11 : VOP3_Real<ps64, SIEncodingFamily.GFX11>, - VOP3a_gfx11<{0, op}, ps64.Pfl> { + let DecoderNamespace = Gen.DecoderNamespace in { + def _e32#Gen.Suffix : VOPC_Real<ps32, Gen.Subtarget>, + VOPCe<op{7-0}>; + def _e64#Gen.Suffix : VOP3_Real<ps64, Gen.Subtarget>, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { // Encoding used for VOPC instructions encoded as VOP3 differs from // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
bits<8> sdst; let Inst{7-0} = sdst; } - } // End DecoderNamespace = "GFX11" + } // End DecoderNamespace = Gen.DecoderNamespace - defm : VOPCInstAliases<NAME, "gfx11">; + defm : VOPCInstAliases<NAME, !substr(Gen.Suffix,1)>; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPPGFX11" in { - def _e32_dpp_gfx11 : VOPC_DPP16_SIMC<op{7-0}, psDPP, - SIEncodingFamily.GFX11>; - def _e32_dpp_w32_gfx11 : VOPC_DPP16<op{7-0}, psDPP> { + let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget>; + def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> { let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - def _e32_dpp_w64_gfx11 : VOPC_DPP16<op{7-0}, psDPP> { + def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> { let AsmString = psDPP.OpName # " vcc, " # AsmDPP; let isAsmParserOnly = 1; let WaveSizePredicate = isWave64; } } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8GFX11" in { - def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32>; - def _e32_dpp8_w32_gfx11 : VOPC_DPP8<op{7-0}, ps32> { + let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { + def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32>; + def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - def _e32_dpp8_w64_gfx11 : VOPC_DPP8<op{7-0}, ps32> { + def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { let AsmString = ps32.OpName # " vcc, " # AsmDPP8; let isAsmParserOnly = 1; let WaveSizePredicate = isWave64; @@ -1371,83 +1378,84 @@ let AssemblerPredicate = isGFX11Only in { if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPPGFX11" in { - def _e64_dpp_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP>, - SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11>; - def _e64_dpp_w32_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP> { + let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { + def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>, + SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>; + def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - def _e64_dpp_w64_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP> { + def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { let AsmString = psDPP.OpName # " vcc, " # AsmDPP; let isAsmParserOnly = 1; let WaveSizePredicate = isWave64; } } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8GFX11" in { - def _e64_dpp8_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64>; - def _e64_dpp8_w32_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64> { + let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>; + def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - def _e64_dpp8_w64_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64> { + def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { let AsmString = ps32.OpName # " vcc, " # AsmDPP8; let isAsmParserOnly = 1; let WaveSizePredicate = isWave64; } } } + } // AssemblerPredicate = Gen.AssemblerPredicate +} - 
} - - multiclass VOPC_Real_with_name_gfx11<bits<9> op, string OpName, - string asm_name, string pseudo_mnemonic = ""> { +multiclass VOPC_Real_with_name<GFXGen Gen, bits<9> op, string OpName, + string asm_name, string pseudo_mnemonic = ""> { + let AssemblerPredicate = Gen.AssemblerPredicate in { defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_e32"); defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_e64"); - let DecoderNamespace = "GFX11" in { - def _e32_gfx11 : + let DecoderNamespace = Gen.DecoderNamespace in { + def _e32#Gen.Suffix : // 32 and 64 bit forms of the instruction have _e32 and _e64 // respectively appended to their assembly mnemonic. // _e64 is printed as part of the VOPDstS64orS32 operand, whereas // the destination-less 32bit forms add it to the asmString here. - VOPC_Real<ps32, SIEncodingFamily.GFX11, asm_name#"_e32">, + VOPC_Real<ps32, Gen.Subtarget, asm_name#"_e32">, VOPCe<op{7-0}>, MnemonicAlias<!if(!empty(pseudo_mnemonic), ps32.Mnemonic, pseudo_mnemonic), asm_name, ps32.AsmVariantName>, - Requires<[isGFX11Plus]>; - def _e64_gfx11 : - VOP3_Real<ps64, SIEncodingFamily.GFX11, asm_name>, - VOP3a_gfx11<{0, op}, ps64.Pfl>, + Requires<[Gen.AssemblerPredicate]>; + def _e64#Gen.Suffix : + VOP3_Real<ps64, Gen.Subtarget, asm_name>, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>, MnemonicAlias<!if(!empty(pseudo_mnemonic), ps64.Mnemonic, pseudo_mnemonic), asm_name, ps64.AsmVariantName>, - Requires<[isGFX11Plus]> { + Requires<[Gen.AssemblerPredicate]> { // Encoding used for VOPC instructions encoded as VOP3 differs from // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. bits<8> sdst; let Inst{7-0} = sdst; } - } // End DecoderNamespace = "GFX11" + } // End DecoderNamespace = Gen.DecoderNamespace - defm : VOPCInstAliases<OpName, "gfx11", NAME, asm_name>; + defm : VOPCInstAliases<OpName, !substr(Gen.Suffix, 1), NAME, asm_name>; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPPGFX11" in { - def _e32_dpp_gfx11 : VOPC_DPP16_SIMC<op{7-0}, psDPP, - SIEncodingFamily.GFX11, asm_name>; - def _e32_dpp_w32_gfx11 + let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, + Gen.Subtarget, asm_name>; + def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP, asm_name> { let AsmString = asm_name # " vcc_lo, " # AsmDPP; let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - def _e32_dpp_w64_gfx11 + def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP, asm_name> { let AsmString = asm_name # " vcc, " # AsmDPP; let isAsmParserOnly = 1; @@ -1455,15 +1463,15 @@ let AssemblerPredicate = isGFX11Only in { } } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8GFX11" in { - def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32, asm_name>; - def _e32_dpp8_w32_gfx11 + let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { + def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>; + def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name> { let AsmString = asm_name # " vcc_lo, " # AsmDPP8; let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - def _e32_dpp8_w64_gfx11 + def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name> { let AsmString = asm_name # " vcc, " # AsmDPP8; let isAsmParserOnly = 1; @@ -1475,16 +1483,16 @@ let AssemblerPredicate = isGFX11Only in { if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let 
DecoderNamespace = "DPPGFX11" in { - def _e64_dpp_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>, - SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11>; - def _e64_dpp_w32_gfx11 + let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { + def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>, + SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>; + def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { let AsmString = asm_name # " vcc_lo, " # AsmDPP; let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - def _e64_dpp_w64_gfx11 + def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { let AsmString = asm_name # " vcc, " # AsmDPP; let isAsmParserOnly = 1; @@ -1492,15 +1500,15 @@ let AssemblerPredicate = isGFX11Only in { } } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8GFX11" in { - def _e64_dpp8_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>; - def _e64_dpp8_w32_gfx11 + let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>; + def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { let AsmString = asm_name # " vcc_lo, " # AsmDPP8; let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - def _e64_dpp8_w64_gfx11 + def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { let AsmString = asm_name # " vcc, " # AsmDPP8; let isAsmParserOnly = 1; @@ -1508,44 +1516,47 @@ let AssemblerPredicate = isGFX11Only in { } } } - } + } // AssemblerPredicate = Gen.AssemblerPredicate +} - multiclass VOPC_Real_t16_gfx11<bits<9> op, string asm_name, - string OpName = NAME> : VOPC_Real_with_name_gfx11<op, OpName, asm_name>; +multiclass VOPC_Real_t16<GFXGen Gen, bits<9> op, string asm_name, + string OpName = NAME, string pseudo_mnemonic = ""> : + VOPC_Real_with_name<Gen, op, OpName, asm_name, pseudo_mnemonic>; - multiclass VOPCX_Real_gfx11<bits<9> op> { +multiclass VOPCX_Real<GFXGen Gen, bits<9> op> { + let AssemblerPredicate = Gen.AssemblerPredicate in { defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_nosdst_e32"); defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_nosdst_e64"); - let DecoderNamespace = "GFX11" in { - def _e32_gfx11 : - VOPC_Real<ps32, SIEncodingFamily.GFX11>, + let DecoderNamespace = Gen.DecoderNamespace in { + def _e32#Gen.Suffix : + VOPC_Real<ps32, Gen.Subtarget>, VOPCe<op{7-0}> { let AsmString = !subst("_nosdst", "", ps32.PseudoInstr) # " " # ps32.AsmOperands; } - def _e64_gfx11 : - VOP3_Real<ps64, SIEncodingFamily.GFX11>, - VOP3a_gfx11<{0, op}, ps64.Pfl> { + def _e64#Gen.Suffix : + VOP3_Real<ps64, Gen.Subtarget>, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { let Inst{7-0} = ?; // sdst let AsmString = !subst("_nosdst", "", ps64.Mnemonic) # "{_e64} " # ps64.AsmOperands; } - } // End DecoderNamespace = "GFX11" + } // End DecoderNamespace = Gen.DecoderNamespace - defm : VOPCXInstAliases<NAME, "gfx11">; + defm : VOPCXInstAliases<NAME, !substr(Gen.Suffix, 1)>; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPPGFX11" in { - def _e32_dpp_gfx11 - : VOPC_DPP16_SIMC<op{7-0}, psDPP, SIEncodingFamily.GFX11> { + let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { + def _e32_dpp#Gen.Suffix + : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget> { let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP; } } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8GFX11" in { - def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, 
ps32> { + let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { + def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> { let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8; } } @@ -1554,268 +1565,305 @@ let AssemblerPredicate = isGFX11Only in { if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPPGFX11" in { - def _e64_dpp_gfx11 + let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { + def _e64_dpp#Gen.Suffix : VOPC64_DPP16_NoDst<{0, op}, psDPP>, - SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11> { + SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> { let AsmString = !subst("_nosdst", "", psDPP.OpName) # "{_e64_dpp} " # AsmDPP; } } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8GFX11" in { - def _e64_dpp8_gfx11 : VOPC64_DPP8_NoDst<{0, op}, ps64> { + let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> { let AsmString = !subst("_nosdst", "", ps64.OpName) # "{_e64_dpp} " # AsmDPP8; } } } - } + } // AssemblerPredicate = Gen.AssemblerPredicate +} - multiclass VOPCX_Real_with_name_gfx11<bits<9> op, string OpName, - string asm_name, string pseudo_mnemonic = ""> { +multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName, + string asm_name, string pseudo_mnemonic = ""> { + let AssemblerPredicate = Gen.AssemblerPredicate in { defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_nosdst_e32"); defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_nosdst_e64"); - let DecoderNamespace = "GFX11" in { - def _e32_gfx11 - : VOPC_Real<ps32, SIEncodingFamily.GFX11, asm_name>, + let DecoderNamespace = Gen.DecoderNamespace in { + def _e32#Gen.Suffix + : VOPC_Real<ps32, Gen.Subtarget, asm_name>, MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps32.Mnemonic), pseudo_mnemonic), asm_name, ps32.AsmVariantName>, - Requires<[isGFX11Plus]>, + Requires<[Gen.AssemblerPredicate]>, VOPCe<op{7-0}> { let AsmString = asm_name # "{_e32} " # ps32.AsmOperands; } - def _e64_gfx11 - : VOP3_Real<ps64, SIEncodingFamily.GFX11, asm_name>, + def _e64#Gen.Suffix + : VOP3_Real<ps64, Gen.Subtarget, asm_name>, MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps64.Mnemonic), pseudo_mnemonic), asm_name, ps64.AsmVariantName>, - Requires<[isGFX11Plus]>, - VOP3a_gfx11<{0, op}, ps64.Pfl> { + Requires<[Gen.AssemblerPredicate]>, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { let Inst{7-0} = ? 
; // sdst let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; } - } // End DecoderNamespace = "GFX11" + } // End DecoderNamespace = Gen.DecoderNamespace - defm : VOPCXInstAliases<OpName, "gfx11", NAME, asm_name>; + defm : VOPCXInstAliases<OpName, !substr(Gen.Suffix, 1), NAME, asm_name>; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e32"#"_dpp"); - let DecoderNamespace = "DPPGFX11" in { - def _e32_dpp_gfx11 : VOPC_DPP16_SIMC<op{7-0}, psDPP, - SIEncodingFamily.GFX11, asm_name>; + let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, + Gen.Subtarget, asm_name>; } - let DecoderNamespace = "DPP8GFX11" in { - def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32, asm_name>; + let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { + def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>; } } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e64"#"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPPGFX11" in { - def _e64_dpp_gfx11 + let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { + def _e64_dpp#Gen.Suffix : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>, - SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11> { + SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> { let AsmString = asm_name # "{_e64_dpp} " # AsmDPP; } } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8GFX11" in { - def _e64_dpp8_gfx11 : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> { + let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> { let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8; } } } - } + } // AssemblerPredicate = Gen.AssemblerPredicate +} - multiclass VOPCX_Real_t16_gfx11<bits<9> op, string asm_name, - string OpName = NAME> : VOPCX_Real_with_name_gfx11<op, OpName, asm_name>; +multiclass VOPCX_Real_t16<GFXGen Gen, bits<9> op, string asm_name, + string OpName = NAME, string pseudo_mnemonic = ""> : + VOPCX_Real_with_name<Gen, op, OpName, asm_name, pseudo_mnemonic>; +multiclass VOPC_Real_gfx11<bits<9> op> : VOPC_Real_Base<GFX11Gen, op>; -} // End AssemblerPredicate = isGFX11Only +multiclass VOPC_Real_with_name_gfx11<bits<9> op, string OpName, string asm_name, + string pseudo_mnemonic = ""> + : VOPC_Real_with_name<GFX11Gen, op, OpName, asm_name, pseudo_mnemonic>; + +multiclass VOPCX_Real_gfx11<bits<9> op> : VOPCX_Real<GFX11Gen, op>; + +multiclass VOPCX_Real_with_name_gfx11<bits<9> op, string OpName, + string asm_name, string pseudo_mnemonic = ""> : + VOPCX_Real_with_name<GFX11Gen, op, OpName, asm_name, pseudo_mnemonic>; + +multiclass VOPC_Real_gfx11_gfx12<bits<9> op> : + VOPC_Real_Base<GFX11Gen, op>, VOPC_Real_Base<GFX12Gen, op>; + +multiclass VOPCX_Real_gfx11_gfx12<bits<9> op> : + VOPCX_Real<GFX11Gen, op>, VOPCX_Real<GFX12Gen, op>; + +multiclass VOPC_Real_t16_gfx11<bits <9> op, string asm_name, + string OpName = NAME, string pseudo_mnemonic = ""> : + VOPC_Real_t16<GFX11Gen, op, asm_name, OpName, pseudo_mnemonic>; + +multiclass VOPC_Real_t16_gfx11_gfx12<bits <9> op, string asm_name, + string OpName = NAME, string pseudo_mnemonic = ""> : + VOPC_Real_t16<GFX11Gen, op, asm_name, OpName, pseudo_mnemonic>, + VOPC_Real_t16<GFX12Gen, op, asm_name, OpName, pseudo_mnemonic>; + +multiclass VOPCX_Real_t16_gfx11<bits<9> op, string asm_name, + string OpName = NAME, string pseudo_mnemonic = ""> : + VOPCX_Real_t16<GFX11Gen, op, asm_name, OpName, pseudo_mnemonic>; + +multiclass 
VOPCX_Real_t16_gfx11_gfx12<bits<9> op, string asm_name, + string OpName = NAME, string pseudo_mnemonic = ""> : + VOPCX_Real_t16<GFX11Gen, op, asm_name, OpName, pseudo_mnemonic>, + VOPCX_Real_t16<GFX12Gen, op, asm_name, OpName, pseudo_mnemonic>; defm V_CMP_F_F16_t16 : VOPC_Real_t16_gfx11<0x000, "v_cmp_f_f16">; -defm V_CMP_LT_F16_t16 : VOPC_Real_t16_gfx11<0x001, "v_cmp_lt_f16">; -defm V_CMP_EQ_F16_t16 : VOPC_Real_t16_gfx11<0x002, "v_cmp_eq_f16">; -defm V_CMP_LE_F16_t16 : VOPC_Real_t16_gfx11<0x003, "v_cmp_le_f16">; -defm V_CMP_GT_F16_t16 : VOPC_Real_t16_gfx11<0x004, "v_cmp_gt_f16">; -defm V_CMP_LG_F16_t16 : VOPC_Real_t16_gfx11<0x005, "v_cmp_lg_f16">; -defm V_CMP_GE_F16_t16 : VOPC_Real_t16_gfx11<0x006, "v_cmp_ge_f16">; -defm V_CMP_O_F16_t16 : VOPC_Real_t16_gfx11<0x007, "v_cmp_o_f16">; -defm V_CMP_U_F16_t16 : VOPC_Real_t16_gfx11<0x008, "v_cmp_u_f16">; -defm V_CMP_NGE_F16_t16 : VOPC_Real_t16_gfx11<0x009, "v_cmp_nge_f16">; -defm V_CMP_NLG_F16_t16 : VOPC_Real_t16_gfx11<0x00a, "v_cmp_nlg_f16">; -defm V_CMP_NGT_F16_t16 : VOPC_Real_t16_gfx11<0x00b, "v_cmp_ngt_f16">; -defm V_CMP_NLE_F16_t16 : VOPC_Real_t16_gfx11<0x00c, "v_cmp_nle_f16">; -defm V_CMP_NEQ_F16_t16 : VOPC_Real_t16_gfx11<0x00d, "v_cmp_neq_f16">; -defm V_CMP_NLT_F16_t16 : VOPC_Real_t16_gfx11<0x00e, "v_cmp_nlt_f16">; +defm V_CMP_LT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x001, "v_cmp_lt_f16">; +defm V_CMP_EQ_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x002, "v_cmp_eq_f16">; +defm V_CMP_LE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x003, "v_cmp_le_f16">; +defm V_CMP_GT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x004, "v_cmp_gt_f16">; +defm V_CMP_LG_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x005, "v_cmp_lg_f16">; +defm V_CMP_GE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x006, "v_cmp_ge_f16">; +defm V_CMP_O_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x007, "v_cmp_o_f16">; +defm V_CMP_U_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x008, "v_cmp_u_f16">; +defm V_CMP_NGE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x009, "v_cmp_nge_f16">; +defm V_CMP_NLG_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00a, "v_cmp_nlg_f16">; +defm V_CMP_NGT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00b, "v_cmp_ngt_f16">; +defm V_CMP_NLE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00c, "v_cmp_nle_f16">; +defm V_CMP_NEQ_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00d, "v_cmp_neq_f16">; +defm V_CMP_NLT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00e, "v_cmp_nlt_f16">; defm V_CMP_T_F16_t16 : VOPC_Real_with_name_gfx11<0x00f, "V_CMP_TRU_F16_t16", "v_cmp_t_f16", "v_cmp_tru_f16">; defm V_CMP_F_F32 : VOPC_Real_gfx11<0x010>; -defm V_CMP_LT_F32 : VOPC_Real_gfx11<0x011>; -defm V_CMP_EQ_F32 : VOPC_Real_gfx11<0x012>; -defm V_CMP_LE_F32 : VOPC_Real_gfx11<0x013>; -defm V_CMP_GT_F32 : VOPC_Real_gfx11<0x014>; -defm V_CMP_LG_F32 : VOPC_Real_gfx11<0x015>; -defm V_CMP_GE_F32 : VOPC_Real_gfx11<0x016>; -defm V_CMP_O_F32 : VOPC_Real_gfx11<0x017>; -defm V_CMP_U_F32 : VOPC_Real_gfx11<0x018>; -defm V_CMP_NGE_F32 : VOPC_Real_gfx11<0x019>; -defm V_CMP_NLG_F32 : VOPC_Real_gfx11<0x01a>; -defm V_CMP_NGT_F32 : VOPC_Real_gfx11<0x01b>; -defm V_CMP_NLE_F32 : VOPC_Real_gfx11<0x01c>; -defm V_CMP_NEQ_F32 : VOPC_Real_gfx11<0x01d>; -defm V_CMP_NLT_F32 : VOPC_Real_gfx11<0x01e>; +defm V_CMP_LT_F32 : VOPC_Real_gfx11_gfx12<0x011>; +defm V_CMP_EQ_F32 : VOPC_Real_gfx11_gfx12<0x012>; +defm V_CMP_LE_F32 : VOPC_Real_gfx11_gfx12<0x013>; +defm V_CMP_GT_F32 : VOPC_Real_gfx11_gfx12<0x014>; +defm V_CMP_LG_F32 : VOPC_Real_gfx11_gfx12<0x015>; +defm V_CMP_GE_F32 : VOPC_Real_gfx11_gfx12<0x016>; +defm V_CMP_O_F32 : VOPC_Real_gfx11_gfx12<0x017>; +defm V_CMP_U_F32 : VOPC_Real_gfx11_gfx12<0x018>; +defm 
V_CMP_NGE_F32 : VOPC_Real_gfx11_gfx12<0x019>; +defm V_CMP_NLG_F32 : VOPC_Real_gfx11_gfx12<0x01a>; +defm V_CMP_NGT_F32 : VOPC_Real_gfx11_gfx12<0x01b>; +defm V_CMP_NLE_F32 : VOPC_Real_gfx11_gfx12<0x01c>; +defm V_CMP_NEQ_F32 : VOPC_Real_gfx11_gfx12<0x01d>; +defm V_CMP_NLT_F32 : VOPC_Real_gfx11_gfx12<0x01e>; defm V_CMP_T_F32 : VOPC_Real_with_name_gfx11<0x01f, "V_CMP_TRU_F32", "v_cmp_t_f32">; defm V_CMP_T_F64 : VOPC_Real_with_name_gfx11<0x02f, "V_CMP_TRU_F64", "v_cmp_t_f64">; -defm V_CMP_LT_I16_t16 : VOPC_Real_t16_gfx11<0x031, "v_cmp_lt_i16">; -defm V_CMP_EQ_I16_t16 : VOPC_Real_t16_gfx11<0x032, "v_cmp_eq_i16">; -defm V_CMP_LE_I16_t16 : VOPC_Real_t16_gfx11<0x033, "v_cmp_le_i16">; -defm V_CMP_GT_I16_t16 : VOPC_Real_t16_gfx11<0x034, "v_cmp_gt_i16">; -defm V_CMP_NE_I16_t16 : VOPC_Real_t16_gfx11<0x035, "v_cmp_ne_i16">; -defm V_CMP_GE_I16_t16 : VOPC_Real_t16_gfx11<0x036, "v_cmp_ge_i16">; -defm V_CMP_LT_U16_t16 : VOPC_Real_t16_gfx11<0x039, "v_cmp_lt_u16">; -defm V_CMP_EQ_U16_t16 : VOPC_Real_t16_gfx11<0x03a, "v_cmp_eq_u16">; -defm V_CMP_LE_U16_t16 : VOPC_Real_t16_gfx11<0x03b, "v_cmp_le_u16">; -defm V_CMP_GT_U16_t16 : VOPC_Real_t16_gfx11<0x03c, "v_cmp_gt_u16">; -defm V_CMP_NE_U16_t16 : VOPC_Real_t16_gfx11<0x03d, "v_cmp_ne_u16">; -defm V_CMP_GE_U16_t16 : VOPC_Real_t16_gfx11<0x03e, "v_cmp_ge_u16">; +defm V_CMP_LT_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x031, "v_cmp_lt_i16">; +defm V_CMP_EQ_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x032, "v_cmp_eq_i16">; +defm V_CMP_LE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x033, "v_cmp_le_i16">; +defm V_CMP_GT_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x034, "v_cmp_gt_i16">; +defm V_CMP_NE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x035, "v_cmp_ne_i16">; +defm V_CMP_GE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x036, "v_cmp_ge_i16">; +defm V_CMP_LT_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x039, "v_cmp_lt_u16">; +defm V_CMP_EQ_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03a, "v_cmp_eq_u16">; +defm V_CMP_LE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03b, "v_cmp_le_u16">; +defm V_CMP_GT_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03c, "v_cmp_gt_u16">; +defm V_CMP_NE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03d, "v_cmp_ne_u16">; +defm V_CMP_GE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03e, "v_cmp_ge_u16">; defm V_CMP_F_I32 : VOPC_Real_gfx11<0x040>; -defm V_CMP_LT_I32 : VOPC_Real_gfx11<0x041>; -defm V_CMP_EQ_I32 : VOPC_Real_gfx11<0x042>; -defm V_CMP_LE_I32 : VOPC_Real_gfx11<0x043>; -defm V_CMP_GT_I32 : VOPC_Real_gfx11<0x044>; -defm V_CMP_NE_I32 : VOPC_Real_gfx11<0x045>; -defm V_CMP_GE_I32 : VOPC_Real_gfx11<0x046>; +defm V_CMP_LT_I32 : VOPC_Real_gfx11_gfx12<0x041>; +defm V_CMP_EQ_I32 : VOPC_Real_gfx11_gfx12<0x042>; +defm V_CMP_LE_I32 : VOPC_Real_gfx11_gfx12<0x043>; +defm V_CMP_GT_I32 : VOPC_Real_gfx11_gfx12<0x044>; +defm V_CMP_NE_I32 : VOPC_Real_gfx11_gfx12<0x045>; +defm V_CMP_GE_I32 : VOPC_Real_gfx11_gfx12<0x046>; defm V_CMP_T_I32 : VOPC_Real_gfx11<0x047>; defm V_CMP_F_U32 : VOPC_Real_gfx11<0x048>; -defm V_CMP_LT_U32 : VOPC_Real_gfx11<0x049>; -defm V_CMP_EQ_U32 : VOPC_Real_gfx11<0x04a>; -defm V_CMP_LE_U32 : VOPC_Real_gfx11<0x04b>; -defm V_CMP_GT_U32 : VOPC_Real_gfx11<0x04c>; -defm V_CMP_NE_U32 : VOPC_Real_gfx11<0x04d>; -defm V_CMP_GE_U32 : VOPC_Real_gfx11<0x04e>; +defm V_CMP_LT_U32 : VOPC_Real_gfx11_gfx12<0x049>; +defm V_CMP_EQ_U32 : VOPC_Real_gfx11_gfx12<0x04a>; +defm V_CMP_LE_U32 : VOPC_Real_gfx11_gfx12<0x04b>; +defm V_CMP_GT_U32 : VOPC_Real_gfx11_gfx12<0x04c>; +defm V_CMP_NE_U32 : VOPC_Real_gfx11_gfx12<0x04d>; +defm V_CMP_GE_U32 : VOPC_Real_gfx11_gfx12<0x04e>; defm V_CMP_T_U32 : VOPC_Real_gfx11<0x04f>; defm V_CMP_F_I64 : 
VOPC_Real_gfx11<0x050>; -defm V_CMP_LT_I64 : VOPC_Real_gfx11<0x051>; -defm V_CMP_EQ_I64 : VOPC_Real_gfx11<0x052>; -defm V_CMP_LE_I64 : VOPC_Real_gfx11<0x053>; -defm V_CMP_GT_I64 : VOPC_Real_gfx11<0x054>; -defm V_CMP_NE_I64 : VOPC_Real_gfx11<0x055>; -defm V_CMP_GE_I64 : VOPC_Real_gfx11<0x056>; +defm V_CMP_LT_I64 : VOPC_Real_gfx11_gfx12<0x051>; +defm V_CMP_EQ_I64 : VOPC_Real_gfx11_gfx12<0x052>; +defm V_CMP_LE_I64 : VOPC_Real_gfx11_gfx12<0x053>; +defm V_CMP_GT_I64 : VOPC_Real_gfx11_gfx12<0x054>; +defm V_CMP_NE_I64 : VOPC_Real_gfx11_gfx12<0x055>; +defm V_CMP_GE_I64 : VOPC_Real_gfx11_gfx12<0x056>; defm V_CMP_T_I64 : VOPC_Real_gfx11<0x057>; defm V_CMP_F_U64 : VOPC_Real_gfx11<0x058>; -defm V_CMP_LT_U64 : VOPC_Real_gfx11<0x059>; -defm V_CMP_EQ_U64 : VOPC_Real_gfx11<0x05a>; -defm V_CMP_LE_U64 : VOPC_Real_gfx11<0x05b>; -defm V_CMP_GT_U64 : VOPC_Real_gfx11<0x05c>; -defm V_CMP_NE_U64 : VOPC_Real_gfx11<0x05d>; -defm V_CMP_GE_U64 : VOPC_Real_gfx11<0x05e>; +defm V_CMP_LT_U64 : VOPC_Real_gfx11_gfx12<0x059>; +defm V_CMP_EQ_U64 : VOPC_Real_gfx11_gfx12<0x05a>; +defm V_CMP_LE_U64 : VOPC_Real_gfx11_gfx12<0x05b>; +defm V_CMP_GT_U64 : VOPC_Real_gfx11_gfx12<0x05c>; +defm V_CMP_NE_U64 : VOPC_Real_gfx11_gfx12<0x05d>; +defm V_CMP_GE_U64 : VOPC_Real_gfx11_gfx12<0x05e>; defm V_CMP_T_U64 : VOPC_Real_gfx11<0x05f>; -defm V_CMP_CLASS_F16_t16 : VOPC_Real_t16_gfx11<0x07d, "v_cmp_class_f16">; -defm V_CMP_CLASS_F32 : VOPC_Real_gfx11<0x07e>; -defm V_CMP_CLASS_F64 : VOPC_Real_gfx11<0x07f>; +defm V_CMP_CLASS_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x07d, "v_cmp_class_f16">; +defm V_CMP_CLASS_F32 : VOPC_Real_gfx11_gfx12<0x07e>; +defm V_CMP_CLASS_F64 : VOPC_Real_gfx11_gfx12<0x07f>; defm V_CMPX_F_F16_t16 : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">; -defm V_CMPX_LT_F16_t16 : VOPCX_Real_t16_gfx11<0x081, "v_cmpx_lt_f16">; -defm V_CMPX_EQ_F16_t16 : VOPCX_Real_t16_gfx11<0x082, "v_cmpx_eq_f16">; -defm V_CMPX_LE_F16_t16 : VOPCX_Real_t16_gfx11<0x083, "v_cmpx_le_f16">; -defm V_CMPX_GT_F16_t16 : VOPCX_Real_t16_gfx11<0x084, "v_cmpx_gt_f16">; -defm V_CMPX_LG_F16_t16 : VOPCX_Real_t16_gfx11<0x085, "v_cmpx_lg_f16">; -defm V_CMPX_GE_F16_t16 : VOPCX_Real_t16_gfx11<0x086, "v_cmpx_ge_f16">; -defm V_CMPX_O_F16_t16 : VOPCX_Real_t16_gfx11<0x087, "v_cmpx_o_f16">; -defm V_CMPX_U_F16_t16 : VOPCX_Real_t16_gfx11<0x088, "v_cmpx_u_f16">; -defm V_CMPX_NGE_F16_t16 : VOPCX_Real_t16_gfx11<0x089, "v_cmpx_nge_f16">; -defm V_CMPX_NLG_F16_t16 : VOPCX_Real_t16_gfx11<0x08a, "v_cmpx_nlg_f16">; -defm V_CMPX_NGT_F16_t16 : VOPCX_Real_t16_gfx11<0x08b, "v_cmpx_ngt_f16">; -defm V_CMPX_NLE_F16_t16 : VOPCX_Real_t16_gfx11<0x08c, "v_cmpx_nle_f16">; -defm V_CMPX_NEQ_F16_t16 : VOPCX_Real_t16_gfx11<0x08d, "v_cmpx_neq_f16">; -defm V_CMPX_NLT_F16_t16 : VOPCX_Real_t16_gfx11<0x08e, "v_cmpx_nlt_f16">; +defm V_CMPX_LT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">; +defm V_CMPX_EQ_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">; +defm V_CMPX_LE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x083, "v_cmpx_le_f16">; +defm V_CMPX_GT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x084, "v_cmpx_gt_f16">; +defm V_CMPX_LG_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x085, "v_cmpx_lg_f16">; +defm V_CMPX_GE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x086, "v_cmpx_ge_f16">; +defm V_CMPX_O_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x087, "v_cmpx_o_f16">; +defm V_CMPX_U_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x088, "v_cmpx_u_f16">; +defm V_CMPX_NGE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x089, "v_cmpx_nge_f16">; +defm V_CMPX_NLG_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08a, "v_cmpx_nlg_f16">; +defm V_CMPX_NGT_F16_t16 
: VOPCX_Real_t16_gfx11_gfx12<0x08b, "v_cmpx_ngt_f16">; +defm V_CMPX_NLE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08c, "v_cmpx_nle_f16">; +defm V_CMPX_NEQ_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08d, "v_cmpx_neq_f16">; +defm V_CMPX_NLT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08e, "v_cmpx_nlt_f16">; defm V_CMPX_T_F16_t16 : VOPCX_Real_with_name_gfx11<0x08f, "V_CMPX_TRU_F16_t16", "v_cmpx_t_f16", "v_cmpx_tru_f16">; defm V_CMPX_F_F32 : VOPCX_Real_gfx11<0x090>; -defm V_CMPX_LT_F32 : VOPCX_Real_gfx11<0x091>; -defm V_CMPX_EQ_F32 : VOPCX_Real_gfx11<0x092>; -defm V_CMPX_LE_F32 : VOPCX_Real_gfx11<0x093>; -defm V_CMPX_GT_F32 : VOPCX_Real_gfx11<0x094>; -defm V_CMPX_LG_F32 : VOPCX_Real_gfx11<0x095>; -defm V_CMPX_GE_F32 : VOPCX_Real_gfx11<0x096>; -defm V_CMPX_O_F32 : VOPCX_Real_gfx11<0x097>; -defm V_CMPX_U_F32 : VOPCX_Real_gfx11<0x098>; -defm V_CMPX_NGE_F32 : VOPCX_Real_gfx11<0x099>; -defm V_CMPX_NLG_F32 : VOPCX_Real_gfx11<0x09a>; -defm V_CMPX_NGT_F32 : VOPCX_Real_gfx11<0x09b>; -defm V_CMPX_NLE_F32 : VOPCX_Real_gfx11<0x09c>; -defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx11<0x09d>; -defm V_CMPX_NLT_F32 : VOPCX_Real_gfx11<0x09e>; +defm V_CMPX_LT_F32 : VOPCX_Real_gfx11_gfx12<0x091>; +defm V_CMPX_EQ_F32 : VOPCX_Real_gfx11_gfx12<0x092>; +defm V_CMPX_LE_F32 : VOPCX_Real_gfx11_gfx12<0x093>; +defm V_CMPX_GT_F32 : VOPCX_Real_gfx11_gfx12<0x094>; +defm V_CMPX_LG_F32 : VOPCX_Real_gfx11_gfx12<0x095>; +defm V_CMPX_GE_F32 : VOPCX_Real_gfx11_gfx12<0x096>; +defm V_CMPX_O_F32 : VOPCX_Real_gfx11_gfx12<0x097>; +defm V_CMPX_U_F32 : VOPCX_Real_gfx11_gfx12<0x098>; +defm V_CMPX_NGE_F32 : VOPCX_Real_gfx11_gfx12<0x099>; +defm V_CMPX_NLG_F32 : VOPCX_Real_gfx11_gfx12<0x09a>; +defm V_CMPX_NGT_F32 : VOPCX_Real_gfx11_gfx12<0x09b>; +defm V_CMPX_NLE_F32 : VOPCX_Real_gfx11_gfx12<0x09c>; +defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx11_gfx12<0x09d>; +defm V_CMPX_NLT_F32 : VOPCX_Real_gfx11_gfx12<0x09e>; defm V_CMPX_T_F32 : VOPCX_Real_with_name_gfx11<0x09f, "V_CMPX_TRU_F32", "v_cmpx_t_f32">; defm V_CMPX_F_F64 : VOPCX_Real_gfx11<0x0a0>; -defm V_CMPX_LT_F64 : VOPCX_Real_gfx11<0x0a1>; -defm V_CMPX_EQ_F64 : VOPCX_Real_gfx11<0x0a2>; -defm V_CMPX_LE_F64 : VOPCX_Real_gfx11<0x0a3>; -defm V_CMPX_GT_F64 : VOPCX_Real_gfx11<0x0a4>; -defm V_CMPX_LG_F64 : VOPCX_Real_gfx11<0x0a5>; -defm V_CMPX_GE_F64 : VOPCX_Real_gfx11<0x0a6>; -defm V_CMPX_O_F64 : VOPCX_Real_gfx11<0x0a7>; -defm V_CMPX_U_F64 : VOPCX_Real_gfx11<0x0a8>; -defm V_CMPX_NGE_F64 : VOPCX_Real_gfx11<0x0a9>; -defm V_CMPX_NLG_F64 : VOPCX_Real_gfx11<0x0aa>; -defm V_CMPX_NGT_F64 : VOPCX_Real_gfx11<0x0ab>; -defm V_CMPX_NLE_F64 : VOPCX_Real_gfx11<0x0ac>; -defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx11<0x0ad>; -defm V_CMPX_NLT_F64 : VOPCX_Real_gfx11<0x0ae>; +defm V_CMPX_LT_F64 : VOPCX_Real_gfx11_gfx12<0x0a1>; +defm V_CMPX_EQ_F64 : VOPCX_Real_gfx11_gfx12<0x0a2>; +defm V_CMPX_LE_F64 : VOPCX_Real_gfx11_gfx12<0x0a3>; +defm V_CMPX_GT_F64 : VOPCX_Real_gfx11_gfx12<0x0a4>; +defm V_CMPX_LG_F64 : VOPCX_Real_gfx11_gfx12<0x0a5>; +defm V_CMPX_GE_F64 : VOPCX_Real_gfx11_gfx12<0x0a6>; +defm V_CMPX_O_F64 : VOPCX_Real_gfx11_gfx12<0x0a7>; +defm V_CMPX_U_F64 : VOPCX_Real_gfx11_gfx12<0x0a8>; +defm V_CMPX_NGE_F64 : VOPCX_Real_gfx11_gfx12<0x0a9>; +defm V_CMPX_NLG_F64 : VOPCX_Real_gfx11_gfx12<0x0aa>; +defm V_CMPX_NGT_F64 : VOPCX_Real_gfx11_gfx12<0x0ab>; +defm V_CMPX_NLE_F64 : VOPCX_Real_gfx11_gfx12<0x0ac>; +defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx11_gfx12<0x0ad>; +defm V_CMPX_NLT_F64 : VOPCX_Real_gfx11_gfx12<0x0ae>; defm V_CMPX_T_F64 : VOPCX_Real_with_name_gfx11<0x0af, "V_CMPX_TRU_F64", "v_cmpx_t_f64">; -defm V_CMPX_LT_I16_t16 : VOPCX_Real_t16_gfx11<0x0b1, 
"v_cmpx_lt_i16">; -defm V_CMPX_EQ_I16_t16 : VOPCX_Real_t16_gfx11<0x0b2, "v_cmpx_eq_i16">; -defm V_CMPX_LE_I16_t16 : VOPCX_Real_t16_gfx11<0x0b3, "v_cmpx_le_i16">; -defm V_CMPX_GT_I16_t16 : VOPCX_Real_t16_gfx11<0x0b4, "v_cmpx_gt_i16">; -defm V_CMPX_NE_I16_t16 : VOPCX_Real_t16_gfx11<0x0b5, "v_cmpx_ne_i16">; -defm V_CMPX_GE_I16_t16 : VOPCX_Real_t16_gfx11<0x0b6, "v_cmpx_ge_i16">; -defm V_CMPX_LT_U16_t16 : VOPCX_Real_t16_gfx11<0x0b9, "v_cmpx_lt_u16">; -defm V_CMPX_EQ_U16_t16 : VOPCX_Real_t16_gfx11<0x0ba, "v_cmpx_eq_u16">; -defm V_CMPX_LE_U16_t16 : VOPCX_Real_t16_gfx11<0x0bb, "v_cmpx_le_u16">; -defm V_CMPX_GT_U16_t16 : VOPCX_Real_t16_gfx11<0x0bc, "v_cmpx_gt_u16">; -defm V_CMPX_NE_U16_t16 : VOPCX_Real_t16_gfx11<0x0bd, "v_cmpx_ne_u16">; -defm V_CMPX_GE_U16_t16 : VOPCX_Real_t16_gfx11<0x0be, "v_cmpx_ge_u16">; +defm V_CMPX_LT_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b1, "v_cmpx_lt_i16">; +defm V_CMPX_EQ_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b2, "v_cmpx_eq_i16">; +defm V_CMPX_LE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b3, "v_cmpx_le_i16">; +defm V_CMPX_GT_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b4, "v_cmpx_gt_i16">; +defm V_CMPX_NE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b5, "v_cmpx_ne_i16">; +defm V_CMPX_GE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b6, "v_cmpx_ge_i16">; +defm V_CMPX_LT_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b9, "v_cmpx_lt_u16">; +defm V_CMPX_EQ_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0ba, "v_cmpx_eq_u16">; +defm V_CMPX_LE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bb, "v_cmpx_le_u16">; +defm V_CMPX_GT_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bc, "v_cmpx_gt_u16">; +defm V_CMPX_NE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bd, "v_cmpx_ne_u16">; +defm V_CMPX_GE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0be, "v_cmpx_ge_u16">; defm V_CMPX_F_I32 : VOPCX_Real_gfx11<0x0c0>; -defm V_CMPX_LT_I32 : VOPCX_Real_gfx11<0x0c1>; -defm V_CMPX_EQ_I32 : VOPCX_Real_gfx11<0x0c2>; -defm V_CMPX_LE_I32 : VOPCX_Real_gfx11<0x0c3>; -defm V_CMPX_GT_I32 : VOPCX_Real_gfx11<0x0c4>; -defm V_CMPX_NE_I32 : VOPCX_Real_gfx11<0x0c5>; -defm V_CMPX_GE_I32 : VOPCX_Real_gfx11<0x0c6>; +defm V_CMPX_LT_I32 : VOPCX_Real_gfx11_gfx12<0x0c1>; +defm V_CMPX_EQ_I32 : VOPCX_Real_gfx11_gfx12<0x0c2>; +defm V_CMPX_LE_I32 : VOPCX_Real_gfx11_gfx12<0x0c3>; +defm V_CMPX_GT_I32 : VOPCX_Real_gfx11_gfx12<0x0c4>; +defm V_CMPX_NE_I32 : VOPCX_Real_gfx11_gfx12<0x0c5>; +defm V_CMPX_GE_I32 : VOPCX_Real_gfx11_gfx12<0x0c6>; defm V_CMPX_T_I32 : VOPCX_Real_gfx11<0x0c7>; defm V_CMPX_F_U32 : VOPCX_Real_gfx11<0x0c8>; -defm V_CMPX_LT_U32 : VOPCX_Real_gfx11<0x0c9>; -defm V_CMPX_EQ_U32 : VOPCX_Real_gfx11<0x0ca>; -defm V_CMPX_LE_U32 : VOPCX_Real_gfx11<0x0cb>; -defm V_CMPX_GT_U32 : VOPCX_Real_gfx11<0x0cc>; -defm V_CMPX_NE_U32 : VOPCX_Real_gfx11<0x0cd>; -defm V_CMPX_GE_U32 : VOPCX_Real_gfx11<0x0ce>; +defm V_CMPX_LT_U32 : VOPCX_Real_gfx11_gfx12<0x0c9>; +defm V_CMPX_EQ_U32 : VOPCX_Real_gfx11_gfx12<0x0ca>; +defm V_CMPX_LE_U32 : VOPCX_Real_gfx11_gfx12<0x0cb>; +defm V_CMPX_GT_U32 : VOPCX_Real_gfx11_gfx12<0x0cc>; +defm V_CMPX_NE_U32 : VOPCX_Real_gfx11_gfx12<0x0cd>; +defm V_CMPX_GE_U32 : VOPCX_Real_gfx11_gfx12<0x0ce>; defm V_CMPX_T_U32 : VOPCX_Real_gfx11<0x0cf>; defm V_CMPX_F_I64 : VOPCX_Real_gfx11<0x0d0>; -defm V_CMPX_LT_I64 : VOPCX_Real_gfx11<0x0d1>; -defm V_CMPX_EQ_I64 : VOPCX_Real_gfx11<0x0d2>; -defm V_CMPX_LE_I64 : VOPCX_Real_gfx11<0x0d3>; -defm V_CMPX_GT_I64 : VOPCX_Real_gfx11<0x0d4>; -defm V_CMPX_NE_I64 : VOPCX_Real_gfx11<0x0d5>; -defm V_CMPX_GE_I64 : VOPCX_Real_gfx11<0x0d6>; +defm V_CMPX_LT_I64 : VOPCX_Real_gfx11_gfx12<0x0d1>; +defm V_CMPX_EQ_I64 : 
VOPCX_Real_gfx11_gfx12<0x0d2>; +defm V_CMPX_LE_I64 : VOPCX_Real_gfx11_gfx12<0x0d3>; +defm V_CMPX_GT_I64 : VOPCX_Real_gfx11_gfx12<0x0d4>; +defm V_CMPX_NE_I64 : VOPCX_Real_gfx11_gfx12<0x0d5>; +defm V_CMPX_GE_I64 : VOPCX_Real_gfx11_gfx12<0x0d6>; defm V_CMPX_T_I64 : VOPCX_Real_gfx11<0x0d7>; defm V_CMPX_F_U64 : VOPCX_Real_gfx11<0x0d8>; -defm V_CMPX_LT_U64 : VOPCX_Real_gfx11<0x0d9>; -defm V_CMPX_EQ_U64 : VOPCX_Real_gfx11<0x0da>; -defm V_CMPX_LE_U64 : VOPCX_Real_gfx11<0x0db>; -defm V_CMPX_GT_U64 : VOPCX_Real_gfx11<0x0dc>; -defm V_CMPX_NE_U64 : VOPCX_Real_gfx11<0x0dd>; -defm V_CMPX_GE_U64 : VOPCX_Real_gfx11<0x0de>; +defm V_CMPX_LT_U64 : VOPCX_Real_gfx11_gfx12<0x0d9>; +defm V_CMPX_EQ_U64 : VOPCX_Real_gfx11_gfx12<0x0da>; +defm V_CMPX_LE_U64 : VOPCX_Real_gfx11_gfx12<0x0db>; +defm V_CMPX_GT_U64 : VOPCX_Real_gfx11_gfx12<0x0dc>; +defm V_CMPX_NE_U64 : VOPCX_Real_gfx11_gfx12<0x0dd>; +defm V_CMPX_GE_U64 : VOPCX_Real_gfx11_gfx12<0x0de>; defm V_CMPX_T_U64 : VOPCX_Real_gfx11<0x0df>; -defm V_CMPX_CLASS_F16_t16 : VOPCX_Real_t16_gfx11<0x0fd, "v_cmpx_class_f16">; -defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx11<0x0fe>; -defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11<0x0ff>; +defm V_CMPX_CLASS_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0fd, "v_cmpx_class_f16">; +defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx11_gfx12<0x0fe>; +defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11_gfx12<0x0ff>; //===----------------------------------------------------------------------===// // GFX10. @@ -1968,10 +2016,13 @@ multiclass VOPCX_Real_gfx6_gfx7_gfx10 <bits<9> op> : VOPC_Real_gfx6_gfx7<op>, VOPCX_Real_gfx10<op>; multiclass VOPC_Real_gfx6_gfx7_gfx10_gfx11<bits<9> op> : - VOPC_Real_gfx6_gfx7_gfx10<op>, VOPC_Real_gfx11<op>; + VOPC_Real_gfx6_gfx7_gfx10<op>, VOPC_Real_Base<GFX11Gen, op>; multiclass VOPCX_Real_gfx6_gfx7_gfx10_gfx11<bits<9> op> : - VOPCX_Real_gfx6_gfx7_gfx10<op>, VOPCX_Real_gfx11<op>; + VOPCX_Real_gfx6_gfx7_gfx10<op>, VOPCX_Real<GFX11Gen, op>; + +multiclass VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<9> op> : + VOPC_Real_gfx6_gfx7_gfx10_gfx11<op>, VOPC_Real_Base<GFX12Gen, op>; defm V_CMP_F_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x000>; defm V_CMP_LT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x001>; @@ -2006,20 +2057,20 @@ defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01d>; defm V_CMPX_NLT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01e>; defm V_CMPX_TRU_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01f>; defm V_CMP_F_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x020>; -defm V_CMP_LT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x021>; -defm V_CMP_EQ_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x022>; -defm V_CMP_LE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x023>; -defm V_CMP_GT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x024>; -defm V_CMP_LG_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x025>; -defm V_CMP_GE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x026>; -defm V_CMP_O_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x027>; -defm V_CMP_U_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x028>; -defm V_CMP_NGE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x029>; -defm V_CMP_NLG_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02a>; -defm V_CMP_NGT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02b>; -defm V_CMP_NLE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02c>; -defm V_CMP_NEQ_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02d>; -defm V_CMP_NLT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02e>; +defm V_CMP_LT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x021>; +defm V_CMP_EQ_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x022>; +defm V_CMP_LE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x023>; +defm V_CMP_GT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x024>; +defm 
V_CMP_LG_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x025>; +defm V_CMP_GE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x026>; +defm V_CMP_O_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x027>; +defm V_CMP_U_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x028>; +defm V_CMP_NGE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x029>; +defm V_CMP_NLG_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02a>; +defm V_CMP_NGT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02b>; +defm V_CMP_NLE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02c>; +defm V_CMP_NEQ_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02d>; +defm V_CMP_NLT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02e>; defm V_CMP_TRU_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02f>; defm V_CMPX_F_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x030>; defm V_CMPX_LT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x031>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPDInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPDInstructions.td index eb2e9f04022e..c6af3d67c560 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPDInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPDInstructions.td @@ -54,23 +54,34 @@ class VOPD_MADKe<bits<4> opX, bits<5> opY> : Enc96 { // VOPD classes //===----------------------------------------------------------------------===// + +class GFXGenD<GFXGen Gen, list<string> DXPseudos, list<string> DYPseudos, + Predicate subtargetPred = Gen.AssemblerPredicate> : + GFXGen<Gen.AssemblerPredicate, Gen.DecoderNamespace, Gen.Suffix, + Gen.Subtarget> { + list<string> VOPDXPseudos = DXPseudos; + list<string> VOPDYPseudos = DYPseudos; + Predicate SubtargetPredicate = subtargetPred; +} + class VOPD_Base<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY, - VOPD_Component XasVC, VOPD_Component YasVC> + VOPD_Component XasVC, VOPD_Component YasVC, GFXGenD Gen> : VOPAnyCommon<outs, ins, asm, []>, VOP<NAME>, - SIMCInstr<NAME, SIEncodingFamily.GFX11> { + SIMCInstr<NAME, Gen.Subtarget> { // Fields for table indexing Instruction Opcode = !cast<Instruction>(NAME); bits<5> OpX = XasVC.VOPDOp; bits<5> OpY = YasVC.VOPDOp; + bits<4> SubTgt = Gen.Subtarget; let VALU = 1; - let DecoderNamespace = "GFX11"; - let AssemblerPredicate = isGFX11Plus; + let DecoderNamespace = Gen.DecoderNamespace; + let AssemblerPredicate = Gen.AssemblerPredicate; let WaveSizePredicate = isWave32; let isCodeGenOnly = 0; - let SubtargetPredicate = isGFX11Plus; + let SubtargetPredicate = Gen.SubtargetPredicate; let AsmMatchConverter = "cvtVOPD"; let Size = 8; let ReadsModeReg = !or(VDX.ReadsModeReg, VDY.ReadsModeReg); @@ -97,77 +108,103 @@ class VOPD_Base<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY, } class VOPD<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY, - VOPD_Component XasVC, VOPD_Component YasVC> - : VOPD_Base<outs, ins, asm, VDX, VDY, XasVC, YasVC>, + VOPD_Component XasVC, VOPD_Component YasVC, GFXGenD Gen> + : VOPD_Base<outs, ins, asm, VDX, VDY, XasVC, YasVC, Gen>, VOPDe<XasVC.VOPDOp{3-0}, YasVC.VOPDOp> { let Inst{16-9} = !if (!eq(VDX.Mnemonic, "v_mov_b32"), 0x0, vsrc1X); let Inst{48-41} = !if (!eq(VDY.Mnemonic, "v_mov_b32"), 0x0, vsrc1Y); } class VOPD_MADK<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY, - VOPD_Component XasVC, VOPD_Component YasVC> - : VOPD_Base<outs, ins, asm, VDX, VDY, XasVC, YasVC>, + VOPD_Component XasVC, VOPD_Component YasVC, GFXGenD Gen> + : VOPD_Base<outs, ins, asm, VDX, VDY, XasVC, YasVC, Gen>, VOPD_MADKe<XasVC.VOPDOp{3-0}, YasVC.VOPDOp> { let Inst{16-9} = !if 
(!eq(VDX.Mnemonic, "v_mov_b32"), 0x0, vsrc1X); let Inst{48-41} = !if (!eq(VDY.Mnemonic, "v_mov_b32"), 0x0, vsrc1Y); let Size = 12; + let FixedSize = 1; } // V_DUAL_DOT2ACC_F32_BF16 is a legal instruction, but V_DOT2ACC_F32_BF16 is -// not. Since we generate the DUAL form by converting from the normal form we -// will never generate it. -defvar VOPDYPseudos = [ +// not. V_DUAL_DOT2C_F32_BF16 is a legal instruction on GFX12, but +// V_DOT2C_F32_F16_e32 is not. Since we generate the DUAL form by converting +// from the normal form we will never generate them. +defvar VOPDPseudosCommon = [ "V_FMAC_F32_e32", "V_FMAAK_F32", "V_FMAMK_F32", "V_MUL_F32_e32", "V_ADD_F32_e32", "V_SUB_F32_e32", "V_SUBREV_F32_e32", "V_MUL_LEGACY_F32_e32", - "V_MOV_B32_e32", "V_CNDMASK_B32_e32", "V_MAX_F32_e32", "V_MIN_F32_e32", - "V_DOT2C_F32_F16_e32", "V_ADD_U32_e32", "V_LSHLREV_B32_e32", "V_AND_B32_e32" + "V_MOV_B32_e32", "V_CNDMASK_B32_e32", "V_MAX_F32_e32", "V_MIN_F32_e32" ]; -defvar VOPDXPseudos = VOPDYPseudos[0...VOPDX_Max_Index]; +defvar VOPDPseudosGFX11 = ["V_DOT2C_F32_F16_e32"]; +defvar VOPDYOnlyPseudosCommon = ["V_ADD_U32_e32", "V_LSHLREV_B32_e32", + "V_AND_B32_e32"]; + +defvar VOPDXPseudosGFX11 = !listconcat(VOPDPseudosCommon, VOPDPseudosGFX11); +defvar VOPDXPseudosGFX12 = VOPDPseudosCommon; +defvar VOPDYPseudosGFX11 = !listconcat(VOPDXPseudosGFX11, VOPDYOnlyPseudosCommon); +defvar VOPDYPseudosGFX12 = !listconcat(VOPDXPseudosGFX12, VOPDYOnlyPseudosCommon); + +def GFX11GenD : GFXGenD<GFX11Gen, VOPDXPseudosGFX11, VOPDYPseudosGFX11>; +def GFX12GenD : GFXGenD<GFX12Gen, VOPDXPseudosGFX12, VOPDYPseudosGFX12>; + def VOPDDstYOperand : RegisterOperand<VGPR_32, "printRegularOperand"> { let DecoderMethod = "decodeOperandVOPDDstY"; } -foreach x = VOPDXPseudos in { - foreach y = VOPDYPseudos in { - defvar xInst = !cast<VOP_Pseudo>(x); - defvar yInst = !cast<VOP_Pseudo>(y); - defvar XasVC = !cast<VOPD_Component>(x); - defvar YasVC = !cast<VOPD_Component>(y); - defvar isMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32"), - !eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32")); - // If X or Y is MADK (have a mandatory immediate), all src operands which - // may contain an optional literal must use the VSrc_*_Deferred operand - // type. Optional literal operands in MADK VOPD components always use this - // operand form. 
If Both X and Y are MADK, the mandatory literal of X - // additionally must use an alternate operand format which defers to the - // 'real' Y literal - defvar isOpXMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32")); - defvar isOpYMADK = !or(!eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32")); - defvar OpName = "V_DUAL_" # !substr(x,2) # "_X_" # !substr(y,2); - defvar outs = (outs VGPRSrc_32:$vdstX, VOPDDstYOperand:$vdstY); - if !or(isOpXMADK, isOpYMADK) then { - if !and(isOpXMADK, isOpYMADK) then { - defvar X_MADK_Pfl = !cast<VOP_MADK_Base>(xInst.Pfl); - defvar ins = !con(xInst.Pfl.InsVOPDXDeferred, yInst.Pfl.InsVOPDY); - defvar asm = XasVC.VOPDName #" "# X_MADK_Pfl.AsmVOPDXDeferred #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY; - def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC>; - } else { - defvar asm = XasVC.VOPDName #" "# xInst.Pfl.AsmVOPDX #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY; - if isOpXMADK then { - assert !not(isOpYMADK), "Expected only OpX as MADK"; - defvar ins = !con(xInst.Pfl.InsVOPDX, yInst.Pfl.InsVOPDYDeferred); - def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC>; - } else { - assert !not(isOpXMADK), "Expected only OpY as MADK"; +class getRenamed<string VOPDName, GFXGen Gen> { + string ret = !if(!eq(Gen.Subtarget, GFX12Gen.Subtarget), + !if(!eq(VOPDName, "v_dual_max_f32"), + "v_dual_max_num_f32", + !if(!eq(VOPDName, "v_dual_min_f32"), + "v_dual_min_num_f32", + VOPDName)), + VOPDName); +} + +foreach Gen = [GFX11GenD, GFX12GenD] in { + foreach x = Gen.VOPDXPseudos in { + foreach y = Gen.VOPDYPseudos in { + defvar xInst = !cast<VOP_Pseudo>(x); + defvar yInst = !cast<VOP_Pseudo>(y); + defvar XasVC = !cast<VOPD_Component>(x); + defvar YasVC = !cast<VOPD_Component>(y); + defvar xAsmName = getRenamed<XasVC.VOPDName, Gen>.ret; + defvar yAsmName = getRenamed<YasVC.VOPDName, Gen>.ret; + defvar isMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32"), + !eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32")); + // If X or Y is MADK (have a mandatory immediate), all src operands which + // may contain an optional literal must use the VSrc_*_Deferred operand + // type. Optional literal operands in MADK VOPD components always use this + // operand form. 
If Both X and Y are MADK, the mandatory literal of X + // additionally must use an alternate operand format which defers to the + // 'real' Y literal + defvar isOpXMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32")); + defvar isOpYMADK = !or(!eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32")); + defvar OpName = "V_DUAL_" # !substr(x,2) # "_X_" # !substr(y,2) # Gen.Suffix; + defvar outs = (outs VGPRSrc_32:$vdstX, VOPDDstYOperand:$vdstY); + if !or(isOpXMADK, isOpYMADK) then { + if !and(isOpXMADK, isOpYMADK) then { + defvar X_MADK_Pfl = !cast<VOP_MADK_Base>(xInst.Pfl); defvar ins = !con(xInst.Pfl.InsVOPDXDeferred, yInst.Pfl.InsVOPDY); - def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC>; + defvar asm = xAsmName #" "# X_MADK_Pfl.AsmVOPDXDeferred #" :: "# yAsmName #" "# yInst.Pfl.AsmVOPDY; + def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC, Gen>; + } else { + defvar asm = xAsmName #" "# xInst.Pfl.AsmVOPDX #" :: "# yAsmName #" "# yInst.Pfl.AsmVOPDY; + if isOpXMADK then { + assert !not(isOpYMADK), "Expected only OpX as MADK"; + defvar ins = !con(xInst.Pfl.InsVOPDX, yInst.Pfl.InsVOPDYDeferred); + def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC, Gen>; + } else { + assert !not(isOpXMADK), "Expected only OpY as MADK"; + defvar ins = !con(xInst.Pfl.InsVOPDXDeferred, yInst.Pfl.InsVOPDY); + def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC, Gen>; + } } + } else { + defvar ins = !con(xInst.Pfl.InsVOPDX, yInst.Pfl.InsVOPDY); + defvar asm = xAsmName #" "# xInst.Pfl.AsmVOPDX #" :: "# yAsmName #" "# yInst.Pfl.AsmVOPDY; + def OpName : VOPD<outs, ins, asm, xInst, yInst, XasVC, YasVC, Gen>; } - } else { - defvar ins = !con(xInst.Pfl.InsVOPDX, yInst.Pfl.InsVOPDY); - defvar asm = XasVC.VOPDName #" "# xInst.Pfl.AsmVOPDX #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY; - def OpName : VOPD<outs, ins, asm, xInst, yInst, XasVC, YasVC>; } } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td index 3755daf4f9b1..fd4626d902ac 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -29,6 +29,22 @@ class LetDummies { string DecoderNamespace; } +//===----------------------------------------------------------------------===// +// VOP Subtarget info +//===----------------------------------------------------------------------===// + +class GFXGen<Predicate pred, string dn, string suffix, int sub> { + Predicate AssemblerPredicate = pred; + string DecoderNamespace = dn; + string Suffix = suffix; + int Subtarget = sub; +} + +def GFX12Gen : GFXGen<isGFX12Only, "GFX12", "_gfx12", SIEncodingFamily.GFX12>; +def GFX11Gen : GFXGen<isGFX11Only, "GFX11", "_gfx11", SIEncodingFamily.GFX11>; + +//===----------------------------------------------------------------------===// + class VOP <string opName> { string OpName = opName; } @@ -190,6 +206,14 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni VOPProfile Pfl = ps.Pfl; } +class VOP3_Real_Gen <VOP_Pseudo ps, GFXGen Gen, string asm_name = ps.Mnemonic> : + VOP3_Real <ps, Gen.Subtarget, asm_name> { + let AssemblerPredicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, + Gen.AssemblerPredicate); + let DecoderNamespace = Gen.DecoderNamespace# + !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); +} + // XXX - Is there any reason to distinguish this from regular VOP3 // here? 
class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> : @@ -199,6 +223,12 @@ class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni let Constraints = !if(!eq(!substr(ps.Mnemonic,0,6), "v_wmma"), "", ps.Constraints); } +class VOP3P_Real_Gen<VOP_Pseudo ps, GFXGen Gen, string asm_name = ps.Mnemonic> : + VOP3P_Real<ps, Gen.Subtarget, asm_name> { + let AssemblerPredicate = Gen.AssemblerPredicate; + let DecoderNamespace = Gen.DecoderNamespace; +} + class VOP3a<VOPProfile P> : Enc64 { bits<4> src0_modifiers; bits<9> src0; @@ -234,7 +264,7 @@ class VOP3a_gfx10<bits<10> op, VOPProfile p> : VOP3a<p> { let Inst{31-26} = 0x35; } -class VOP3a_gfx11<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p>; +class VOP3a_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p>; class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> { let Inst{25-16} = op; @@ -251,7 +281,7 @@ class VOP3e_gfx10<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p> { let Inst{7-0} = !if(p.EmitDst, vdst{7-0}, 0); } -class VOP3e_gfx11<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p>; +class VOP3e_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p>; class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> { bits<8> vdst; @@ -272,9 +302,9 @@ class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0); } -class VOP3OpSel_gfx11<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>; +class VOP3OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>; -class VOP3DotOpSel_gfx11<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11<op, p>{ +class VOP3DotOpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11_gfx12<op, p>{ let Inst{11} = ?; let Inst{12} = ?; } @@ -435,7 +465,7 @@ class VOP3Pe_gfx10 <bits<7> op, VOPProfile P> : VOP3Pe<op, P> { let Inst{31-23} = 0x198; //encoding } -class VOP3Pe_gfx11<bits<7> op, VOPProfile P> : VOP3Pe_gfx10<op, P>; +class VOP3Pe_gfx11_gfx12<bits<7> op, VOPProfile P> : VOP3Pe_gfx10<op, P>; class VOP3be_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3be<p> { let Inst{25-17} = op; @@ -448,7 +478,7 @@ class VOP3be_gfx10<bits<10> op, VOPProfile p> : VOP3be<p> { let Inst{31-26} = 0x35; } -class VOP3be_gfx11<bits<10> op, VOPProfile p> : VOP3be_gfx10<op, p>; +class VOP3be_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3be_gfx10<op, p>; class VOP3be_vi <bits<10> op, VOPProfile P> : VOP3be<P> { bits<1> clamp; @@ -791,8 +821,8 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], string AsmOperands = asmOps; let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", ""); - let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP); - let AssemblerPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP); + let SubtargetPredicate = !if(P.HasExt64BitDPP, HasDPALU_DPP, HasDPP); + let AssemblerPredicate = !if(P.HasExt64BitDPP, HasDPALU_DPP, HasDPP); let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); @@ -862,8 +892,8 @@ class VOP_DPP_Base <string OpName, VOPProfile P, let Size = 8; let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", ""); - let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP); - let AssemblerPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP); + let SubtargetPredicate = !if(P.HasExt64BitDPP, HasDPALU_DPP, HasDPP); + let AssemblerPredicate = !if(P.HasExt64BitDPP, HasDPALU_DPP, HasDPP); let AsmVariantName = !if(P.HasExtDPP, 
AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); @@ -1273,6 +1303,19 @@ multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_f } // end SubtargetPredicate = isGFX11Plus } +class UniformUnaryFragOrOp<SDPatternOperator Op> { + SDPatternOperator ret = !if(!or(!isa<SDNode>(Op), !isa<PatFrags>(Op)), + UniformUnaryFrag<Op>, Op); +} + +multiclass VOP3PseudoScalarInst<string OpName, VOPProfile P, + SDPatternOperator node = null_frag> { + def _e64 : VOP3_Pseudo<OpName, P, [(set P.DstVT:$vdst, + (UniformUnaryFragOrOp<node>.ret + (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, + i32:$omod))))]>; +} + //===----------------------------------------------------------------------===// // VOP3 DPP //===----------------------------------------------------------------------===// @@ -1294,6 +1337,15 @@ class VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, int subtarget, string opName = ps.OpName> : Base_VOP3_DPP16<op, ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>; +class VOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo ps, GFXGen Gen, + string opName = ps.OpName> : + VOP3_DPP16 <op, ps, Gen.Subtarget, opName> { + let AssemblerPredicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, + Gen.AssemblerPredicate); + let DecoderNamespace = "DPP"#Gen.DecoderNamespace# + !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); +} + class Base_VOP3_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> : VOP3_DPP8<op, opName, ps.Pfl> { let VOP3_OPSEL = ps.Pfl.HasOpSel; @@ -1320,164 +1372,240 @@ class VOP3b_DPP8_Base<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> } //===----------------------------------------------------------------------===// -// VOP3 GFX11 +// VOP3 GFX11, GFX12 //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX11Only, - DecoderNamespace = "GFX11" in { - multiclass VOP3_Real_Base_gfx11<bits<10> op, string opName = NAME, - bit isSingle = 0> { - defvar ps = !cast<VOP_Pseudo>(opName#"_e64"); - let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { - if ps.Pfl.HasOpSel then - def _e64_gfx11 : - VOP3_Real<ps, SIEncodingFamily.GFX11>, - VOP3OpSel_gfx11<op, ps.Pfl>; - if !not(ps.Pfl.HasOpSel) then - def _e64_gfx11 : - VOP3_Real<ps, SIEncodingFamily.GFX11>, - VOP3e_gfx11<op, ps.Pfl>; - } - } - multiclass VOP3Dot_Real_Base_gfx11<bits<10> op, string opName = NAME, - bit isSingle = 0> { - defvar ps = !cast<VOP_Pseudo>(opName#"_e64"); - let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { - def _e64_gfx11 : - VOP3_Real<ps, SIEncodingFamily.GFX11>, - VOP3DotOpSel_gfx11<op, ps.Pfl>; - } - } - multiclass VOP3_Real_with_name_gfx11<bits<10> op, string opName, - string asmName, bit isSingle = 0> { - defvar ps = !cast<VOP_Pseudo>(opName#"_e64"); - let AsmString = asmName # ps.AsmOperands, - IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { - if ps.Pfl.HasOpSel then - def _e64_gfx11 : - VOP3_Real<ps, SIEncodingFamily.GFX11>, - VOP3OpSel_gfx11<op, ps.Pfl>; - if !not(ps.Pfl.HasOpSel) then - def _e64_gfx11 : - VOP3_Real<ps, SIEncodingFamily.GFX11>, - VOP3e_gfx11<op, ps.Pfl>; - } - def _gfx11_VOP3_alias : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>, LetDummies; - } - // for READLANE/WRITELANE - multiclass VOP3_Real_No_Suffix_gfx11<bits<10> op, string opName = NAME> { - defvar ps = !cast<VOP_Pseudo>(opName); - def _e64_gfx11 : - VOP3_Real<ps, SIEncodingFamily.GFX11>, - VOP3e_gfx11<op, ps.Pfl>; - } - multiclass VOP3_Real_dpp_Base_gfx11<bits<10> op, 
string opName = NAME> { - def _e64_dpp_gfx11 : VOP3_DPP16<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), SIEncodingFamily.GFX11> { - let DecoderNamespace = "DPPGFX11"; - } +multiclass VOP3_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME, + bit isSingle = 0> { + defvar ps = !cast<VOP_Pseudo>(opName#"_e64"); + let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { + if ps.Pfl.HasOpSel then + def _e64#Gen.Suffix : + VOP3_Real_Gen<ps, Gen>, + VOP3OpSel_gfx11_gfx12<op, ps.Pfl>; + if !not(ps.Pfl.HasOpSel) then + def _e64#Gen.Suffix : + VOP3_Real_Gen<ps, Gen>, + VOP3e_gfx11_gfx12<op, ps.Pfl>; } +} - multiclass VOP3Dot_Real_dpp_Base_gfx11<bits<10> op, string opName = NAME> { - def _e64_dpp_gfx11 : VOP3_DPP16<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), SIEncodingFamily.GFX11> { - let Inst{11} = ?; - let Inst{12} = ?; - let DecoderNamespace = "DPPGFX11"; - } +multiclass VOP3Dot_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME, + bit isSingle = 0> { + defvar ps = !cast<VOP_Pseudo>(opName#"_e64"); + let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { + def _e64#Gen.Suffix : + VOP3_Real_Gen<ps, Gen>, + VOP3DotOpSel_gfx11_gfx12<op, ps.Pfl>; } +} - multiclass VOP3_Real_dpp_with_name_gfx11<bits<10> op, string opName, - string asmName> { - defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); - let AsmString = asmName # ps.Pfl.AsmVOP3DPP16, DecoderNamespace = "DPPGFX11" in { - defm NAME : VOP3_Real_dpp_Base_gfx11<op, opName>; - } - } - multiclass VOP3_Real_dpp8_Base_gfx11<bits<10> op, string opName = NAME> { - defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); - def _e64_dpp8_gfx11 : Base_VOP3_DPP8<op, ps> { - let DecoderNamespace = "DPP8GFX11"; - } +multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName, + string asmName, bit isSingle = 0> { + defvar ps = !cast<VOP_Pseudo>(opName#"_e64"); + let AsmString = asmName # ps.AsmOperands, + IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { + if ps.Pfl.HasOpSel then + def _e64#Gen.Suffix : + VOP3_Real_Gen<ps, Gen>, + VOP3OpSel_gfx11_gfx12<op, ps.Pfl>; + if !not(ps.Pfl.HasOpSel) then + def _e64#Gen.Suffix : + VOP3_Real_Gen<ps, Gen>, + VOP3e_gfx11_gfx12<op, ps.Pfl>; } + def Gen.Suffix#"_VOP3_alias" : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[Gen.AssemblerPredicate]>, LetDummies; +} + +// for READLANE/WRITELANE +multiclass VOP3_Real_No_Suffix<GFXGen Gen, bits<10> op, string opName = NAME> { + defvar ps = !cast<VOP_Pseudo>(opName); + def _e64#Gen.Suffix : + VOP3_Real_Gen<ps, Gen>, + VOP3e_gfx11_gfx12<op, ps.Pfl>; +} + +multiclass VOP3_Real_dpp_Base<GFXGen Gen, bits<10> op, string opName = NAME> { + def _e64_dpp#Gen.Suffix : + VOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), Gen>; +} - multiclass VOP3Dot_Real_dpp8_Base_gfx11<bits<10> op, string opName = NAME> { - defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); - def _e64_dpp8_gfx11 : Base_VOP3_DPP8<op, ps> { +multiclass VOP3Dot_Real_dpp_Base<GFXGen Gen, bits<10> op, string opName = NAME> { + def _e64_dpp#Gen.Suffix : + VOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), Gen> { let Inst{11} = ?; let Inst{12} = ?; - let DecoderNamespace = "DPP8GFX11"; } +} + +multiclass VOP3_Real_dpp_with_name<GFXGen Gen, bits<10> op, string opName, + string asmName> { + defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); + let AsmString = asmName # ps.Pfl.AsmVOP3DPP16 in { + defm NAME : VOP3_Real_dpp_Base<Gen, op, opName>; } +} - multiclass VOP3_Real_dpp8_with_name_gfx11<bits<10> op, string opName, - string asmName> { - defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); - let AsmString = asmName 
# ps.Pfl.AsmVOP3DPP8, DecoderNamespace = "DPP8GFX11" in { - defm NAME : VOP3_Real_dpp8_Base_gfx11<op, opName>; - } +multiclass VOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> { + defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); + def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> { + let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let AssemblerPredicate = Gen.AssemblerPredicate; } - multiclass VOP3be_Real_gfx11<bits<10> op, string opName, string asmName, - bit isSingle = 0> { - defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); - let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in - def _e64_gfx11 : - VOP3_Real<ps, SIEncodingFamily.GFX11, asmName>, - VOP3be_gfx11<op, ps.Pfl> ; +} + +multiclass VOP3Dot_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> { + defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); + def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> { + let Inst{11} = ?; + let Inst{12} = ?; + let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let AssemblerPredicate = Gen.AssemblerPredicate; } - multiclass VOP3be_Real_dpp_gfx11<bits<10> op, string opName, string asmName> { - defvar ps = !cast<VOP3_Pseudo>(opName #"_e64"); - defvar dpp_ps = !cast<VOP_DPP_Pseudo>(opName #"_e64" #"_dpp"); - def _e64_dpp_gfx11 : Base_VOP3b_DPP16<op, dpp_ps, asmName>, - SIMCInstr<dpp_ps.PseudoInstr, SIEncodingFamily.GFX11> { - let DecoderNamespace = "DPPGFX11"; - } +} + +multiclass VOP3_Real_dpp8_with_name<GFXGen Gen, bits<10> op, string opName, + string asmName> { + defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); + let AsmString = asmName # ps.Pfl.AsmVOP3DPP8, + DecoderNamespace = "DPP8"#Gen.DecoderNamespace# + !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"), + AssemblerPredicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, + Gen.AssemblerPredicate) in { + + defm NAME : VOP3_Real_dpp8_Base<Gen, op, opName>; } - multiclass VOP3be_Real_dpp8_gfx11<bits<10> op, string opName, string asmName> { - defvar ps = !cast<VOP3_Pseudo>(opName #"_e64"); - def _e64_dpp8_gfx11 : VOP3b_DPP8_Base<op, ps, asmName> { - let DecoderNamespace = "DPP8GFX11"; - } +} + +multiclass VOP3be_Real<GFXGen Gen, bits<10> op, string opName, string asmName, + bit isSingle = 0> { + defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); + let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in + def _e64#Gen.Suffix : + VOP3_Real_Gen<ps, Gen, asmName>, + VOP3be_gfx11_gfx12<op, ps.Pfl> ; +} + +multiclass VOP3be_Real_dpp<GFXGen Gen, bits<10> op, string opName, + string asmName> { + defvar ps = !cast<VOP3_Pseudo>(opName #"_e64"); + defvar dpp_ps = !cast<VOP_DPP_Pseudo>(opName #"_e64" #"_dpp"); + def _e64_dpp#Gen.Suffix : Base_VOP3b_DPP16<op, dpp_ps, asmName>, + SIMCInstr<dpp_ps.PseudoInstr, Gen.Subtarget> { + let DecoderNamespace = "DPP"#Gen.DecoderNamespace; + let AssemblerPredicate = Gen.AssemblerPredicate; + } +} + +multiclass VOP3be_Real_dpp8<GFXGen Gen, bits<10> op, string opName, + string asmName> { + defvar ps = !cast<VOP3_Pseudo>(opName #"_e64"); + def _e64_dpp8#Gen.Suffix : VOP3b_DPP8_Base<op, ps, asmName> { + let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let AssemblerPredicate = Gen.AssemblerPredicate; } -} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" +} // VOP1 and VOP2 depend on these triple defs -multiclass VOP3_Realtriple_gfx11<bits<10> op, - bit isSingle = 0, string opName = NAME> : - VOP3_Real_Base_gfx11<op, opName, isSingle>, - VOP3_Real_dpp_Base_gfx11<op, opName>, - VOP3_Real_dpp8_Base_gfx11<op, opName>; - -multiclass VOP3Dot_Realtriple_gfx11<bits<10> op, - bit isSingle = 0, string opName = 
NAME> : - VOP3Dot_Real_Base_gfx11<op, opName, isSingle>, - VOP3Dot_Real_dpp_Base_gfx11<op, opName>, - VOP3Dot_Real_dpp8_Base_gfx11<op, opName>; - -multiclass VOP3Only_Realtriple_gfx11<bits<10> op> : - VOP3_Realtriple_gfx11<op, 1>; - -multiclass VOP3_Realtriple_with_name_gfx11<bits<10> op, string opName, - string asmName, bit isSingle = 0> : - VOP3_Real_with_name_gfx11<op, opName, asmName, isSingle>, - VOP3_Real_dpp_with_name_gfx11<op, opName, asmName>, - VOP3_Real_dpp8_with_name_gfx11<op, opName, asmName>; +multiclass VOP3_Realtriple<GFXGen Gen, bits<10> op, bit isSingle = 0, + string opName = NAME> : + VOP3_Real_Base<Gen, op, opName, isSingle>, + VOP3_Real_dpp_Base<Gen, op, opName>, + VOP3_Real_dpp8_Base<Gen, op, opName>; + +multiclass VOP3Dot_Realtriple<GFXGen Gen, bits<10> op, bit isSingle = 0, + string opName = NAME> : + VOP3Dot_Real_Base<Gen, op, opName, isSingle>, + VOP3Dot_Real_dpp_Base<Gen, op, opName>, + VOP3Dot_Real_dpp8_Base<Gen, op, opName>; + +multiclass VOP3Only_Realtriple<GFXGen Gen, bits<10> op> : + VOP3_Realtriple<Gen, op, 1>; + +multiclass VOP3_Realtriple_with_name<GFXGen Gen, bits<10> op, string opName, + string asmName, bit isSingle = 0> : + VOP3_Real_with_name<Gen, op, opName, asmName, isSingle>, + VOP3_Real_dpp_with_name<Gen, op, opName, asmName>, + VOP3_Real_dpp8_with_name<Gen, op, opName, asmName>; + +multiclass VOP3Only_Realtriple_with_name<GFXGen Gen, bits<10> op, string opName, + string asmName> : + VOP3_Realtriple_with_name<Gen, op, opName, asmName, 1>; + +multiclass VOP3Only_Realtriple_t16<GFXGen Gen, bits<10> op, string asmName, + string opName = NAME> + : VOP3Only_Realtriple_with_name<Gen, op, opName, asmName>; + +multiclass VOP3be_Realtriple< + GFXGen Gen, bits<10> op, bit isSingle = 0, string opName = NAME, + string asmName = !cast<VOP_Pseudo>(opName#"_e64").Mnemonic> : + VOP3be_Real<Gen, op, opName, asmName, isSingle>, + VOP3be_Real_dpp<Gen, op, opName, asmName>, + VOP3be_Real_dpp8<Gen, op, opName, asmName>; -multiclass VOP3Only_Realtriple_with_name_gfx11<bits<10> op, string opName, - string asmName> : - VOP3_Realtriple_with_name_gfx11<op, opName, asmName, 1>; +multiclass VOP3beOnly_Realtriple<GFXGen Gen, bits<10> op> : + VOP3be_Realtriple<Gen, op, 1>; + +//===----------------------------------------------------------------------===// +// VOP3 GFX11 +//===----------------------------------------------------------------------===// + +multiclass VOP3be_Real_gfx11<bits<10> op, string opName, string asmName, + bit isSingle = 0> : + VOP3be_Real<GFX11Gen, op, opName, asmName, isSingle>; + +multiclass VOP3_Real_Base_gfx11<bits<10> op, string opName = NAME, + bit isSingle = 0> : + VOP3_Real_Base<GFX11Gen, op, opName, isSingle>; + +multiclass VOP3_Realtriple_gfx11<bits<10> op, bit isSingle = 0, + string opName = NAME> : + VOP3_Realtriple<GFX11Gen, op, isSingle, opName>; multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName, string opName = NAME> - : VOP3Only_Realtriple_with_name_gfx11<op, opName, asmName>; + : VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>; -multiclass VOP3be_Realtriple_gfx11< - bits<10> op, bit isSingle = 0, string opName = NAME, - string asmName = !cast<VOP_Pseudo>(opName#"_e64").Mnemonic> : - VOP3be_Real_gfx11<op, opName, asmName, isSingle>, - VOP3be_Real_dpp_gfx11<op, opName, asmName>, - VOP3be_Real_dpp8_gfx11<op, opName, asmName>; +//===----------------------------------------------------------------------===// +// VOP3 GFX12 +//===----------------------------------------------------------------------===// + 
+multiclass VOP3Only_Realtriple_gfx12<bits<10> op, bit isSingle = 0> :
+ VOP3_Realtriple<GFX12Gen, op, isSingle>;
+
+// IsSingle is captured from the vopprofile for these instructions, but the
+// following alternative is more explicit
+multiclass VOP3Only_Real_Base_gfx12<bits<10> op> :
+ VOP3_Real_Base<GFX12Gen, op, NAME, 1/*IsSingle*/>;
-multiclass VOP3beOnly_Realtriple_gfx11<bits<10> op> :
- VOP3be_Realtriple_gfx11<op, 1>;
+multiclass VOP3Only_Realtriple_t16_gfx12<bits<10> op> :
+ VOP3Only_Realtriple<GFX12Gen, op>;
+
+multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,
+ string asmName, bit isSingle = 0> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ let AsmString = asmName # ps.AsmOperands,
+ IsSingle = !or(isSingle, ps.Pfl.IsSingle) in
+ def _e64_gfx12 :
+ VOP3_Real_Gen<ps, GFX12Gen, asmName>,
+ VOP3be_gfx11_gfx12<op, ps.Pfl>,
+ MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX12Only]>;
+}
+
+multiclass VOP3_Realtriple_with_name_gfx12<bits<10> op, string opName,
+ string asmName, bit isSingle = 0> :
+ VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, isSingle>;
+
+multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
+ string asmName> :
+ VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
+ VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
+
+multiclass VOP3Only_Realtriple_with_name_t16_gfx12<bits<10> op, string asmName,
+ string opName = NAME>
+ : VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
+
+//===----------------------------------------------------------------------===//
 include "VOPCInstructions.td"
 include "VOP1Instructions.td"
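
Editor's note (not part of the diff): the hunks above replace the GFX11-only "*_gfx11" Real-instruction multiclasses with variants parameterized by a GFXGen record (GFX11Gen, GFX12Gen), so a single definition can emit the _e64, DPP16, and DPP8 encodings for either encoding family, while thin *_gfx11 / *_gfx12 wrappers keep the old names usable. The following is a minimal illustrative sketch, in TableGen, of how such a parameterized multiclass might be instantiated; the mnemonic V_EXAMPLE_F32 and opcode 0x123 are placeholders, and it assumes that matching V_EXAMPLE_F32_e64 and V_EXAMPLE_F32_e64_dpp pseudos already exist.

// Illustrative sketch only: placeholder mnemonic and opcode, not taken from
// the commit. VOP3_Realtriple<Gen, op> expands to the _e64, _e64_dpp and
// _e64_dpp8 real instructions for one generation, pulling Gen.Suffix
// ("_gfx11" / "_gfx12"), Gen.Subtarget, Gen.DecoderNamespace and
// Gen.AssemblerPredicate from the GFXGen record.
defm V_EXAMPLE_F32 :
  VOP3_Realtriple<GFX11Gen, 0x123>,   // defines *_e64_gfx11, *_e64_dpp_gfx11, *_e64_dpp8_gfx11
  VOP3_Realtriple<GFX12Gen, 0x123>;   // same pseudos reused for the GFX12 encoding family

The design choice visible in the diff is that the generation-specific suffix, subtarget, decoder namespace, and assembler predicate all come from the GFXGen record rather than being hard-coded per multiclass, which is why the GFX12 support largely reduces to defining GFX12Gen and a handful of forwarding wrappers.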