Diffstat (limited to 'llvm/lib/Target/AMDGPU')
174 files changed, 21730 insertions, 7370 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 11cc1a01d248..c4680cbedadf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -91,10 +91,6 @@ ModulePass *createAMDGPULowerIntrinsicsPass();
 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
 extern char &AMDGPULowerIntrinsicsID;
 
-ModulePass *createAMDGPUFixFunctionBitcastsPass();
-void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &);
-extern char &AMDGPUFixFunctionBitcastsID;
-
 ModulePass *createAMDGPUCtorDtorLoweringPass();
 void initializeAMDGPUCtorDtorLoweringPass(PassRegistry &);
 extern char &AMDGPUCtorDtorLoweringID;
@@ -303,6 +299,12 @@ extern char &SIMemoryLegalizerID;
 void initializeSIModeRegisterPass(PassRegistry&);
 extern char &SIModeRegisterID;
 
+void initializeAMDGPUReleaseVGPRsPass(PassRegistry &);
+extern char &AMDGPUReleaseVGPRsID;
+
+void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
+extern char &AMDGPUInsertDelayAluID;
+
 void initializeSIInsertHardClausesPass(PassRegistry &);
 extern char &SIInsertHardClausesID;
 
@@ -335,6 +337,9 @@ extern char &GCNNSAReassignID;
 void initializeGCNPreRAOptimizationsPass(PassRegistry &);
 extern char &GCNPreRAOptimizationsID;
 
+FunctionPass *createAMDGPUSetWavePriorityPass();
+void initializeAMDGPUSetWavePriorityPass(PassRegistry &);
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
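The header changes above follow LLVM's usual pass-registration triple: a factory function, an initializer hook, and an extern ID. A minimal sketch of how such hooks are typically consumed — the initializer runs once from the target's global constructor, and a pass pipeline schedules the pass by its ID. (Illustrative only; the exact pipeline placement below is an assumption, not something this diff shows.)

    #include "AMDGPU.h"
    #include "llvm/PassRegistry.h"
    using namespace llvm;

    // Sketch: wiring a newly declared pass into the target. The initializer
    // names come from the header above; where the pass is added in the
    // pipeline is assumed for illustration.
    extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
      PassRegistry &PR = *PassRegistry::getPassRegistry();
      initializeAMDGPUInsertDelayAluPass(PR); // new in this patch
      initializeAMDGPUReleaseVGPRsPass(PR);   // new in this patch
    }

    // A TargetPassConfig would then schedule the pass via its extern ID,
    // e.g.: addPass(&AMDGPUInsertDelayAluID);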
image instructions" +>; + def FeatureExtendedImageInsts : SubtargetFeature<"extended-image-insts", "HasExtendedImageInsts", "true", @@ -536,6 +572,13 @@ def FeatureDot7Insts : SubtargetFeature<"dot7-insts", "Has v_dot2_f32_f16, v_dot4_u32_u8, v_dot8_u32_u4 instructions" >; +def FeatureDot8Insts : SubtargetFeature<"dot8-insts", + "HasDot8Insts", + "true", + "Has v_dot2_f16_f16, v_dot2_bf16_bf16, v_dot2_f32_bf16, " + "v_dot4_i32_iu8, v_dot8_i32_iu4 instructions" +>; + def FeatureMAIInsts : SubtargetFeature<"mai-insts", "HasMAIInsts", "true", @@ -548,11 +591,28 @@ def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", "Has v_pk_fmac_f16 instruction" >; -def FeatureAtomicFaddInsts : SubtargetFeature<"atomic-fadd-insts", - "HasAtomicFaddInsts", +def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts", + "HasAtomicFaddRtnInsts", "true", - "Has buffer_atomic_add_f32, buffer_atomic_pk_add_f16, global_atomic_add_f32, " - "global_atomic_pk_add_f16 instructions", + "Has buffer_atomic_add_f32 and global_atomic_add_f32 instructions that " + "return original value", + [FeatureFlatGlobalInsts] +>; + +def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts", + "HasAtomicFaddNoRtnInsts", + "true", + "Has buffer_atomic_add_f32 and global_atomic_add_f32 instructions that " + "don't return original value", + [FeatureFlatGlobalInsts] +>; + +def FeatureAtomicPkFaddNoRtnInsts + : SubtargetFeature<"atomic-pk-fadd-no-rtn-insts", + "HasAtomicPkFaddNoRtnInsts", + "true", + "Has buffer_atomic_pk_add_f16 and global_atomic_pk_add_f16 instructions that " + "don't return original value", [FeatureFlatGlobalInsts] >; @@ -632,6 +692,12 @@ class SubtargetFeatureNSAMaxSize <int Value> : SubtargetFeature < def FeatureNSAMaxSize5 : SubtargetFeatureNSAMaxSize<5>; def FeatureNSAMaxSize13 : SubtargetFeatureNSAMaxSize<13>; +def FeatureVOPD : SubtargetFeature<"vopd", + "HasVOPDInsts", + "true", + "Has VOPD dual issue wave32 instructions" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -762,7 +828,7 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, - FeatureTrigReducedRange, FeatureExtendedImageInsts + FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts ] >; @@ -772,7 +838,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, - FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess + FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess, + FeatureImageInsts ] >; @@ -787,7 +854,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32, - FeatureUnalignedBufferAccess + FeatureUnalignedBufferAccess, FeatureImageInsts ] >; @@ -824,6 +891,25 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureVOP3Literal, FeatureDPP8, 
FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureGFX10A16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16, + FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts + ] +>; + +def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", + "gfx11", + [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, + FeatureFlatAddressSpace, Feature16BitInsts, + FeatureInv2PiInlineImm, FeatureApertureRegs, + FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts, + FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts, + FeatureGFX11Insts, FeatureVOP3P, FeatureVOPD, FeatureTrue16BitInsts, + FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, + FeatureAddNoCarryInsts, FeatureFmaMixInsts, + FeatureNoSdstCMPX, FeatureVscnt, + FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, + FeatureNoDataDepHazard, FeaturePkFmacF16Inst, + FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess ] >; @@ -910,6 +996,7 @@ def FeatureISAVersion9_0_0 : FeatureSet< FeatureLDSBankCount32, FeatureDsSrc2Insts, FeatureExtendedImageInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; @@ -919,6 +1006,7 @@ def FeatureISAVersion9_0_2 : FeatureSet< FeatureLDSBankCount32, FeatureDsSrc2Insts, FeatureExtendedImageInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; @@ -927,6 +1015,7 @@ def FeatureISAVersion9_0_4 : FeatureSet< FeatureLDSBankCount32, FeatureDsSrc2Insts, FeatureExtendedImageInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureFmaMixInsts, FeatureImageGather4D16Bug]>; @@ -938,6 +1027,7 @@ def FeatureISAVersion9_0_6 : FeatureSet< FeatureLDSBankCount32, FeatureDsSrc2Insts, FeatureExtendedImageInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureDLInsts, FeatureDot1Insts, @@ -953,6 +1043,7 @@ def FeatureISAVersion9_0_8 : FeatureSet< FeatureLDSBankCount32, FeatureDsSrc2Insts, FeatureExtendedImageInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureDLInsts, FeatureDot1Insts, @@ -964,7 +1055,8 @@ def FeatureISAVersion9_0_8 : FeatureSet< FeatureDot7Insts, FeatureMAIInsts, FeaturePkFmacF16Inst, - FeatureAtomicFaddInsts, + FeatureAtomicFaddNoRtnInsts, + FeatureAtomicPkFaddNoRtnInsts, FeatureSupportsSRAMECC, FeatureMFMAInlineLiteralBug, FeatureImageGather4D16Bug]>; @@ -975,6 +1067,7 @@ def FeatureISAVersion9_0_9 : FeatureSet< FeatureLDSBankCount32, FeatureDsSrc2Insts, FeatureExtendedImageInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; @@ -995,7 +1088,10 @@ def FeatureISAVersion9_0_A : FeatureSet< FeaturePackedFP32Ops, FeatureMAIInsts, FeaturePkFmacF16Inst, - FeatureAtomicFaddInsts, + FeatureAtomicFaddRtnInsts, + FeatureAtomicFaddNoRtnInsts, + FeatureAtomicPkFaddNoRtnInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureSupportsSRAMECC, FeaturePackedTID, @@ -1007,9 +1103,36 @@ def FeatureISAVersion9_0_C : FeatureSet< FeatureLDSBankCount32, FeatureDsSrc2Insts, FeatureExtendedImageInsts, + FeatureImageInsts, FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; +def FeatureISAVersion9_4_0 : FeatureSet< + [FeatureGFX9, + FeatureGFX90AInsts, + FeatureGFX940Insts, + FeatureFmaMixInsts, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot3Insts, + FeatureDot4Insts, + FeatureDot5Insts, + FeatureDot6Insts, + FeatureDot7Insts, + Feature64BitDPP, + 
FeaturePackedFP32Ops, + FeatureMAIInsts, + FeaturePkFmacF16Inst, + FeatureAtomicFaddRtnInsts, + FeatureAtomicFaddNoRtnInsts, + FeatureAtomicPkFaddNoRtnInsts, + FeatureSupportsSRAMECC, + FeaturePackedTID, + FeatureArchitectedFlatScratch, + FullRate64Ops]>; + // TODO: Organize more features into groups. def FeatureGroup { // Bugs present on gfx10.1. @@ -1124,6 +1247,33 @@ def FeatureISAVersion10_3_0 : FeatureSet< FeatureWavefrontSize32, FeatureShaderCyclesRegister]>; +def FeatureISAVersion11_Common : FeatureSet< + [FeatureGFX11, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot5Insts, + FeatureDot7Insts, + FeatureDot8Insts, + FeatureNSAEncoding, + FeatureNSAMaxSize5, + FeatureWavefrontSize32, + FeatureShaderCyclesRegister, + FeatureArchitectedFlatScratch, + FeatureAtomicFaddRtnInsts, + FeatureAtomicFaddNoRtnInsts, + FeatureImageInsts, + FeaturePackedTID, + FeatureVcmpxPermlaneHazard]>; + +// Features for GFX 11.0.0 and 11.0.1 +def FeatureISAVersion11_0 : FeatureSet< + !listconcat(FeatureISAVersion11_Common.Features, + [FeatureUserSGPRInit16Bug])>; + +def FeatureISAVersion11_0_2 : FeatureSet< + !listconcat(FeatureISAVersion11_Common.Features, + [FeatureUserSGPRInit16Bug])>; + //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { @@ -1152,8 +1302,10 @@ def AMDGPUAsmVariants { int SDWA9_ID = 3; string DPP = "DPP"; int DPP_ID = 4; + string VOP3_DPP = "VOP3_DPP"; + int VOP3_DPP_ID = 5; string Disable = "Disable"; - int Disable_ID = 5; + int Disable_ID = 6; } def DefaultAMDGPUAsmParserVariant : AsmParserVariant { @@ -1176,12 +1328,16 @@ def SDWA9AsmParserVariant : AsmParserVariant { let Name = AMDGPUAsmVariants.SDWA9; } - def DPPAsmParserVariant : AsmParserVariant { let Variant = AMDGPUAsmVariants.DPP_ID; let Name = AMDGPUAsmVariants.DPP; } +def VOP3_DPPAsmParserVariant : AsmParserVariant { + let Variant = AMDGPUAsmVariants.VOP3_DPP_ID; + let Name = AMDGPUAsmVariants.VOP3_DPP; +} + def AMDGPU : Target { // Pull in Instruction Info: let InstructionSet = AMDGPUInstrInfo; @@ -1190,7 +1346,8 @@ def AMDGPU : Target { VOP3AsmParserVariant, SDWAAsmParserVariant, SDWA9AsmParserVariant, - DPPAsmParserVariant]; + DPPAsmParserVariant, + VOP3_DPPAsmParserVariant]; let AssemblyWriters = [AMDGPUAsmWriter]; let AllowRegisterRenaming = 1; } @@ -1216,6 +1373,12 @@ def isGFX6GFX7GFX10 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<(all_of (not FeatureGCN3Encoding), (not FeatureGFX11Insts))>; + +def isGFX6GFX7GFX10Plus : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding))>; def isGFX7Only : @@ -1225,6 +1388,12 @@ def isGFX7Only : def isGFX7GFX10 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts, (not FeatureGFX11Insts))>; + +def isGFX7GFX10GFX11 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10 ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts)>; def 
isGFX7GFX8GFX9 : @@ -1248,6 +1417,21 @@ def isGFX6GFX7GFX8GFX9NotGFX90A : " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">, AssemblerPredicate<(all_of (not FeatureGFX10Insts), (not FeatureGFX90AInsts))>; +def isGFX6GFX7GFX8GFX9GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9 ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<(all_of (not FeatureGFX11Insts))>; + +def isGFX7GFX8GFX9GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9 ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<(all_of FeatureCIInsts, (not FeatureGFX11Insts))>; + def isGFX7Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<(all_of FeatureCIInsts)>; @@ -1287,18 +1471,37 @@ def isGFX8GFX9NotGFX90A : AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>; def isGFX90AOnly : - Predicate<"Subtarget->hasGFX90AInsts()">, - AssemblerPredicate<(all_of FeatureGFX90AInsts)>; + Predicate<"Subtarget->hasGFX90AInsts() && !Subtarget->hasGFX940Insts()">, + AssemblerPredicate<(all_of FeatureGFX90AInsts, (not FeatureGFX940Insts))>; def isGFX908orGFX90A : - Predicate<"Subtarget->hasMAIInsts()">, - AssemblerPredicate<(all_of FeatureMAIInsts)>; + Predicate<"Subtarget->hasMAIInsts() && !Subtarget->hasGFX940Insts()">, + AssemblerPredicate<(all_of FeatureMAIInsts, (not FeatureGFX940Insts))>; + +def isGFX940Plus : + Predicate<"Subtarget->hasGFX940Insts()">, + AssemblerPredicate<(all_of FeatureGFX940Insts)>; + +def isGFX940GFX11Plus : + Predicate<"Subtarget->hasGFX940Insts() ||" + "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11">, + AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>; + +def isGFX8GFX9NotGFX940 : + Predicate<"!Subtarget->hasGFX940Insts() &&" + "(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">, + AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX940Insts))>; def isGFX8GFX9 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding)>; +def isGFX10Only : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<(all_of FeatureGFX10Insts, (not FeatureGFX11Insts))>; + def isGFX10Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">, AssemblerPredicate<(all_of FeatureGFX10Insts)>; @@ -1308,6 +1511,25 @@ def isGFX10Before1030 : "!Subtarget->hasGFX10_3Insts()">, AssemblerPredicate<(all_of FeatureGFX10Insts,(not FeatureGFX10_3Insts))>; +def isGFX9GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9 ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureGFX11Insts))>; + +def isGFX8GFX9GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9 ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, + 
AssemblerPredicate<(all_of FeatureGFX8Insts, (not FeatureGFX11Insts))>; + +def isGFX11Only : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">, + AssemblerPredicate<(all_of FeatureGFX11Insts)>; + +def isGFX11Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11">, + AssemblerPredicate<(all_of FeatureGFX11Insts)>; + def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, AssemblerPredicate<(all_of FeatureFlatAddressSpace)>; @@ -1321,7 +1543,9 @@ def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<(all_of FeatureGFX9Insts)>; def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">, - AssemblerPredicate<(any_of FeatureGFX10_3Insts)>; + AssemblerPredicate<(any_of FeatureGFX10_3Insts, FeatureGFX940Insts)>; +def HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">, + AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>; def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">, AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>; @@ -1354,6 +1578,11 @@ def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">; def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, AssemblerPredicate<(all_of Feature16BitInsts)>; + +def HasTrue16BitInsts : Predicate<"Subtarget->hasTrue16BitInsts()">, + AssemblerPredicate<(all_of FeatureTrue16BitInsts)>; +def NotHasTrue16BitInsts : Predicate<"!Subtarget->hasTrue16BitInsts()">; + def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<(all_of FeatureVOP3P)>; @@ -1385,7 +1614,10 @@ def HasPackedFP32Ops : Predicate<"Subtarget->hasPackedFP32Ops()">, def HasFmaakFmamkF32Insts : Predicate<"Subtarget->hasFmaakFmamkF32Insts()">, - AssemblerPredicate<(any_of FeatureGFX10Insts)>; + AssemblerPredicate<(any_of FeatureGFX10Insts, FeatureGFX940Insts)>; + +def HasImageInsts : Predicate<"Subtarget->hasImageInsts()">, + AssemblerPredicate<(all_of FeatureImageInsts)>; def HasExtendedImageInsts : Predicate<"Subtarget->hasExtendedImageInsts()">, AssemblerPredicate<(all_of FeatureExtendedImageInsts)>; @@ -1454,6 +1686,9 @@ def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">, def HasDot7Insts : Predicate<"Subtarget->hasDot7Insts()">, AssemblerPredicate<(all_of FeatureDot7Insts)>; +def HasDot8Insts : Predicate<"Subtarget->hasDot8Insts()">, + AssemblerPredicate<(all_of FeatureDot8Insts)>; + def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">, AssemblerPredicate<(all_of FeatureGetWaveIdInst)>; @@ -1478,8 +1713,13 @@ def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">, def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">, AssemblerPredicate<(any_of FeatureGFX10_3Insts)>; -def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">, - AssemblerPredicate<(all_of FeatureAtomicFaddInsts)>; +def HasAtomicFaddRtnInsts : Predicate<"Subtarget->hasAtomicFaddRtnInsts()">, + AssemblerPredicate<(all_of FeatureAtomicFaddRtnInsts)>; +def HasAtomicFaddNoRtnInsts : Predicate<"Subtarget->hasAtomicFaddNoRtnInsts()">, + AssemblerPredicate<(all_of FeatureAtomicFaddNoRtnInsts)>; +def HasAtomicPkFaddNoRtnInsts + : Predicate<"Subtarget->hasAtomicPkFaddNoRtnInsts()">, + AssemblerPredicate<(all_of FeatureAtomicPkFaddNoRtnInsts)>; def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">, AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index 
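Every SubtargetFeature above turns into a boolean member of GCNSubtarget (named by the feature's second template argument) plus an accessor that codegen queries at runtime. A hedged sketch of how the new return/no-return atomic-fadd split might be consumed during lowering — the helper below is invented for illustration; only the accessors follow from the feature definitions:

    #include "GCNSubtarget.h"

    // Hypothetical helper: decide whether global_atomic_add_f32 can be
    // selected, given whether the atomic's result is actually read.
    static bool canSelectGlobalFaddF32(const llvm::GCNSubtarget &ST,
                                       bool ResultIsUsed) {
      // Some targets advertise both variants, others only the no-return
      // form; an fadd whose result is read on the latter must be expanded
      // (e.g. into a CAS loop) instead of selecting the instruction.
      return ResultIsUsed ? ST.hasAtomicFaddRtnInsts()
                          : ST.hasAtomicFaddNoRtnInsts();
    }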
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index bebf032b5535..74be0336851c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -14,12 +14,11 @@
 
 #include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SmallSet.h"
+#include "Utils/AMDGPUMemoryUtils.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/InitializePasses.h"
 
 #define DEBUG_TYPE "amdgpu-annotate-uniform"
@@ -33,8 +32,18 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
   LegacyDivergenceAnalysis *DA;
   MemorySSA *MSSA;
   AliasAnalysis *AA;
-  DenseMap<Value*, GetElementPtrInst*> noClobberClones;
   bool isEntryFunc;
+  bool Changed;
+
+  void setUniformMetadata(Instruction *I) {
+    I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
+    Changed = true;
+  }
+
+  void setNoClobberMetadata(Instruction *I) {
+    I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
+    Changed = true;
+  }
 
 public:
   static char ID;
@@ -54,7 +63,6 @@ public:
 
   void visitBranchInst(BranchInst &I);
   void visitLoadInst(LoadInst &I);
-  bool isClobberedInFunction(LoadInst * Load);
 };
 
 } // End anonymous namespace
@@ -69,88 +77,6 @@ INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
 
 char AMDGPUAnnotateUniformValues::ID = 0;
 
-static void setUniformMetadata(Instruction *I) {
-  I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
-}
-static void setNoClobberMetadata(Instruction *I) {
-  I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
-}
-
-bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
-  MemorySSAWalker *Walker = MSSA->getWalker();
-  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
-  SmallSet<MemoryAccess *, 8> Visited;
-  MemoryLocation Loc(MemoryLocation::get(Load));
-
-  const auto isReallyAClobber = [this, Load](MemoryDef *Def) -> bool {
-    Instruction *DefInst = Def->getMemoryInst();
-    LLVM_DEBUG(dbgs() << "  Def: " << *DefInst << '\n');
-
-    if (isa<FenceInst>(DefInst))
-      return false;
-
-    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
-      switch (II->getIntrinsicID()) {
-      case Intrinsic::amdgcn_s_barrier:
-      case Intrinsic::amdgcn_wave_barrier:
-        return false;
-      default:
-        break;
-      }
-    }
-
-    // Ignore atomics not aliasing with the original load, any atomic is a
-    // universal MemoryDef from MSSA's point of view too, just like a fence.
-    const auto checkNoAlias = [this, Load](auto I) -> bool {
-      return I && AA->isNoAlias(I->getPointerOperand(),
-                                Load->getPointerOperand());
-    };
-
-    if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
-        checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
-      return false;
-
-    return true;
-  };
-
-  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
-
-  // Start with a nearest dominating clobbering access, it will be either
-  // live on entry (nothing to do, load is not clobbered), MemoryDef, or
-  // MemoryPhi if several MemoryDefs can define this memory state. In that
-  // case add all Defs to WorkList and continue going up and checking all
-  // the definitions of this memory location until the root. When all the
-  // defs are exhausted and came to the entry state we have no clobber.
-  // Along the scan ignore barriers and fences which are considered clobbers
-  // by the MemorySSA, but not really writing anything into the memory.
-  while (!WorkList.empty()) {
-    MemoryAccess *MA = WorkList.pop_back_val();
-    if (!Visited.insert(MA).second)
-      continue;
-
-    if (MSSA->isLiveOnEntryDef(MA))
-      continue;
-
-    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
-      if (isReallyAClobber(Def)) {
-        LLVM_DEBUG(dbgs() << "      -> load is clobbered\n");
-        return true;
-      }
-
-      WorkList.push_back(
-          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
-      continue;
-    }
-
-    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
-    for (auto &Use : Phi->incoming_values())
-      WorkList.push_back(cast<MemoryAccess>(&Use));
-  }
-
-  LLVM_DEBUG(dbgs() << "      -> no clobber\n");
-  return false;
-}
-
 void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
   if (DA->isUniform(&I))
     setUniformMetadata(&I);
@@ -160,46 +86,18 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
   Value *Ptr = I.getPointerOperand();
   if (!DA->isUniform(Ptr))
     return;
+
+  Instruction *PtrI = dyn_cast<Instruction>(Ptr);
+  if (PtrI)
+    setUniformMetadata(PtrI);
+
   // We're tracking up to the Function boundaries, and cannot go beyond because
   // of FunctionPass restrictions. We can ensure that is memory not clobbered
   // for memory operations that are live in to entry points only.
-  Instruction *PtrI = dyn_cast<Instruction>(Ptr);
-
-  if (!isEntryFunc) {
-    if (PtrI)
-      setUniformMetadata(PtrI);
+  if (!isEntryFunc)
     return;
-  }
-
-  bool NotClobbered = false;
+
   bool GlobalLoad = I.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
-  if (PtrI)
-    NotClobbered = GlobalLoad && !isClobberedInFunction(&I);
-  else if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
-    if (GlobalLoad && !isClobberedInFunction(&I)) {
-      NotClobbered = true;
-      // Lookup for the existing GEP
-      if (noClobberClones.count(Ptr)) {
-        PtrI = noClobberClones[Ptr];
-      } else {
-        // Create GEP of the Value
-        Function *F = I.getParent()->getParent();
-        Value *Idx = Constant::getIntegerValue(
-            Type::getInt32Ty(Ptr->getContext()), APInt(64, 0));
-        // Insert GEP at the entry to make it dominate all uses
-        PtrI = GetElementPtrInst::Create(I.getType(), Ptr,
-                                         ArrayRef<Value *>(Idx), Twine(""),
-                                         F->getEntryBlock().getFirstNonPHI());
-      }
-      I.replaceUsesOfWith(Ptr, PtrI);
-    }
-  }
-
-  if (PtrI) {
-    setUniformMetadata(PtrI);
-    if (NotClobbered)
-      setNoClobberMetadata(PtrI);
-  }
+  if (GlobalLoad && !AMDGPU::isClobberedInFunction(&I, MSSA, AA))
+    setNoClobberMetadata(&I);
 }
 
 bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
@@ -215,9 +113,9 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
 
+  Changed = false;
   visit(F);
-  noClobberClones.clear();
-  return true;
+  return Changed;
 }
 
 FunctionPass *
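With the MemorySSA clobber walk factored out into Utils/AMDGPUMemoryUtils, this pass now only attaches empty "amdgpu.uniform" / "amdgpu.noclobber" MDNodes. Later passes test for the annotations by metadata presence; roughly like this (sketch only — the two helper functions are invented for illustration):

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // The pass above communicates through instruction metadata, so any later
    // consumer (e.g. selection code choosing scalar vs. vector loads) can ask:
    static bool isUniformPtr(const Instruction *I) {
      return I->getMetadata("amdgpu.uniform") != nullptr;
    }
    static bool isNotClobbered(const LoadInst *LI) {
      // Set only for global-address-space loads in entry functions whose
      // memory provably cannot change between launch and this use.
      return LI->getMetadata("amdgpu.noclobber") != nullptr;
    }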
"llvm/Target/TargetMachine.h" @@ -111,6 +114,12 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { } void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { + IsTargetStreamerInitialized = false; +} + +void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { + IsTargetStreamerInitialized = true; + // TODO: Which one is called first, emitStartOfAsmFile or // emitFunctionBodyStart? if (getTargetStreamer() && !getTargetStreamer()->getTargetID()) @@ -143,6 +152,10 @@ void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { } void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { + // Init target streamer if it has not yet happened + if (!IsTargetStreamerInitialized) + initTargetStreamer(M); + // Following code requires TargetStreamer to be present. if (!getTargetStreamer()) return; @@ -234,8 +247,8 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { auto &ObjectFileInfo = *Context.getObjectFileInfo(); auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection(); - Streamer.PushSection(); - Streamer.SwitchSection(&ReadOnlySection); + Streamer.pushSection(); + Streamer.switchSection(&ReadOnlySection); // CP microcode requires the kernel descriptor to be allocated on 64 byte // alignment. @@ -256,7 +269,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { CurrentProgramInfo.FlatUsed), CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed); - Streamer.PopSection(); + Streamer.popSection(); } void AMDGPUAsmPrinter::emitFunctionEntryLabel() { @@ -319,7 +332,7 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { const DataLayout &DL = GV->getParent()->getDataLayout(); uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); - Align Alignment = GV->getAlign().getValueOr(Align(4)); + Align Alignment = GV->getAlign().value_or(Align(4)); emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); emitLinkage(GV, GVSym); @@ -339,7 +352,7 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) && (STI.getTargetTriple().getOS() == Triple::AMDHSA || STI.getTargetTriple().getOS() == Triple::AMDPAL)) { - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + OutStreamer->switchSection(getObjFileLowering().getTextSection()); getTargetStreamer()->EmitCodeEnd(STI); } @@ -381,7 +394,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; } - if (MFI.hasQueuePtr()) { + if (MFI.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; } @@ -437,6 +450,11 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( } bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + // Init target streamer lazily on the first function so that previous passes + // can set metadata. 
@@ -234,8 +247,8 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
   auto &ObjectFileInfo = *Context.getObjectFileInfo();
   auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
 
-  Streamer.PushSection();
-  Streamer.SwitchSection(&ReadOnlySection);
+  Streamer.pushSection();
+  Streamer.switchSection(&ReadOnlySection);
 
   // CP microcode requires the kernel descriptor to be allocated on 64 byte
   // alignment.
@@ -256,7 +269,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
                                  CurrentProgramInfo.FlatUsed),
       CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
 
-  Streamer.PopSection();
+  Streamer.popSection();
 }
 
 void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
@@ -319,7 +332,7 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
 
   const DataLayout &DL = GV->getParent()->getDataLayout();
   uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
-  Align Alignment = GV->getAlign().getValueOr(Align(4));
+  Align Alignment = GV->getAlign().value_or(Align(4));
 
   emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
   emitLinkage(GV, GVSym);
@@ -339,7 +352,7 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
   if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
       (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
        STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
-    OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+    OutStreamer->switchSection(getObjFileLowering().getTextSection());
     getTargetStreamer()->EmitCodeEnd(STI);
   }
 
@@ -381,7 +394,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
         KernelCodeProperties |=
             amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
   }
-  if (MFI.hasQueuePtr()) {
+  if (MFI.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
     KernelCodeProperties |=
         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
   }
@@ -437,6 +450,11 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
 }
 
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  // Init target streamer lazily on the first function so that previous passes
+  // can set metadata.
+  if (!IsTargetStreamerInitialized)
+    initTargetStreamer(*MF.getFunction().getParent());
+
   ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
   CurrentProgramInfo = SIProgramInfo();
 
@@ -454,7 +472,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
     MCSectionELF *ConfigSection =
         Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
-    OutStreamer->SwitchSection(ConfigSection);
+    OutStreamer->switchSection(ConfigSection);
   }
 
   if (MFI->isModuleEntryFunction()) {
@@ -491,7 +509,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   if (isVerbose()) {
     MCSectionELF *CommentSection =
         Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
-    OutStreamer->SwitchSection(CommentSection);
+    OutStreamer->switchSection(CommentSection);
 
     if (!MFI->isEntryFunction()) {
       OutStreamer->emitRawComment(" Function info:", false);
@@ -590,7 +608,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 
   if (DumpCodeInstEmitter) {
 
-    OutStreamer->SwitchSection(
+    OutStreamer->switchSection(
         Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
 
     for (size_t i = 0; i < DisasmLines.size(); ++i) {
@@ -677,7 +695,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
 
   const uint64_t MaxScratchPerWorkitem =
-      GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize();
+      STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
   if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
     DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
                                           ProgInfo.ScratchSize,
@@ -857,22 +875,18 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
     LDSAlignShift = 9;
   }
 
-  unsigned LDSSpillSize =
-      MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();
-
-  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
+  ProgInfo.LDSSize = MFI->getLDSSize();
   ProgInfo.LDSBlocks =
       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
 
-  // Scratch is allocated in 256 dword blocks.
-  unsigned ScratchAlignShift = 10;
+  // Scratch is allocated in 64-dword or 256-dword blocks.
+  unsigned ScratchAlignShift =
+      STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
   // We need to program the hardware with the amount of scratch memory that
   // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
   // scratch memory used per thread.
-  ProgInfo.ScratchBlocks =
-      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
-              1ULL << ScratchAlignShift) >>
-      ScratchAlignShift;
+  ProgInfo.ScratchBlocks = divideCeil(
+      ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
 
   if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
     ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
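The TMPRING_SIZE granularity changed on GFX11: a scratch "block" is now 64 dwords (256 bytes) per wave instead of 256 dwords (1024 bytes), hence the shift of 8 vs. 10 above. A small self-contained example of the computation (the byte counts are made up for illustration):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t ScratchPerThread = 68; // bytes per work item, from frame info
      for (unsigned Wave : {32u, 64u}) {
        for (unsigned Shift : {8u, 10u}) { // 8 = GFX11+, 10 = pre-GFX11
          uint64_t WaveBytes = ScratchPerThread * Wave;
          // divideCeil: round the whole-wave byte count up to blocks.
          uint64_t Blocks = (WaveBytes + (1ull << Shift) - 1) >> Shift;
          printf("wave%u, block 2^%u bytes: %llu bytes -> %llu blocks\n",
                 Wave, Shift, (unsigned long long)WaveBytes,
                 (unsigned long long)Blocks);
        }
      }
      return 0;
    }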
@@ -886,8 +900,14 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   else if (MFI->hasWorkItemIDY())
     TIDIGCompCnt = 1;
 
+  // The private segment wave byte offset is the last of the system SGPRs. We
+  // initially assumed it was allocated, and may have used it. It shouldn't harm
+  // anything to disable it if we know the stack isn't used here. We may still
+  // have emitted code reading it to initialize scratch, but if that's unused
+  // reading garbage should be OK.
+  const bool EnablePrivateSegment = ProgInfo.ScratchBlocks > 0;
   ProgInfo.ComputePGMRSrc2 =
-      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
+      S_00B84C_SCRATCH_EN(EnablePrivateSegment) |
       S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
       // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
       S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
@@ -931,6 +951,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                          const SIProgramInfo &CurrentProgramInfo) {
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
 
   if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
@@ -942,7 +963,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
     OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);
 
     OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
-    OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks));
+    OutStreamer->emitInt32(
+        STM.getGeneration() >= AMDGPUSubtarget::GFX11
+            ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+            : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
 
     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
     // 0" comment but I don't see a corresponding field in the register spec.
@@ -951,14 +975,18 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
     OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                               S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
     OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
-    OutStreamer->emitIntValue(
-        S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
+    OutStreamer->emitInt32(
+        STM.getGeneration() >= AMDGPUSubtarget::GFX11
+            ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+            : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
   }
 
   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
     OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
-    OutStreamer->emitInt32(
-        S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
+    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
+                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
+                                : CurrentProgramInfo.LDSBlocks;
+    OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
     OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
     OutStreamer->emitInt32(MFI->getPSInputEnable());
     OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
@@ -984,6 +1012,13 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
 
   MD->setEntryPoint(CC, MF.getFunction().getName());
   MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+
+  // Only set AGPRs for supported devices
+  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+  if (STM.hasMAIInsts()) {
+    MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
+  }
+
   MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
   MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
   if (AMDGPU::isCompute(CC)) {
@@ -995,12 +1030,14 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
   // ScratchSize is in bytes, 16 aligned.
   MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
-    MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
+    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
+                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
+                                : CurrentProgramInfo.LDSBlocks;
+    MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
     MD->setSpiPsInputEna(MFI->getPSInputEnable());
     MD->setSpiPsInputAddr(MFI->getPSInputAddr());
   }
 
-  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   if (STM.isWave32())
     MD->setWave32(MF.getFunction().getCallingConv());
 }
@@ -1067,7 +1104,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
   if (MFI->hasDispatchPtr())
     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
 
-  if (MFI->hasQueuePtr())
+  if (MFI->hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5)
     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
 
   if (MFI->hasKernargSegmentPtr())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index d5c60aa3be7d..ddda2cf107b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -77,6 +77,8 @@ private:
                                     const MachineFunction &MF,
                                     const SIProgramInfo &PI) const;
 
+  void initTargetStreamer(Module &M);
+
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
                             std::unique_ptr<MCStreamer> Streamer);
@@ -132,6 +134,7 @@ protected:
 
   std::vector<std::string> DisasmLines, HexLines;
   size_t DisasmLineMaxLen;
+  bool IsTargetStreamerInitialized;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 1e2cf3890d0a..3ccfd9dde269 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -311,6 +311,12 @@ Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
   if (ST->isWave32())
     return V;
 
+  if (ST->hasPermLane64()) {
+    // Reduce across the upper and lower 32 lanes.
+    return buildNonAtomicBinOp(
+        B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
+  }
+
   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =
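On wave64 targets with v_permlane64_b32, the instruction swaps the low and high 32 lanes, so once each 32-lane half holds its own reduction, a single extra binary op finishes the full-wave reduction — replacing the readlane/readlane/scalar-combine fallback. A scalar model of that dataflow (pure illustration; the real code emits llvm.amdgcn.permlane64 through IRBuilder, and the in-half reduction uses DPP rather than this XOR butterfly):

    #include <cstdio>

    int main() {
      unsigned L[64];
      for (unsigned i = 0; i < 64; ++i) L[i] = i + 1;

      // XOR butterfly within each 32-lane half: after offsets 1,2,4,8,16
      // every lane holds the sum of its half.
      for (unsigned off = 1; off < 32; off *= 2) {
        unsigned N[64];
        for (unsigned i = 0; i < 64; ++i) N[i] = L[i] + L[i ^ off];
        for (unsigned i = 0; i < 64; ++i) L[i] = N[i];
      }

      // "permlane64": lane i reads lane i^32, i.e. the halves swap; one
      // more add yields the full 64-lane sum in every lane.
      unsigned S[64];
      for (unsigned i = 0; i < 64; ++i) S[i] = L[i ^ 32];
      for (unsigned i = 0; i < 64; ++i) L[i] += S[i];

      printf("%u\n", L[0]); // 1 + 2 + ... + 64 = 2080
      return 0;
    }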
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
new file mode 100644
index 000000000000..0a2cf3874245
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
@@ -0,0 +1,31 @@
+//===--- AMDGPUAttributes.def ---------------------------------*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains descriptions of the various function attributes
+// that indicate *absence* of the corresponding implicit kernel
+// arguments.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+AMDGPU_ATTRIBUTE(DISPATCH_PTR, "amdgpu-no-dispatch-ptr")
+AMDGPU_ATTRIBUTE(QUEUE_PTR, "amdgpu-no-queue-ptr")
+AMDGPU_ATTRIBUTE(DISPATCH_ID, "amdgpu-no-dispatch-id")
+AMDGPU_ATTRIBUTE(IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr")
+AMDGPU_ATTRIBUTE(MULTIGRID_SYNC_ARG, "amdgpu-no-multigrid-sync-arg")
+AMDGPU_ATTRIBUTE(HOSTCALL_PTR, "amdgpu-no-hostcall-ptr")
+AMDGPU_ATTRIBUTE(HEAP_PTR, "amdgpu-no-heap-ptr")
+AMDGPU_ATTRIBUTE(WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x")
+AMDGPU_ATTRIBUTE(WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y")
+AMDGPU_ATTRIBUTE(WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z")
+AMDGPU_ATTRIBUTE(WORKITEM_ID_X, "amdgpu-no-workitem-id-x")
+AMDGPU_ATTRIBUTE(WORKITEM_ID_Y, "amdgpu-no-workitem-id-y")
+AMDGPU_ATTRIBUTE(WORKITEM_ID_Z, "amdgpu-no-workitem-id-z")
+
+#undef AMDGPU_ATTRIBUTE
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index b4ebc7d7d75f..8de0d7e6bff1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -12,6 +12,7 @@
 
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
@@ -22,37 +23,25 @@
 
 using namespace llvm;
 
-enum ImplicitArgumentMask {
-  NOT_IMPLICIT_INPUT = 0,
+#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
+
+enum ImplicitArgumentPositions {
+  #include "AMDGPUAttributes.def"
+  LAST_ARG_POS
+};
 
-  // SGPRs
-  DISPATCH_PTR = 1 << 0,
-  QUEUE_PTR = 1 << 1,
-  DISPATCH_ID = 1 << 2,
-  IMPLICIT_ARG_PTR = 1 << 3,
-  WORKGROUP_ID_X = 1 << 4,
-  WORKGROUP_ID_Y = 1 << 5,
-  WORKGROUP_ID_Z = 1 << 6,
+#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
 
-  // VGPRS:
-  WORKITEM_ID_X = 1 << 7,
-  WORKITEM_ID_Y = 1 << 8,
-  WORKITEM_ID_Z = 1 << 9,
-  ALL_ARGUMENT_MASK = (1 << 10) - 1
+enum ImplicitArgumentMask {
+  NOT_IMPLICIT_INPUT = 0,
+  #include "AMDGPUAttributes.def"
+  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
 };
 
+#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
 static constexpr std::pair<ImplicitArgumentMask,
                            StringLiteral> ImplicitAttrs[] = {
-  {DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
-  {QUEUE_PTR, "amdgpu-no-queue-ptr"},
-  {DISPATCH_ID, "amdgpu-no-dispatch-id"},
-  {IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
-  {WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
-  {WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
-  {WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
-  {WORKITEM_ID_X, "amdgpu-no-workitem-id-x"},
-  {WORKITEM_ID_Y, "amdgpu-no-workitem-id-y"},
-  {WORKITEM_ID_Z, "amdgpu-no-workitem-id-z"}
+  #include "AMDGPUAttributes.def"
 };
 
 // We do not need to note the x workitem or workgroup id because they are always
 // initialized.
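AMDGPUAttributor.cpp now consumes the new .def file three times with different AMDGPU_ATTRIBUTE definitions, stamping out bit positions, masks, and the attribute-string table from a single list so they can never drift apart. A self-contained model of that X-macro technique (list shortened and inlined here so the example compiles on its own):

    #include <cstdio>

    #define ATTRIBUTE_LIST(X)                                        \
      X(DISPATCH_PTR, "amdgpu-no-dispatch-ptr")                      \
      X(QUEUE_PTR, "amdgpu-no-queue-ptr")                            \
      X(HOSTCALL_PTR, "amdgpu-no-hostcall-ptr")

    // Expansion 1: sequential bit positions.
    #define AS_POS(Name, Str) Name##_POS,
    enum Positions { ATTRIBUTE_LIST(AS_POS) LAST_ARG_POS };

    // Expansion 2: one-hot masks derived from the positions.
    #define AS_MASK(Name, Str) Name = 1 << Name##_POS,
    enum Masks { NOT_IMPLICIT_INPUT = 0, ATTRIBUTE_LIST(AS_MASK)
                 ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1 };

    // Expansion 3: the mask-to-attribute-string table.
    struct Entry { int Mask; const char *Attr; };
    #define AS_ENTRY(Name, Str) {Name, Str},
    static const Entry Table[] = { ATTRIBUTE_LIST(AS_ENTRY) };

    int main() {
      for (const Entry &E : Table)
        printf("mask 0x%x -> %s\n", E.Mask, E.Attr);
      return 0;
    }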
@@ -61,7 +50,9 @@ static constexpr std::pair<ImplicitArgumentMask,
 // TODO: We should not add the attributes if the known compile time workgroup
 // size is 1 for y/z.
 static ImplicitArgumentMask
-intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &IsQueuePtr) {
+intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
+                    bool HasApertureRegs, bool SupportsGetDoorBellID) {
+  unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion();
   switch (ID) {
   case Intrinsic::amdgcn_workitem_id_x:
     NonKernelOnly = true;
@@ -87,13 +78,23 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &IsQueuePtr) {
     return DISPATCH_ID;
   case Intrinsic::amdgcn_implicitarg_ptr:
     return IMPLICIT_ARG_PTR;
+  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
+  // queue_ptr.
   case Intrinsic::amdgcn_queue_ptr:
+    NeedsImplicit = (CodeObjectVersion == 5);
+    return QUEUE_PTR;
   case Intrinsic::amdgcn_is_shared:
   case Intrinsic::amdgcn_is_private:
-    // TODO: Does not require queue ptr on gfx9+
+    if (HasApertureRegs)
+      return NOT_IMPLICIT_INPUT;
+    // Under V5, we need implicitarg_ptr + offsets to access private_base or
+    // shared_base. For pre-V5, however, need to access them through queue_ptr +
+    // offsets.
+    return CodeObjectVersion == 5 ? IMPLICIT_ARG_PTR : QUEUE_PTR;
   case Intrinsic::trap:
-  case Intrinsic::debugtrap:
-    IsQueuePtr = true;
+    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
+      return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR;
+    NeedsImplicit = (CodeObjectVersion == 5); // Need impicitarg_ptr under V5.
     return QUEUE_PTR;
   default:
     return NOT_IMPLICIT_INPUT;
@@ -114,7 +115,7 @@ static bool isDSAddress(const Constant *C) {
 
 /// Returns true if the function requires the implicit argument be passed
 /// regardless of the function contents.
-static bool funcRequiresImplicitArgPtr(const Function &F) {
+static bool funcRequiresHostcallPtr(const Function &F) {
   // Sanitizers require the hostcall buffer passed in the implicit arguments.
   return F.hasFnAttribute(Attribute::SanitizeAddress) ||
          F.hasFnAttribute(Attribute::SanitizeThread) ||
@@ -140,6 +141,12 @@ public:
     return ST.hasApertureRegs();
   }
 
+  /// Check if the subtarget supports GetDoorbellID.
+  bool supportsGetDoorbellID(Function &F) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    return ST.supportsGetDoorbellID();
+  }
+
   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
     return ST.getFlatWorkGroupSizes(F);
@@ -152,7 +159,7 @@ public:
   }
 
 private:
-  /// Check if the ConstantExpr \p CE requires queue ptr attribute.
+  /// Check if the ConstantExpr \p CE requires the queue pointer.
   static bool visitConstExpr(const ConstantExpr *CE) {
     if (CE->getOpcode() == Instruction::AddrSpaceCast) {
       unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
@@ -186,7 +193,7 @@ private:
   }
 
 public:
-  /// Returns true if \p Fn needs a queue ptr attribute because of \p C.
+  /// Returns true if \p Fn needs the queue pointer because of \p C.
   bool needsQueuePtr(const Constant *C, Function &Fn) {
     bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
     bool HasAperture = hasApertureRegs(Fn);
@@ -205,7 +212,7 @@ public:
   }
 
 private:
-  /// Used to determine if the Constant needs a queue ptr attribute.
+  /// Used to determine if the Constant needs the queue pointer.
   DenseMap<const Constant *, uint8_t> ConstantStatus;
 };
 
@@ -353,12 +360,15 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
 
     // If the function requires the implicit arg pointer due to sanitizers,
     // assume it's needed even if explicitly marked as not requiring it.
-    const bool NeedsImplicit = funcRequiresImplicitArgPtr(*F);
-    if (NeedsImplicit)
+    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
+    if (NeedsHostcall) {
       removeAssumedBits(IMPLICIT_ARG_PTR);
+      removeAssumedBits(HOSTCALL_PTR);
+    }
 
     for (auto Attr : ImplicitAttrs) {
-      if (NeedsImplicit && Attr.first == IMPLICIT_ARG_PTR)
+      if (NeedsHostcall &&
+          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
         continue;
 
       if (F->hasFnAttribute(Attr.second))
@@ -388,9 +398,11 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       return indicatePessimisticFixpoint();
 
     bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
-    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
 
-    bool NeedsQueuePtr = false;
+    bool NeedsImplicit = false;
+    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
+    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
 
     for (Function *Callee : AAEdges.getOptimisticEdges()) {
       Intrinsic::ID IID = Callee->getIntrinsicID();
@@ -403,20 +415,87 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
 
       bool NonKernelOnly = false;
       ImplicitArgumentMask AttrMask =
-          intrinsicToAttrMask(IID, NonKernelOnly, NeedsQueuePtr);
+          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
+                              HasApertureRegs, SupportsGetDoorbellID);
       if (AttrMask != NOT_IMPLICIT_INPUT) {
         if ((IsNonEntryFunc || !NonKernelOnly))
           removeAssumedBits(AttrMask);
       }
     }
 
-    // If we found that we need amdgpu-queue-ptr, nothing else to do.
-    if (NeedsQueuePtr) {
+    // Need implicitarg_ptr to acess queue_ptr, private_base, and shared_base.
+    if (NeedsImplicit)
+      removeAssumedBits(IMPLICIT_ARG_PTR);
+
+    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
+      // Under V5, we need implicitarg_ptr + offsets to access private_base or
+      // shared_base. We do not actually need queue_ptr.
+      if (AMDGPU::getAmdhsaCodeObjectVersion() == 5)
+        removeAssumedBits(IMPLICIT_ARG_PTR);
+      else
+        removeAssumedBits(QUEUE_PTR);
+    }
+
+    if (funcRetrievesMultigridSyncArg(A)) {
+      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
+             "multigrid_sync_arg needs implicitarg_ptr");
+      removeAssumedBits(MULTIGRID_SYNC_ARG);
+    }
+
+    if (funcRetrievesHostcallPtr(A)) {
+      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
+      removeAssumedBits(HOSTCALL_PTR);
+    }
+
+    if (funcRetrievesHeapPtr(A)) {
+      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
+      removeAssumedBits(HEAP_PTR);
+    }
+
+    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) {
+      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
       removeAssumedBits(QUEUE_PTR);
-      return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
-                                           ChangeStatus::UNCHANGED;
     }
 
+    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
+                                       : ChangeStatus::UNCHANGED;
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    SmallVector<Attribute, 8> AttrList;
+    LLVMContext &Ctx = getAssociatedFunction()->getContext();
+
+    for (auto Attr : ImplicitAttrs) {
+      if (isKnown(Attr.first))
+        AttrList.push_back(Attribute::get(Ctx, Attr.second));
+    }
+
+    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
+                                              /* ForceReplace */ true);
+  }
+
+  const std::string getAsStr() const override {
+    std::string Str;
+    raw_string_ostream OS(Str);
+    OS << "AMDInfo[";
+    for (auto Attr : ImplicitAttrs)
+      OS << ' ' << Attr.second;
+    OS << " ]";
+    return OS.str();
+  }
+
+  /// See AbstractAttribute::trackStatistics()
+  void trackStatistics() const override {}
+
+private:
+  bool checkForQueuePtr(Attributor &A) {
+    Function *F = getAssociatedFunction();
+    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
+
+    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+
+    bool NeedsQueuePtr = false;
+
     auto CheckAddrSpaceCasts = [&](Instruction &I) {
       unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
       if (castRequiresQueuePtr(SrcAS)) {
@@ -431,7 +510,7 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
 
     // `checkForAllInstructions` is much more cheaper than going through all
     // instructions, try it first.
-    // amdgpu-queue-ptr is not needed if aperture regs is present.
+    // The queue pointer is not needed if aperture regs is present.
     if (!HasApertureRegs) {
       bool UsedAssumedInformation = false;
       A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
@@ -439,61 +518,79 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
                                 UsedAssumedInformation);
     }
 
-    // If we found that we need amdgpu-queue-ptr, nothing else to do.
-    if (NeedsQueuePtr) {
-      removeAssumedBits(QUEUE_PTR);
-      return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
-                                           ChangeStatus::UNCHANGED;
-    }
+    // If we found that we need the queue pointer, nothing else to do.
+    if (NeedsQueuePtr)
+      return true;
 
-    if (!IsNonEntryFunc && HasApertureRegs) {
-      return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
-                                           ChangeStatus::UNCHANGED;
-    }
+    if (!IsNonEntryFunc && HasApertureRegs)
+      return false;
 
     for (BasicBlock &BB : *F) {
       for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
-            if (InfoCache.needsQueuePtr(C, *F)) {
-              removeAssumedBits(QUEUE_PTR);
-              return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
-                                                   ChangeStatus::UNCHANGED;
-            }
+            if (InfoCache.needsQueuePtr(C, *F))
+              return true;
          }
        }
      }
    }
 
-    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
-                                         ChangeStatus::UNCHANGED;
+    return false;
   }
 
-  ChangeStatus manifest(Attributor &A) override {
-    SmallVector<Attribute, 8> AttrList;
-    LLVMContext &Ctx = getAssociatedFunction()->getContext();
+  bool funcRetrievesMultigridSyncArg(Attributor &A) {
+    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition();
+    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
+    return funcRetrievesImplicitKernelArg(A, OAS);
+  }
 
-    for (auto Attr : ImplicitAttrs) {
-      if (isKnown(Attr.first))
-        AttrList.push_back(Attribute::get(Ctx, Attr.second));
-    }
+  bool funcRetrievesHostcallPtr(Attributor &A) {
+    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition();
+    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
+    return funcRetrievesImplicitKernelArg(A, OAS);
+  }
 
-    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
-                                              /* ForceReplace */ true);
+  bool funcRetrievesHeapPtr(Attributor &A) {
+    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
+      return false;
+    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
+    return funcRetrievesImplicitKernelArg(A, OAS);
+  }
 
-  const std::string getAsStr() const override {
-    std::string Str;
-    raw_string_ostream OS(Str);
-    OS << "AMDInfo[";
-    for (auto Attr : ImplicitAttrs)
-      OS << ' ' << Attr.second;
-    OS << " ]";
-    return OS.str();
+  bool funcRetrievesQueuePtr(Attributor &A) {
+    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
+      return false;
+    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
+    return funcRetrievesImplicitKernelArg(A, OAS);
+  }
 
-  /// See AbstractAttribute::trackStatistics()
-  void trackStatistics() const override {}
+  bool funcRetrievesImplicitKernelArg(Attributor &A,
+                                      AAPointerInfo::OffsetAndSize OAS) {
+    // Check if this is a call to the implicitarg_ptr builtin and it
+    // is used to retrieve the hostcall pointer. The implicit arg for
+    // hostcall is not used only if every use of the implicitarg_ptr
+    // is a load that clearly does not retrieve any byte of the
+    // hostcall pointer. We check this by tracing all the uses of the
+    // initial call to the implicitarg_ptr intrinsic.
+    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
+      auto &Call = cast<CallBase>(I);
+      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
+        return true;
+
+      const auto &PointerInfoAA = A.getAAFor<AAPointerInfo>(
+          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
+
+      return PointerInfoAA.forallInterferingAccesses(
+          OAS, [](const AAPointerInfo::Access &Acc, bool IsExact) {
+            return Acc.getRemoteInst()->isDroppable();
+          });
+    };
+
+    bool UsedAssumedInformation = false;
+    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
+                                              UsedAssumedInformation);
+  }
 };
 
 AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
@@ -646,9 +743,14 @@ public:
     AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
     DenseSet<const char *> Allowed(
         {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
-         &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID});
+         &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID, &AAPointerInfo::ID});
+
+    AttributorConfig AC(CGUpdater);
+    AC.Allowed = &Allowed;
+    AC.IsModulePass = true;
+    AC.DefaultInitializeLiveInternals = false;
 
-    Attributor A(Functions, InfoCache, CGUpdater, &Allowed);
+    Attributor A(Functions, InfoCache, AC);
 
     for (Function &F : M) {
       if (!F.isIntrinsic()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index cd084fd5440a..fd812eb676ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 
 #define DEBUG_TYPE "amdgpu-call-lowering"
@@ -349,7 +350,6 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
                                      FunctionLoweringInfo &FLI) const {
 
   MachineFunction &MF = B.getMF();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   MFI->setIfReturnsVoid(!Val);
 
@@ -365,40 +365,15 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
     return true;
   }
 
-  auto const &ST = MF.getSubtarget<GCNSubtarget>();
-
-  unsigned ReturnOpc = 0;
-  if (IsShader)
-    ReturnOpc = AMDGPU::SI_RETURN_TO_EPILOG;
-  else if (CC == CallingConv::AMDGPU_Gfx)
-    ReturnOpc = AMDGPU::S_SETPC_B64_return_gfx;
-  else
-    ReturnOpc = AMDGPU::S_SETPC_B64_return;
-
+  unsigned ReturnOpc =
+      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN;
   auto Ret = B.buildInstrNoInsert(ReturnOpc);
-  Register ReturnAddrVReg;
-  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
-    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
-    Ret.addUse(ReturnAddrVReg);
-  } else if (ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) {
-    ReturnAddrVReg =
-        MRI.createVirtualRegister(&AMDGPU::Gfx_CCR_SGPR_64RegClass);
-    Ret.addUse(ReturnAddrVReg);
-  }
 
   if (!FLI.CanLowerReturn)
     insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
   else if (!lowerReturnVal(B, Val, VRegs, Ret))
     return false;
 
-  if (ReturnOpc == AMDGPU::S_SETPC_B64_return ||
-      ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) {
-    const SIRegisterInfo *TRI = ST.getRegisterInfo();
-    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
-                                         &AMDGPU::SGPR_64RegClass);
-    B.buildCopy(ReturnAddrVReg, LiveInReturn);
-  }
-
   // TODO: Handle CalleeSavedRegsViaCopy.
 
   B.insertInstr(Ret);
@@ -479,7 +454,7 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
     CCInfo.AllocateReg(DispatchPtrReg);
   }
 
-  if (Info.hasQueuePtr()) {
+  if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
     Register QueuePtrReg = Info.addQueuePtr(TRI);
     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
     CCInfo.AllocateReg(QueuePtrReg);
@@ -523,7 +498,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  Info->allocateModuleLDSGlobal(F.getParent());
+  Info->allocateModuleLDSGlobal(F);
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -543,9 +518,8 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
     if (AllocSize == 0)
       continue;
 
-    MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
-    if (!ABIAlign)
-      ABIAlign = DL.getABITypeAlign(ArgTy);
+    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : None;
+    Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
 
     uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
     ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
@@ -608,19 +582,11 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  Info->allocateModuleLDSGlobal(F.getParent());
+  Info->allocateModuleLDSGlobal(F);
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
 
-  if (!IsEntryFunc) {
-    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
-    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
-                                         &AMDGPU::SGPR_64RegClass);
-    MBB.addLiveIn(ReturnAddrReg);
-    B.buildCopy(LiveInReturn, ReturnAddrReg);
-  }
-
   if (Info->hasImplicitBufferPtr()) {
     Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 1682d43ae671..b6c66077675f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -148,53 +148,32 @@ def CSR_AMDGPU_VGPRs : CalleeSavedRegs<
   (sequence "VGPR%u", 248, 255))
 >;
 
-def CSR_AMDGPU_AGPRs_32_255 : CalleeSavedRegs<
+def CSR_AMDGPU_AGPRs : CalleeSavedRegs<
   (sequence "AGPR%u", 32, 255)
 >;
 
-def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
-  (sequence "SGPR%u", 32, 105)
+def CSR_AMDGPU_SGPRs : CalleeSavedRegs<
+  (sequence "SGPR%u", 30, 105)
 >;
 
-def CSR_AMDGPU_SI_Gfx_SGPRs_4_29 : CalleeSavedRegs<
-  (sequence "SGPR%u", 4, 29)
+def CSR_AMDGPU_SI_Gfx_SGPRs : CalleeSavedRegs<
+  (add (sequence "SGPR%u", 4, 31), (sequence "SGPR%u", 64, 105))
 >;
 
-def CSR_AMDGPU_SI_Gfx_SGPRs_64_105 : CalleeSavedRegs<
-  (sequence "SGPR%u", 64, 105)
+def CSR_AMDGPU : CalleeSavedRegs<
+  (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs)
 >;
 
-// Just to get the regmask, not for calling convention purposes.
-def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs<
-  (sequence "VGPR%u", 0, 255)
->;
-
-def CSR_AMDGPU_AllAGPRs : CalleeSavedRegs<
-  (sequence "AGPR%u", 0, 255)
->;
-
-def CSR_AMDGPU_AllVectorRegs : CalleeSavedRegs<
-  (add CSR_AMDGPU_AllVGPRs, CSR_AMDGPU_AllAGPRs)
->;
-
-// Just to get the regmask, not for calling convention purposes.
-def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs<
-  (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI)
->;
-
-def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
-  (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105)
->;
-
-def CSR_AMDGPU_HighRegs_With_AGPRs : CalleeSavedRegs<
-  (add CSR_AMDGPU_HighRegs, CSR_AMDGPU_AGPRs_32_255)
+def CSR_AMDGPU_GFX90AInsts : CalleeSavedRegs<
+  (add CSR_AMDGPU, CSR_AMDGPU_AGPRs)
 >;
 
 def CSR_AMDGPU_SI_Gfx : CalleeSavedRegs<
-  (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SI_Gfx_SGPRs_4_29, CSR_AMDGPU_SI_Gfx_SGPRs_64_105)
+  (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SI_Gfx_SGPRs)
 >;
 
-def CSR_AMDGPU_SI_Gfx_With_AGPRs : CalleeSavedRegs<
-  (add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs_32_255)
+def CSR_AMDGPU_SI_Gfx_GFX90AInsts : CalleeSavedRegs<
+  (add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs)
 >;
 
 def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
@@ -233,3 +212,24 @@ def CC_AMDGPU : CallingConv<[
        "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
       CCDelegateTo<CC_AMDGPU_Func>>
 ]>;
+
+// Trivial class to denote when a def is used only to get a RegMask, i.e.
+// SaveList is ignored and the def is not used as part of any calling
+// convention.
+class RegMask<dag mask> : CalleeSavedRegs<mask>;
+
+def AMDGPU_AllVGPRs : RegMask<
+  (sequence "VGPR%u", 0, 255)
+>;
+
+def AMDGPU_AllAGPRs : RegMask<
+  (sequence "AGPR%u", 0, 255)
+>;
+
+def AMDGPU_AllVectorRegs : RegMask<
+  (add AMDGPU_AllVGPRs, AMDGPU_AllAGPRs)
+>;
+
+def AMDGPU_AllAllocatableSRegs : RegMask<
+  (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI)
+>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 1920684d8f1f..94d7844e8a32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -877,7 +877,7 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
   return getMul64(Builder, LHS, RHS).second;
 }
 
-/// Figure out how many bits are really needed for this ddivision. \p AtLeast is
+/// Figure out how many bits are really needed for this division. \p AtLeast is
 /// an optimization hint to bypass the second ComputeNumSignBits call if we the
 /// first one is insufficient. Returns -1 on failure.
int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp index e79ff9b597c9..c16d8ee51a7a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -373,7 +373,8 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI, replaceRegWith(MRI, Dst, NegatedMatchInfo); // Recreate non negated value for other uses of old MatchInfoDst - Builder.setInstrAndDebugLoc(MI); + auto NextInst = ++MatchInfo->getIterator(); + Builder.setInstrAndDebugLoc(*NextInst); Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags()); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp index 04bf623bfa46..8fcf669041b9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp @@ -50,7 +50,7 @@ public: } bool createInitOrFiniKernel(Module &M, GlobalVariable *GV, bool IsCtor) { - if (!GV) + if (!GV || !GV->hasInitializer()) return false; ConstantArray *GA = dyn_cast<ConstantArray>(GV->getInitializer()); if (!GA || GA->getNumOperands() == 0) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp index bed0707f3aa7..8236ff609f85 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp @@ -22,7 +22,7 @@ namespace { class ExportClustering : public ScheduleDAGMutation { public: - ExportClustering() {} + ExportClustering() = default; void apply(ScheduleDAGInstrs *DAG) override; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp deleted file mode 100644 index ea6c6d0fd212..000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp +++ /dev/null @@ -1,64 +0,0 @@ -//===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// Promote indirect (bitcast) calls to direct calls when they are statically -/// known to be direct. Required when InstCombine is not run (e.g. at OptNone) -/// because AMDGPU does not support indirect calls. 
-/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/IR/InstVisitor.h" -#include "llvm/Pass.h" -#include "llvm/Transforms/Utils/CallPromotionUtils.h" - -using namespace llvm; - -#define DEBUG_TYPE "amdgpu-fix-function-bitcasts" - -namespace { -class AMDGPUFixFunctionBitcasts final - : public ModulePass, - public InstVisitor<AMDGPUFixFunctionBitcasts> { - - bool runOnModule(Module &M) override; - - bool Modified; - -public: - void visitCallBase(CallBase &CB) { - if (CB.getCalledFunction()) - return; - auto *Callee = - dyn_cast<Function>(CB.getCalledOperand()->stripPointerCasts()); - if (Callee && isLegalToPromote(CB, Callee)) { - promoteCall(CB, Callee); - Modified = true; - } - } - - static char ID; - AMDGPUFixFunctionBitcasts() : ModulePass(ID) {} -}; -} // End anonymous namespace - -char AMDGPUFixFunctionBitcasts::ID = 0; -char &llvm::AMDGPUFixFunctionBitcastsID = AMDGPUFixFunctionBitcasts::ID; -INITIALIZE_PASS(AMDGPUFixFunctionBitcasts, DEBUG_TYPE, - "Fix function bitcasts for AMDGPU", false, false) - -ModulePass *llvm::createAMDGPUFixFunctionBitcastsPass() { - return new AMDGPUFixFunctionBitcasts(); -} - -bool AMDGPUFixFunctionBitcasts::runOnModule(Module &M) { - Modified = false; - visit(M); - return Modified; -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 7fd94a977be7..5747fc0ca8e6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -47,10 +47,30 @@ def gi_vop3pmods : GIComplexOperandMatcher<s32, "selectVOP3PMods">, GIComplexPatternEquiv<VOP3PMods>; +def gi_vop3pmodsdot : + GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">, + GIComplexPatternEquiv<VOP3PModsDOT>; + +def gi_dotiuvop3pmods : + GIComplexOperandMatcher<s32, "selectDotIUVOP3PMods">, + GIComplexPatternEquiv<DotIUVOP3PMods>; + +def gi_wmmaopselvop3pmods : + GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">, + GIComplexPatternEquiv<WMMAOpSelVOP3PMods>; + def gi_vop3opselmods : GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">, GIComplexPatternEquiv<VOP3OpSelMods>; +def gi_vinterpmods : + GIComplexOperandMatcher<s32, "selectVINTERPMods">, + GIComplexPatternEquiv<VINTERPMods>; + +def gi_vinterpmods_hi : + GIComplexOperandMatcher<s32, "selectVINTERPModsHi">, + GIComplexPatternEquiv<VINTERPModsHi>; + // FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods? def gi_vop3opsel : GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">, @@ -93,6 +113,10 @@ def gi_flat_scratch_saddr : GIComplexOperandMatcher<s32, "selectScratchSAddr">, GIComplexPatternEquiv<ScratchSAddr>; +def gi_flat_scratch_svaddr : + GIComplexOperandMatcher<s32, "selectScratchSVAddr">, + GIComplexPatternEquiv<ScratchSVAddr>; + def gi_ds_1addr_1offset : GIComplexOperandMatcher<s32, "selectDS1Addr1Offset">, GIComplexPatternEquiv<DS1Addr1Offset>; @@ -123,7 +147,7 @@ def gi_smrd_buffer_imm32 : // Separate load nodes are defined to glue m0 initialization in // SelectionDAG. The GISel selector can just insert m0 initialization -// directly before before selecting a glue-less load, so hide this +// directly before selecting a glue-less load, so hide this // distinction. 
def : GINodeEquiv<G_LOAD, AMDGPUld_glue> { @@ -222,6 +246,9 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>; +def : GINodeEquiv<G_FPTRUNC_ROUND_UPWARD, SIfptrunc_round_upward>; +def : GINodeEquiv<G_FPTRUNC_ROUND_DOWNWARD, SIfptrunc_round_downward>; + class GISelSop2Pat < SDPatternOperator node, Instruction inst, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp index cabdc6998011..1bbdc39a7a5e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -7,8 +7,10 @@ //===----------------------------------------------------------------------===// #include "AMDGPUGlobalISelUtils.h" +#include "GCNSubtarget.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/IR/Constants.h" +#include "llvm/Support/LowLevelTypeImpl.h" using namespace llvm; using namespace MIPatternMatch; @@ -66,3 +68,12 @@ bool AMDGPU::isLegalVOP3PShuffleMask(ArrayRef<int> Mask) { return true; return (Mask[0] & 2) == (Mask[1] & 2); } + +bool AMDGPU::hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, + const LLT &Ty) { + if (Ty == LLT::scalar(32)) + return Subtarget.hasAtomicFaddRtnInsts(); + if (Ty == LLT::fixed_vector(2, 16) || Ty == LLT::scalar(64)) + return Subtarget.hasGFX90AInsts(); + return false; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h index 14d3a3fb7997..5c600d059b7a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h @@ -16,6 +16,8 @@ namespace llvm { class MachineRegisterInfo; +class GCNSubtarget; +class LLT; namespace AMDGPU { @@ -24,7 +26,7 @@ std::pair<Register, unsigned> getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg); bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask); - +bool hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, const LLT &Ty); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index f5018e3a19ac..6fa44ffcbfaa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -400,17 +400,15 @@ void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func, auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); - // Emit "printf buffer" argument if printf is used, otherwise emit dummy - // "none" argument. if (HiddenArgNumBytes >= 32) { + // We forbid the use of features requiring hostcall when compiling OpenCL + // before code object V5, which makes the mutual exclusion between the + // "printf buffer" and "hostcall buffer" here sound. if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenPrintfBuffer); - else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) { - // The printf runtime binding pass should have ensured that hostcall and - // printf are not used in the same module. 
- assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts")); + else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenHostcallBuffer); - } else + else emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); } @@ -427,8 +425,12 @@ void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func, } // Emit the pointer argument for multi-grid object. - if (HiddenArgNumBytes >= 56) - emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenMultiGridSyncArg); + if (HiddenArgNumBytes >= 56) { + if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenMultiGridSyncArg); + else + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); + } } bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) { @@ -803,6 +805,8 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const MachineFunction &MF, auto &DL = M->getDataLayout(); auto Int64Ty = Type::getInt64Ty(Func.getContext()); + Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr()); + if (HiddenArgNumBytes >= 8) emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, Args); @@ -816,19 +820,17 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const MachineFunction &MF, auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); - // Emit "printf buffer" argument if printf is used, emit "hostcall buffer" - // if "hostcall" module flag is set, otherwise emit dummy "none" argument. if (HiddenArgNumBytes >= 32) { + // We forbid the use of features requiring hostcall when compiling OpenCL + // before code object V5, which makes the mutual exclusion between the + // "printf buffer" and "hostcall buffer" here sound. if (M->getNamedMetadata("llvm.printf.fmts")) emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset, Args); - else if (M->getModuleFlag("amdgpu_hostcall")) { - // The printf runtime binding pass should have ensured that hostcall and - // printf are not used in the same module. - assert(!M->getNamedMetadata("llvm.printf.fmts")); + else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset, Args); - } else + else emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); } @@ -847,9 +849,14 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const MachineFunction &MF, } // Emit the pointer argument for multi-grid object. 
- if (HiddenArgNumBytes >= 56) - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset, - Args); + if (HiddenArgNumBytes >= 56) { + if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) { + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset, + Args); + } else { + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); + } + } } msgpack::MapDocNode @@ -876,6 +883,12 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF, Kern.getDocument()->getNode(STM.getWavefrontSize()); Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR); Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR); + + // Only add AGPR count to metadata for supported devices + if (STM.hasMAIInsts()) { + Kern[".agpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumAccVGPR); + } + Kern[".max_flat_workgroup_size"] = Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize()); Kern[".sgpr_spill_count"] = @@ -971,13 +984,20 @@ void MetadataStreamerV5::emitHiddenKernelArgs(const MachineFunction &MF, msgpack::ArrayDocNode Args) { auto &Func = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + + // No implicit kernel argument is used. + if (ST.getImplicitArgNumBytes(Func) == 0) + return; + const Module *M = Func.getParent(); auto &DL = M->getDataLayout(); + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); auto Int64Ty = Type::getInt64Ty(Func.getContext()); auto Int32Ty = Type::getInt32Ty(Func.getContext()); auto Int16Ty = Type::getInt16Ty(Func.getContext()); + Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr()); emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset, Args); emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset, Args); emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset, Args); @@ -1008,40 +1028,49 @@ void MetadataStreamerV5::emitHiddenKernelArgs(const MachineFunction &MF, if (M->getNamedMetadata("llvm.printf.fmts")) { emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset, Args); - } else + } else { Offset += 8; // Skipped. + } - if (M->getModuleFlag("amdgpu_hostcall")) { + if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) { emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset, Args); - } else + } else { Offset += 8; // Skipped. + } - emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset, + if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) { + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset, Args); + } else { + Offset += 8; // Skipped. + } - // Ignore temporarily until it is implemented. - // emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args); - Offset += 8; + if (!Func.hasFnAttribute("amdgpu-no-heap-ptr")) + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args); + else + Offset += 8; // Skipped. if (Func.hasFnAttribute("calls-enqueue-kernel")) { emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset, Args); emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset, Args); - } else + } else { Offset += 16; // Skipped. + } Offset += 72; // Reserved. - // hidden_private_base and hidden_shared_base are only used by GFX8. - if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // hidden_private_base and hidden_shared_base are only used when the subtarget + // does not have ApertureRegs.
+ if (!ST.hasApertureRegs()) { emitKernelArg(DL, Int32Ty, Align(4), "hidden_private_base", Offset, Args); emitKernelArg(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, Args); - } else + } else { Offset += 8; // Skipped. + } - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); if (MFI.hasQueuePtr()) emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index bcf7fc449094..9b22d1f4d1b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -42,7 +42,7 @@ namespace HSAMD { class MetadataStreamer { public: - virtual ~MetadataStreamer(){}; + virtual ~MetadataStreamer() = default; virtual bool emitTo(AMDGPUTargetStreamer &TargetStreamer) = 0; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp new file mode 100644 index 000000000000..5c507ef70a8c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -0,0 +1,439 @@ +//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file This file defines a set of schedule DAG mutations that can be used to +// override default scheduler behavior to enforce specific scheduling patterns. +// They should be used in cases where runtime performance considerations, such as +// inter-wavefront interactions, mean that compile-time heuristics cannot +// predict the optimal instruction ordering, or in kernels where optimum +// instruction scheduling is important enough to warrant manual intervention.
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUIGroupLP.h" +#include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/BitmaskEnum.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/TargetOpcodes.h" + +using namespace llvm; + +#define DEBUG_TYPE "machine-scheduler" + +namespace { + +static cl::opt<bool> + EnableIGroupLP("amdgpu-igrouplp", + cl::desc("Enable construction of Instruction Groups and " + "their ordering for scheduling"), + cl::init(false)); + +static cl::opt<Optional<unsigned>> + VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in VMEM group.")); + +static cl::opt<Optional<unsigned>> + MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in MFMA group.")); + +static cl::opt<Optional<unsigned>> + LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in lds/gds read group.")); + +static cl::opt<Optional<unsigned>> + LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in lds/gds write group.")); + +typedef function_ref<bool(const MachineInstr &, const SIInstrInfo *)> + CanAddMIFn; + +// Classify instructions into groups to enable fine tuned control over the +// scheduler. These groups may be more specific than current SchedModel +// instruction classes. +class SchedGroup { +private: + // Function that returns true if a non-bundle MI may be inserted into this + // group. + const CanAddMIFn canAddMI; + + // Maximum number of SUnits that can be added to this group. + Optional<unsigned> MaxSize; + + // Collection of SUnits that are classified as members of this group. + SmallVector<SUnit *, 32> Collection; + + ScheduleDAGInstrs *DAG; + + void tryAddEdge(SUnit *A, SUnit *B) { + if (A != B && DAG->canAddEdge(B, A)) { + DAG->addEdge(B, SDep(A, SDep::Artificial)); + LLVM_DEBUG(dbgs() << "Adding edge...\n" + << "from: SU(" << A->NodeNum << ") " << *A->getInstr() + << "to: SU(" << B->NodeNum << ") " << *B->getInstr()); + } + } + +public: + // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If + // MakePred is true, SU will be a predecessor of the SUnits in this + // SchedGroup, otherwise SU will be a successor. + void link(SUnit &SU, bool MakePred = false) { + for (auto A : Collection) { + SUnit *B = &SU; + if (MakePred) + std::swap(A, B); + + tryAddEdge(A, B); + } + } + + // Add DAG dependencies from all SUnits in this SchedGroup and this SU. Use + // the predicate to determine whether SU should be a predecessor (P = true) + // or a successor (P = false) of this SchedGroup. + void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P) { + for (auto A : Collection) { + SUnit *B = &SU; + if (P(A, B)) + std::swap(A, B); + + tryAddEdge(A, B); + } + } + + // Add DAG dependencies such that SUnits in this group shall be ordered + // before SUnits in OtherGroup. + void link(SchedGroup &OtherGroup) { + for (auto B : OtherGroup.Collection) + link(*B); + } + + // Returns true if no more instructions may be added to this group. 
+ bool isFull() { return MaxSize && Collection.size() >= *MaxSize; } + + // Returns true if SU can be added to this SchedGroup. + bool canAddSU(SUnit &SU, const SIInstrInfo *TII) { + if (isFull()) + return false; + + MachineInstr &MI = *SU.getInstr(); + if (MI.getOpcode() != TargetOpcode::BUNDLE) + return canAddMI(MI, TII); + + // Special case for bundled MIs. + const MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; + while (E != MBB->end() && E->isBundledWithPred()) + ++E; + + // Return true if all of the bundled MIs can be added to this group. + return std::all_of( + B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); }); + } + + void add(SUnit &SU) { Collection.push_back(&SU); } + + SchedGroup(CanAddMIFn canAddMI, Optional<unsigned> MaxSize, + ScheduleDAGInstrs *DAG) + : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {} +}; + +bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isMFMA(MI); +} + +bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isVALU(MI) && !TII->isMFMA(MI); +} + +bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isSALU(MI); +} + +bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)); +} + +bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayLoad() && + (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))); +} + +bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayStore() && + (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))); +} + +bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayStore() && TII->isDS(MI); +} + +bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayLoad() && TII->isDS(MI); +} + +class IGroupLPDAGMutation : public ScheduleDAGMutation { +public: + const SIInstrInfo *TII; + ScheduleDAGMI *DAG; + + IGroupLPDAGMutation() = default; + void apply(ScheduleDAGInstrs *DAGInstrs) override; +}; + +// DAG mutation that coordinates with the SCHED_BARRIER instruction and +// corresponding builtin. The mutation adds edges from specific instruction +// classes determined by the SCHED_BARRIER mask so that they cannot be +// scheduled around the SCHED_BARRIER. +class SchedBarrierDAGMutation : public ScheduleDAGMutation { +private: + const SIInstrInfo *TII; + + ScheduleDAGMI *DAG; + + // Components of the mask that determines which instructions may not be + // scheduled across the SCHED_BARRIER. + enum class SchedBarrierMasks { + NONE = 0u, + ALU = 1u << 0, + VALU = 1u << 1, + SALU = 1u << 2, + MFMA = 1u << 3, + VMEM = 1u << 4, + VMEM_READ = 1u << 5, + VMEM_WRITE = 1u << 6, + DS = 1u << 7, + DS_READ = 1u << 8, + DS_WRITE = 1u << 9, + LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE) + }; + + // Cache SchedGroups of each type if we have multiple SCHED_BARRIERs in a + // region. 
+ // + std::unique_ptr<SchedGroup> MFMASchedGroup = nullptr; + std::unique_ptr<SchedGroup> VALUSchedGroup = nullptr; + std::unique_ptr<SchedGroup> SALUSchedGroup = nullptr; + std::unique_ptr<SchedGroup> VMEMReadSchedGroup = nullptr; + std::unique_ptr<SchedGroup> VMEMWriteSchedGroup = nullptr; + std::unique_ptr<SchedGroup> DSWriteSchedGroup = nullptr; + std::unique_ptr<SchedGroup> DSReadSchedGroup = nullptr; + + // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should + // not be reordered across the SCHED_BARRIER. + void getSchedGroupsFromMask(int32_t Mask, + SmallVectorImpl<SchedGroup *> &SchedGroups); + + // Add DAG edges that enforce SCHED_BARRIER ordering. + void addSchedBarrierEdges(SUnit &SU); + + // Classify instructions and add them to the SchedGroup. + void initSchedGroup(SchedGroup *SG); + + // Remove all existing edges from a SCHED_BARRIER. + void resetSchedBarrierEdges(SUnit &SU); + +public: + void apply(ScheduleDAGInstrs *DAGInstrs) override; + + SchedBarrierDAGMutation() = default; +}; + +void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { + const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); + const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); + if (!TSchedModel || DAG->SUnits.empty()) + return; + + LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); + + // The order of InstructionGroups in this vector defines the + // order in which edges will be added. In other words, given the + // present ordering, we will try to make each VMEMRead instruction + // a predecessor of each DSRead instruction, and so on. + SmallVector<SchedGroup, 4> PipelineOrderGroups = { + SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG), + SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG), + SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG), + SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)}; + + for (SUnit &SU : DAG->SUnits) { + LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU)); + for (auto &SG : PipelineOrderGroups) + if (SG.canAddSU(SU, TII)) + SG.add(SU); + } + + for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) { + auto &GroupA = PipelineOrderGroups[i]; + for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) { + auto &GroupB = PipelineOrderGroups[j]; + GroupA.link(GroupB); + } + } +} + +void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { + const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); + if (!TSchedModel || DAGInstrs->SUnits.empty()) + return; + + LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n"); + + const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); + for (auto &SU : DAG->SUnits) + if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER) + addSchedBarrierEdges(SU); +} + +void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { + MachineInstr &MI = *SchedBarrier.getInstr(); + assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); + // Remove all existing edges from the SCHED_BARRIER that were added due to the + // instruction having side effects.
+ resetSchedBarrierEdges(SchedBarrier); + SmallVector<SchedGroup *, 4> SchedGroups; + int32_t Mask = MI.getOperand(0).getImm(); + getSchedGroupsFromMask(Mask, SchedGroups); + for (auto SG : SchedGroups) + SG->link( + SchedBarrier, (function_ref<bool(const SUnit *A, const SUnit *B)>)[]( + const SUnit *A, const SUnit *B) { + return A->NodeNum > B->NodeNum; + }); +} + +void SchedBarrierDAGMutation::getSchedGroupsFromMask( + int32_t Mask, SmallVectorImpl<SchedGroup *> &SchedGroups) { + SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask; + // See IntrinsicsAMDGPU.td for an explanation of these masks and their + // mappings. + // + if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if (!VALUSchedGroup) { + VALUSchedGroup = std::make_unique<SchedGroup>(isVALUSGMember, None, DAG); + initSchedGroup(VALUSchedGroup.get()); + } + + SchedGroups.push_back(VALUSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if (!SALUSchedGroup) { + SALUSchedGroup = std::make_unique<SchedGroup>(isSALUSGMember, None, DAG); + initSchedGroup(SALUSchedGroup.get()); + } + + SchedGroups.push_back(SALUSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if (!MFMASchedGroup) { + MFMASchedGroup = std::make_unique<SchedGroup>(isMFMASGMember, None, DAG); + initSchedGroup(MFMASchedGroup.get()); + } + + SchedGroups.push_back(MFMASchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) { + if (!VMEMReadSchedGroup) { + VMEMReadSchedGroup = + std::make_unique<SchedGroup>(isVMEMReadSGMember, None, DAG); + initSchedGroup(VMEMReadSchedGroup.get()); + } + + SchedGroups.push_back(VMEMReadSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) { + if (!VMEMWriteSchedGroup) { + VMEMWriteSchedGroup = + std::make_unique<SchedGroup>(isVMEMWriteSGMember, None, DAG); + initSchedGroup(VMEMWriteSchedGroup.get()); + } + + SchedGroups.push_back(VMEMWriteSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) { + if (!DSReadSchedGroup) { + DSReadSchedGroup = + std::make_unique<SchedGroup>(isDSReadSGMember, None, DAG); + initSchedGroup(DSReadSchedGroup.get()); + } + + SchedGroups.push_back(DSReadSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) { + if (!DSWriteSchedGroup) { + DSWriteSchedGroup = + std::make_unique<SchedGroup>(isDSWriteSGMember, None, DAG); + initSchedGroup(DSWriteSchedGroup.get()); + } + + SchedGroups.push_back(DSWriteSchedGroup.get()); + } +} + +void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) { + assert(SG); + for (auto &SU : DAG->SUnits) + if (SG->canAddSU(SU, TII)) + SG->add(SU); +} + +void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) { + assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER); + for (auto &P : SU.Preds) + SU.removePred(P); + + for (auto &S : SU.Succs) { + for (auto &SP : S.getSUnit()->Preds) { + if (SP.getSUnit() == &SU) { + 
S.getSUnit()->removePred(SP); + } + } + } +} + +} // namespace + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() { + return EnableIGroupLP ? std::make_unique<IGroupLPDAGMutation>() : nullptr; +} + +std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation() { + return std::make_unique<SchedBarrierDAGMutation>(); +} + +} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h new file mode 100644 index 000000000000..aeb1bbad3705 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -0,0 +1,22 @@ +//===- AMDGPUMFMAIGroupLP.h - AMDGPU MFMA IGroupLP --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H + +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include <memory> + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(); +std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 8236e6672247..b00df27f5fd3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -13,7 +13,9 @@ #include "AMDGPUISelDAGToDAG.h" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/R600MCTargetDesc.h" #include "R600RegisterInfo.h" #include "SIMachineFunctionInfo.h" @@ -679,9 +681,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::FMA: SelectFMAD_FMA(N); return; - case AMDGPUISD::ATOMIC_CMP_SWAP: - SelectATOMIC_CMP_SWAP(N); - return; case AMDGPUISD::CVT_PKRTZ_F16_F32: case AMDGPUISD::CVT_PKNORM_I16_F32: case AMDGPUISD::CVT_PKNORM_U16_F32: @@ -1008,7 +1007,12 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; - unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; + unsigned Opc; + if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11) + Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 + : AMDGPU::V_MAD_U64_U32_gfx11_e64; + else + Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), @@ -1021,7 +1025,12 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == ISD::SMUL_LOHI; - unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; + unsigned Opc; + if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11) + Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 + : AMDGPU::V_MAD_U64_U32_gfx11_e64; + else + Opc = Signed ? 
AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64); SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); @@ -1798,6 +1807,82 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, return true; } +// Check whether the flat scratch SVS swizzle bug affects this access. +bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug( + SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const { + if (!Subtarget->hasFlatScratchSVSSwizzleBug()) + return false; + + // The bug affects the swizzling of SVS accesses if there is any carry out + // from the two low order bits (i.e. from bit 1 into bit 2) when adding + // voffset to (soffset + inst_offset). + KnownBits VKnown = CurDAG->computeKnownBits(VAddr); + KnownBits SKnown = KnownBits::computeForAddSub( + true, false, CurDAG->computeKnownBits(SAddr), + KnownBits::makeConstant(APInt(32, ImmOffset))); + uint64_t VMax = VKnown.getMaxValue().getZExtValue(); + uint64_t SMax = SKnown.getMaxValue().getZExtValue(); + return (VMax & 3) + (SMax & 3) >= 4; +} + +bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, + SDValue &VAddr, SDValue &SAddr, + SDValue &Offset) const { + int64_t ImmOffset = 0; + + SDValue LHS, RHS; + if (isBaseWithConstantOffset64(Addr, LHS, RHS)) { + int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue(); + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + + if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) { + Addr = LHS; + ImmOffset = COffsetVal; + } else if (!LHS->isDivergent() && COffsetVal > 0) { + SDLoc SL(N); + // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) + + // (large_offset & MaxOffset); + int64_t SplitImmOffset, RemainderOffset; + std::tie(SplitImmOffset, RemainderOffset) + = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true); + + if (isUInt<32>(RemainderOffset)) { + SDNode *VMov = CurDAG->getMachineNode( + AMDGPU::V_MOV_B32_e32, SL, MVT::i32, + CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); + VAddr = SDValue(VMov, 0); + SAddr = LHS; + if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) + return false; + Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); + return true; + } + } + } + + if (Addr.getOpcode() != ISD::ADD) + return false; + + LHS = Addr.getOperand(0); + RHS = Addr.getOperand(1); + + if (!LHS->isDivergent() && RHS->isDivergent()) { + SAddr = LHS; + VAddr = RHS; + } else if (!RHS->isDivergent() && LHS->isDivergent()) { + SAddr = RHS; + VAddr = LHS; + } else { + return false; + } + + if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) + return false; + SAddr = SelectSAddrFI(CurDAG, SAddr); + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + return true; +} + bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const { ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); @@ -2224,70 +2309,6 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) { } } -// This is here because there isn't a way to use the generated sub0_sub1 as the -// subreg index to EXTRACT_SUBREG in tablegen. 
-void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { - MemSDNode *Mem = cast<MemSDNode>(N); - unsigned AS = Mem->getAddressSpace(); - if (AS == AMDGPUAS::FLAT_ADDRESS) { - SelectCode(N); - return; - } - - MVT VT = N->getSimpleValueType(0); - bool Is32 = (VT == MVT::i32); - SDLoc SL(N); - - MachineSDNode *CmpSwap = nullptr; - if (Subtarget->hasAddr64()) { - SDValue SRsrc, VAddr, SOffset, Offset; - - if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset)) { - unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN : - AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN; - SDValue CmpVal = Mem->getOperand(2); - SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32); - - // XXX - Do we care about glue operands? - - SDValue Ops[] = {CmpVal, VAddr, SRsrc, SOffset, Offset, CPol, - Mem->getChain()}; - - CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); - } - } - - if (!CmpSwap) { - SDValue SRsrc, SOffset, Offset; - if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset)) { - unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN : - AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN; - - SDValue CmpVal = Mem->getOperand(2); - SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32); - SDValue Ops[] = {CmpVal, SRsrc, SOffset, Offset, CPol, Mem->getChain()}; - - CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); - } - } - - if (!CmpSwap) { - SelectCode(N); - return; - } - - MachineMemOperand *MMO = Mem->getMemOperand(); - CurDAG->setNodeMemRefs(CmpSwap, {MMO}); - - unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1; - SDValue Extract - = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0)); - - ReplaceUses(SDValue(N, 0), Extract); - ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1)); - CurDAG->RemoveDeadNode(N); -} - void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) { // The address is assumed to be uniform, so if it ends up in a VGPR, it will // be copied to an SGPR with readfirstlane. 
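
The swizzle-bug test in checkFlatScratchSVSSwizzleBug above reduces to a small piece of interval arithmetic: a carry from bit 1 into bit 2 of voffset + (soffset + inst_offset) is possible whenever the low two bits of the two conservative maxima can sum past 3. A minimal standalone illustration of that test (plain C++ with placeholder values, not the LLVM KnownBits API):

    #include <cassert>
    #include <cstdint>

    // Conservative "may carry into bit 2" test: VMax and SMax stand in for the
    // KnownBits maximum values of vaddr and of (saddr + immediate offset).
    static bool mayCarryIntoBit2(uint64_t VMax, uint64_t SMax) {
      return (VMax & 3) + (SMax & 3) >= 4;
    }

    int main() {
      assert(!mayCarryIntoBit2(0x1, 0x2)); // 1 + 2 == 3: no carry possible
      assert(mayCarryIntoBit2(0x3, 0x3));  // 3 + 3 == 6: carry possible
      assert(mayCarryIntoBit2(0x7, 0x1));  // low bits 3 + 1 == 4: carry possible
    }
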
@@ -2587,6 +2608,30 @@ bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const { return true; } +bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src, + SDValue &SrcMods, + bool OpSel) const { + unsigned Mods; + if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) { + if (OpSel) + Mods |= SISrcMods::OP_SEL_0; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false); +} + +bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true); +} + bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const { @@ -2619,7 +2664,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, } bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, - SDValue &SrcMods) const { + SDValue &SrcMods, bool IsDOT) const { unsigned Mods = 0; Src = In; @@ -2628,7 +2673,8 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, Src = Src.getOperand(0); } - if (Src.getOpcode() == ISD::BUILD_VECTOR) { + if (Src.getOpcode() == ISD::BUILD_VECTOR && + (!IsDOT || !Subtarget->hasDOTOpSelHazard())) { unsigned VecMods = Mods; SDValue Lo = stripBitcast(Src.getOperand(0)); @@ -2716,6 +2762,40 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, return true; } +bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + return SelectVOP3PMods(In, Src, SrcMods, true); +} + +bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const { + const ConstantSDNode *C = cast<ConstantSDNode>(In); + // Literal i1 value set in intrinsic, represents SrcMods for the next operand. + // 1 promotes packed values to signed, 0 treats them as unsigned. + assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); + + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned SrcSign = C->getAPIntValue().getZExtValue(); + if (SrcSign == 1) + Mods ^= SISrcMods::NEG; + + Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, + SDValue &Src) const { + const ConstantSDNode *C = cast<ConstantSDNode>(In); + assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); + + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned SrcVal = C->getAPIntValue().getZExtValue(); + if (SrcVal == 1) + Mods |= SISrcMods::OP_SEL_0; + + Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; @@ -2840,7 +2920,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { } } } - // If "AllUsesAcceptSReg == false" so far we haven't suceeded + // If "AllUsesAcceptSReg == false" so far we haven't succeeded // commuting current user. This means have at least one use // that strictly require VGPR. Thus, we will not attempt to commute // other user instructions. 
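
SelectDotIUVOP3PMods and SelectWMMAOpSelVOP3PMods above both fold a literal i1 intrinsic operand into a VOP3P source-modifier immediate. A self-contained sketch of that mapping (the SISrcMods bit positions below mirror SIDefines.h but are an assumption of this illustration, not part of the patch):

    #include <cassert>

    // Assumed SISrcMods bit positions: NEG = 1<<0, OP_SEL_0 = 1<<2,
    // OP_SEL_1 = 1<<3.
    enum SrcMods : unsigned { NEG = 1u << 0, OP_SEL_0 = 1u << 2, OP_SEL_1 = 1u << 3 };

    // Dot-IU intrinsics: the i1 literal selects signed (1) vs unsigned (0)
    // packed sources; signedness is encoded by toggling NEG on top of the
    // default OP_SEL_1.
    static unsigned dotIUMods(bool Signed) {
      unsigned Mods = OP_SEL_1;
      if (Signed)
        Mods ^= NEG;
      return Mods;
    }

    // WMMA opsel operands: the i1 literal instead sets OP_SEL_0.
    static unsigned wmmaOpSelMods(bool OpSel) {
      unsigned Mods = OP_SEL_1;
      if (OpSel)
        Mods |= OP_SEL_0;
      return Mods;
    }

    int main() {
      assert(dotIUMods(false) == OP_SEL_1);                 // 0b1000
      assert(dotIUMods(true) == (OP_SEL_1 | NEG));          // 0b1001
      assert(wmmaOpSelMods(true) == (OP_SEL_1 | OP_SEL_0)); // 0b1100
    }
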
@@ -2854,26 +2934,15 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const { auto Ld = cast<LoadSDNode>(N); - return Ld->getAlignment() >= 4 && - ( - ( - ( - Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || - Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT - ) - && - !N->isDivergent() - ) - || - ( - Subtarget->getScalarizeGlobalBehavior() && - Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && - Ld->isSimple() && - !N->isDivergent() && - static_cast<const SITargetLowering *>( - getTargetLowering())->isMemOpHasNoClobberedMemOperand(N) - ) - ); + return Ld->getAlign() >= Align(4) && + (((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && + !N->isDivergent()) || + (Subtarget->getScalarizeGlobalBehavior() && + Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + Ld->isSimple() && !N->isDivergent() && + static_cast<const SITargetLowering *>(getTargetLowering()) + ->isMemOpHasNoClobberedMemOperand(N))); } void AMDGPUDAGToDAGISel::PostprocessISelDAG() { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index d638d9877a9b..862be9dc5568 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -188,6 +188,10 @@ private: SDValue &VOffset, SDValue &Offset) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &Offset) const; + bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, + uint64_t ImmOffset) const; + bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, + SDValue &SAddr, SDValue &Offset) const; bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; @@ -214,10 +218,20 @@ private: bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; + bool SelectVINTERPModsImpl(SDValue In, SDValue &Src, SDValue &SrcMods, + bool OpSel) const; + bool SelectVINTERPMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVINTERPModsHi(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3OMods(SDValue In, SDValue &Src, SDValue &Clamp, SDValue &Omod) const; - bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods, + bool IsDOT = false) const; + bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; + + bool SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const; + bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; @@ -245,7 +259,6 @@ private: bool isCBranchSCC(const SDNode *N) const; void SelectBRCOND(SDNode *N); void SelectFMAD_FMA(SDNode *N); - void SelectATOMIC_CMP_SWAP(SDNode *N); void SelectDSAppendConsume(SDNode *N, unsigned IntrID); void SelectDS_GWS(SDNode *N, unsigned IntrID); void SelectInterpP1F16(SDNode *N); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index b9d0655feef7..ef7929012597 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -19,6 +19,7 @@ #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include 
"llvm/Support/CommandLine.h" @@ -127,49 +128,27 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // There are no 64-bit extloads. These should be done as a 32-bit extload and // an extension to 64-bit. - for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); - } + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT, + Expand); for (MVT VT : MVT::integer_valuetypes()) { if (VT == MVT::i64) continue; - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); + for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) { + setLoadExtAction(Op, VT, MVT::i1, Promote); + setLoadExtAction(Op, VT, MVT::i8, Legal); + setLoadExtAction(Op, VT, MVT::i16, Legal); + setLoadExtAction(Op, VT, MVT::i32, Expand); + } } - for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); - } + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) + for (auto MemVT : + {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16}) + setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT, + Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); @@ -304,229 +283,125 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand); setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand); - setOperationAction(ISD::Constant, MVT::i32, Legal); - setOperationAction(ISD::Constant, MVT::i64, Legal); - setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal); + setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal); - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::BRIND, MVT::Other, Expand); + 
setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand); // This is totally unsupported, just custom lower to produce an error. setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); // Library functions. These default to Expand, but we have instructions // for them. - setOperationAction(ISD::FCEIL, MVT::f32, Legal); - setOperationAction(ISD::FEXP2, MVT::f32, Legal); - setOperationAction(ISD::FPOW, MVT::f32, Legal); - setOperationAction(ISD::FLOG2, MVT::f32, Legal); - setOperationAction(ISD::FABS, MVT::f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::f32, Legal); - setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - setOperationAction(ISD::FMINNUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FLOG2, ISD::FABS, + ISD::FFLOOR, ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM, + ISD::FMAXNUM}, + MVT::f32, Legal); - setOperationAction(ISD::FROUND, MVT::f32, Custom); - setOperationAction(ISD::FROUND, MVT::f64, Custom); + setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom); - setOperationAction(ISD::FLOG, MVT::f32, Custom); - setOperationAction(ISD::FLOG10, MVT::f32, Custom); - setOperationAction(ISD::FEXP, MVT::f32, Custom); + setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom); + setOperationAction(ISD::FNEARBYINT, {MVT::f32, MVT::f64}, Custom); - setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); - setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); - - setOperationAction(ISD::FREM, MVT::f16, Custom); - setOperationAction(ISD::FREM, MVT::f32, Custom); - setOperationAction(ISD::FREM, MVT::f64, Custom); + setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom); // Expand to fneg + fadd. 
setOperationAction(ISD::FSUB, MVT::f64, Expand); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f16, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f64, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, + {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32, + MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32, + MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32}, + Custom); + setOperationAction( + ISD::EXTRACT_SUBVECTOR, + {MVT::v2f16, MVT::v2i16, MVT::v4f16, MVT::v4i16, MVT::v2f32, + MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32, MVT::v4i32, + MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32, MVT::v7f32, + MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v16f16, MVT::v16i16, + MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, MVT::v2f64, + MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, MVT::v4i64, 
+ MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64}, + Custom); setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); - setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); - setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom); + setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { // These should use [SU]DIVREM, so set them to expand - setOperationAction(ISD::SDIV, VT, Expand); - setOperationAction(ISD::UDIV, VT, Expand); - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::UREM, VT, Expand); + setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT, + Expand); // GPU does not have divrem function for signed or unsigned. - setOperationAction(ISD::SDIVREM, VT, Custom); - setOperationAction(ISD::UDIVREM, VT, Custom); + setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom); // GPU does not have [S|U]MUL_LOHI functions as a single instruction. - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand); - setOperationAction(ISD::BSWAP, VT, Expand); - setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); + setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand); // AMDGPU uses ADDC/SUBC/ADDE/SUBE - setOperationAction(ISD::ADDC, VT, Legal); - setOperationAction(ISD::SUBC, VT, Legal); - setOperationAction(ISD::ADDE, VT, Legal); - setOperationAction(ISD::SUBE, VT, Legal); + setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal); } // The hardware supports 32-bit FSHR, but not FSHL. setOperationAction(ISD::FSHR, MVT::i32, Legal); // The hardware supports 32-bit ROTR, but not ROTL. 
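Since this target provides only ROTR, the ROTL actions that follow are expanded. The expansion rests on the complementary-rotate identity; a minimal standalone sketch for 32-bit values (this illustrates the algebra, not the legalizer's literal output):

#include <cassert>
#include <cstdint>

// rotr is the native operation on this hardware; rotl is derived from it.
static uint32_t rotr32(uint32_t x, unsigned n) {
  n &= 31;
  return (x >> n) | (x << ((32 - n) & 31));
}

// rotl(x, n) == rotr(x, (32 - n) mod 32), which is all an expansion of
// ROTL needs on a target that only provides ROTR.
static uint32_t rotl32(uint32_t x, unsigned n) {
  return rotr32(x, (32 - (n & 31)) & 31);
}

int main() {
  assert(rotl32(0x80000001u, 1) == 0x00000003u);
  assert(rotl32(0x12345678u, 0) == 0x12345678u);
  return 0;
}

For i64, both rotates are expanded, since even ROTR is only native at 32 bits.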
- setOperationAction(ISD::ROTL, MVT::i32, Expand); - setOperationAction(ISD::ROTL, MVT::i64, Expand); + setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand); setOperationAction(ISD::ROTR, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i16, Expand); - setOperationAction(ISD::MULHS, MVT::i16, Expand); + setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand); - setOperationAction(ISD::MUL, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i64, Expand); - setOperationAction(ISD::MULHS, MVT::i64, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand); + setOperationAction( + {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, + MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - setOperationAction(ISD::SMIN, MVT::i32, Legal); - setOperationAction(ISD::UMIN, MVT::i32, Legal); - setOperationAction(ISD::SMAX, MVT::i32, Legal); - setOperationAction(ISD::UMAX, MVT::i32, Legal); + setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32, + Legal); - setOperationAction(ISD::CTTZ, MVT::i64, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom); - setOperationAction(ISD::CTLZ, MVT::i64, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + setOperationAction( + {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, + MVT::i64, Custom); static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32}; for (MVT VT : VectorIntTypes) { // Expand the following operations for the current type by default. 
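A note on the 64-bit CTTZ/CTLZ family marked Custom a few lines up: on a 32-bit ALU a 64-bit count can be composed from two 32-bit counts. A standalone sketch of that split, using C++20 <bit>; it shows the general shape such a lowering takes, not the exact AMDGPU instruction sequence:

#include <bit>
#include <cassert>
#include <cstdint>

// ctlz of a 64-bit value from two 32-bit counts: if the high half is
// nonzero the answer comes from it alone, otherwise it is 32 plus the
// count of the low half. cttz is symmetric with the halves swapped.
static unsigned ctlz64(uint64_t x) {
  uint32_t Hi = uint32_t(x >> 32);
  uint32_t Lo = uint32_t(x);
  return Hi ? std::countl_zero(Hi) : 32 + std::countl_zero(Lo);
}

static unsigned cttz64(uint64_t x) {
  uint32_t Hi = uint32_t(x >> 32);
  uint32_t Lo = uint32_t(x);
  return Lo ? std::countr_zero(Lo) : 32 + std::countr_zero(Hi);
}

int main() {
  assert(ctlz64(1) == 63 && cttz64(1) == 0);
  assert(ctlz64(uint64_t(1) << 40) == 23 && cttz64(uint64_t(1) << 40) == 40);
  return 0;
}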
- setOperationAction(ISD::ADD, VT, Expand); - setOperationAction(ISD::AND, VT, Expand); - setOperationAction(ISD::FP_TO_SINT, VT, Expand); - setOperationAction(ISD::FP_TO_UINT, VT, Expand); - setOperationAction(ISD::MUL, VT, Expand); - setOperationAction(ISD::MULHU, VT, Expand); - setOperationAction(ISD::MULHS, VT, Expand); - setOperationAction(ISD::OR, VT, Expand); - setOperationAction(ISD::SHL, VT, Expand); - setOperationAction(ISD::SRA, VT, Expand); - setOperationAction(ISD::SRL, VT, Expand); - setOperationAction(ISD::ROTL, VT, Expand); - setOperationAction(ISD::ROTR, VT, Expand); - setOperationAction(ISD::SUB, VT, Expand); - setOperationAction(ISD::SINT_TO_FP, VT, Expand); - setOperationAction(ISD::UINT_TO_FP, VT, Expand); - setOperationAction(ISD::SDIV, VT, Expand); - setOperationAction(ISD::UDIV, VT, Expand); - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::UREM, VT, Expand); - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Expand); - setOperationAction(ISD::UDIVREM, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::XOR, VT, Expand); - setOperationAction(ISD::BSWAP, VT, Expand); - setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); - setOperationAction(ISD::SETCC, VT, Expand); + setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT, + ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU, + ISD::MULHS, ISD::OR, ISD::SHL, + ISD::SRA, ISD::SRL, ISD::ROTL, + ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP, + ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV, + ISD::SREM, ISD::UREM, ISD::SMUL_LOHI, + ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM, + ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC, + ISD::XOR, ISD::BSWAP, ISD::CTPOP, + ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE, + ISD::SETCC}, + VT, Expand); } static const MVT::SimpleValueType FloatVectorTypes[] = { MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32}; for (MVT VT : FloatVectorTypes) { - setOperationAction(ISD::FABS, VT, Expand); - setOperationAction(ISD::FMINNUM, VT, Expand); - setOperationAction(ISD::FMAXNUM, VT, Expand); - setOperationAction(ISD::FADD, VT, Expand); - setOperationAction(ISD::FCEIL, VT, Expand); - setOperationAction(ISD::FCOS, VT, Expand); - setOperationAction(ISD::FDIV, VT, Expand); - setOperationAction(ISD::FEXP2, VT, Expand); - setOperationAction(ISD::FEXP, VT, Expand); - setOperationAction(ISD::FLOG2, VT, Expand); - setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FLOG, VT, Expand); - setOperationAction(ISD::FLOG10, VT, Expand); - setOperationAction(ISD::FPOW, VT, Expand); - setOperationAction(ISD::FFLOOR, VT, Expand); - setOperationAction(ISD::FTRUNC, VT, Expand); - setOperationAction(ISD::FMUL, VT, Expand); - setOperationAction(ISD::FMA, VT, Expand); - setOperationAction(ISD::FRINT, VT, Expand); - setOperationAction(ISD::FNEARBYINT, VT, Expand); - setOperationAction(ISD::FSQRT, VT, Expand); - setOperationAction(ISD::FSIN, VT, Expand); - setOperationAction(ISD::FSUB, VT, Expand); - setOperationAction(ISD::FNEG, VT, Expand); - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::FCOPYSIGN, VT, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); - 
setOperationAction(ISD::SETCC, VT, Expand); - setOperationAction(ISD::FCANONICALIZE, VT, Expand); + setOperationAction( + {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD, + ISD::FCEIL, ISD::FCOS, ISD::FDIV, ISD::FEXP2, + ISD::FEXP, ISD::FLOG2, ISD::FREM, ISD::FLOG, + ISD::FLOG10, ISD::FPOW, ISD::FFLOOR, ISD::FTRUNC, + ISD::FMUL, ISD::FMA, ISD::FRINT, ISD::FNEARBYINT, + ISD::FSQRT, ISD::FSIN, ISD::FSUB, ISD::FNEG, + ISD::VSELECT, ISD::SELECT_CC, ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, + ISD::SETCC, ISD::FCANONICALIZE}, + VT, Expand); } // This causes using an unrolled select operation rather than expansion with @@ -590,26 +465,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, if (AMDGPUBypassSlowDiv) addBypassSlowDiv(64, 32); - setTargetDAGCombine(ISD::BITCAST); - setTargetDAGCombine(ISD::SHL); - setTargetDAGCombine(ISD::SRA); - setTargetDAGCombine(ISD::SRL); - setTargetDAGCombine(ISD::TRUNCATE); - setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::SMUL_LOHI); - setTargetDAGCombine(ISD::UMUL_LOHI); - setTargetDAGCombine(ISD::MULHU); - setTargetDAGCombine(ISD::MULHS); - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::STORE); - setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::FSUB); - setTargetDAGCombine(ISD::FNEG); - setTargetDAGCombine(ISD::FABS); - setTargetDAGCombine(ISD::AssertZext); - setTargetDAGCombine(ISD::AssertSext); - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine({ISD::BITCAST, ISD::SHL, + ISD::SRA, ISD::SRL, + ISD::TRUNCATE, ISD::MUL, + ISD::SMUL_LOHI, ISD::UMUL_LOHI, + ISD::MULHU, ISD::MULHS, + ISD::SELECT, ISD::SELECT_CC, + ISD::STORE, ISD::FADD, + ISD::FSUB, ISD::FNEG, + ISD::FABS, ISD::AssertZext, + ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN}); } bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const { @@ -785,11 +650,11 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, unsigned AS = MN->getAddressSpace(); // Do not shrink an aligned scalar load to sub-dword. // Scalar engine cannot do sub-dword loads. - if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 && + if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) && (AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || - (isa<LoadSDNode>(N) && - AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) && + (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS && + MN->isInvariant())) && AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand())) return false; @@ -855,6 +720,8 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const { AMDGPUAS::CONSTANT_ADDRESS_32BIT) return true; return false; + case AMDGPUISD::SETCC: // ballot-style instruction + return true; } return false; } @@ -1072,10 +939,9 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( const bool IsByRef = Arg.hasByRefAttr(); Type *BaseArgTy = Arg.getType(); Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy; - MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None; - if (!Alignment) - Alignment = DL.getABITypeAlign(MemArgTy); - MaxAlign = max(Alignment, MaxAlign); + Align Alignment = DL.getValueOrABITypeAlignment( + IsByRef ? 
Arg.getParamAlign() : None, MemArgTy); + MaxAlign = std::max(Alignment, MaxAlign); uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy); uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset; @@ -1415,6 +1281,11 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, (Start == 0 || Start == 4)) return Op; + if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) || + (SrcVT == MVT::v16i16 && VT == MVT::v8i16)) && + (Start == 0 || Start == 8)) + return Op; + DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, VT.getVectorNumElements()); @@ -1589,8 +1460,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG); unsigned Size = LoMemVT.getStoreSize(); - unsigned BaseAlign = Load->getAlignment(); - unsigned HiAlign = MinAlign(BaseAlign, Size); + Align BaseAlign = Load->getAlign(); + Align HiAlign = commonAlignment(BaseAlign, Size); SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue, LoMemVT, @@ -1628,13 +1499,13 @@ SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op, EVT MemVT = Load->getMemoryVT(); SDLoc SL(Op); const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); - unsigned BaseAlign = Load->getAlignment(); + Align BaseAlign = Load->getAlign(); unsigned NumElements = MemVT.getVectorNumElements(); // Widen from vec3 to vec4 when the load is at least 8-byte aligned // or 16-byte fully dereferenceable. Otherwise, split the vector load. if (NumElements != 3 || - (BaseAlign < 8 && + (BaseAlign < Align(8) && !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout()))) return SplitVectorLoad(Op, DAG); @@ -1681,9 +1552,9 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize()); const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); - unsigned BaseAlign = Store->getAlignment(); + Align BaseAlign = Store->getAlign(); unsigned Size = LoMemVT.getStoreSize(); - unsigned HiAlign = MinAlign(BaseAlign, Size); + Align HiAlign = commonAlignment(BaseAlign, Size); SDValue LoStore = DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign, @@ -3003,12 +2874,11 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, // the bytes again are not eliminated in the case of an unaligned copy. if (!allowsMisalignedMemoryAccesses( VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) { - SDValue Ops[2]; - if (VT.isVector()) - std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG); - else - std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); + return SplitVectorLoad(SDValue(LN, 0), DAG); + + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); return DAG.getMergeValues(Ops, SDLoc(N)); } @@ -3059,7 +2929,7 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, if (!allowsMisalignedMemoryAccesses( VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) { if (VT.isVector()) - return scalarizeVectorStore(SN, DAG); + return SplitVectorStore(SDValue(SN, 0), DAG); return expandUnalignedStore(SN, DAG); } @@ -3281,8 +3151,9 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, // this improves the ability to match BFE patterns in isel. 
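Concretely, the combine guarded here rewrites (srl (and x, mask), c) into (and (srl x, c), mask >> c) whenever mask is one contiguous run of ones beginning exactly at bit c, leaving a low-mask AND of a shift, which is the bitfield-extract (BFE) shape. The updated code uses the APInt::isShiftedMask overload that also reports the run's start and length, so the old countTrailingZeros recomputation disappears. A portable model of the predicate and the rewrite (helper names are mine):

#include <bit>
#include <cassert>
#include <cstdint>

// A low mask of Len ones; Len == 32 must avoid the UB of 1u << 32.
static uint32_t lowMask(unsigned Len) {
  return Len >= 32 ? ~0u : (1u << Len) - 1;
}

// A "shifted mask" is a single contiguous run of ones, e.g. 0x00ff0000.
// On success report the run's start index and length, mirroring the
// APInt::isShiftedMask(unsigned &, unsigned &) overload adopted above.
static bool isShiftedMask32(uint32_t M, unsigned &Idx, unsigned &Len) {
  if (M == 0)
    return false;
  Idx = std::countr_zero(M);
  Len = 32 - Idx - std::countl_zero(M);
  return (M >> Idx) == lowMask(Len);
}

// (X & M) >> C always equals (X >> C) & (M >> C); when M's run starts at
// exactly bit C, the right-hand mask is a low mask of Len bits, which is
// the bitfield-extract shape instruction selection looks for.
static uint32_t srlOfAnd(uint32_t X, uint32_t M, unsigned C) {
  unsigned Idx, Len;
  if (isShiftedMask32(M, Idx, Len) && Idx == C)
    return (X >> C) & lowMask(Len); // (and (srl X, C), lowmask(Len))
  return (X & M) >> C;              // not a BFE candidate; leave as is
}

int main() {
  assert(srlOfAnd(0x12345678u, 0x00ff0000u, 16) == 0x34u);
  assert(srlOfAnd(0xdeadbeefu, 0x0000ff00u, 8) == 0xbeu);
  return 0;
}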
if (LHS.getOpcode() == ISD::AND) { if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) { - if (Mask->getAPIntValue().isShiftedMask() && - Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) { + unsigned MaskIdx, MaskLen; + if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) && + MaskIdx == ShiftAmt) { return DAG.getNode( ISD::AND, SL, VT, DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)), @@ -4380,10 +4251,14 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) + ExplicitArgOffset; switch (Param) { - case GRID_DIM: + case FIRST_IMPLICIT: return ArgOffset; - case GRID_OFFSET: - return ArgOffset + 4; + case PRIVATE_BASE: + return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET; + case SHARED_BASE: + return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET; + case QUEUE_PTR: + return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET; } llvm_unreachable("unexpected implicit parameter type"); } @@ -4405,7 +4280,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(TC_RETURN) NODE_NAME_CASE(TRAP) NODE_NAME_CASE(RET_FLAG) - NODE_NAME_CASE(RET_GFX_FLAG) NODE_NAME_CASE(RETURN_TO_EPILOG) NODE_NAME_CASE(ENDPGM) NODE_NAME_CASE(DWORDADDR) @@ -4485,6 +4359,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) NODE_NAME_CASE(LDS) + NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD) + NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; NODE_NAME_CASE(LOAD_D16_HI) @@ -4580,6 +4456,19 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, return SDValue(); } +static unsigned workitemIntrinsicDim(unsigned ID) { + switch (ID) { + case Intrinsic::amdgcn_workitem_id_x: + return 0; + case Intrinsic::amdgcn_workitem_id_y: + return 1; + case Intrinsic::amdgcn_workitem_id_z: + return 2; + default: + llvm_unreachable("not a workitem intrinsic"); + } +} + void AMDGPUTargetLowering::computeKnownBitsForTargetNode( const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { @@ -4716,6 +4605,14 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2()); break; } + case Intrinsic::amdgcn_workitem_id_x: + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::amdgcn_workitem_id_z: { + unsigned MaxValue = Subtarget->getMaxWorkitemID( + DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID)); + Known.Zero.setHighBits(countLeadingZeros(MaxValue)); + break; + } default: break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index b41506157b68..73081483f1c3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -320,8 +320,9 @@ public: enum ImplicitParameter { FIRST_IMPLICIT, - GRID_DIM = FIRST_IMPLICIT, - GRID_OFFSET, + PRIVATE_BASE, + SHARED_BASE, + QUEUE_PTR, }; /// Helper function that returns the byte offset of the given @@ -367,9 +368,6 @@ enum NodeType : unsigned { // Return with values from a non-entry function. RET_FLAG, - // Return with values from a non-entry function (AMDGPU_Gfx CC). 
- RET_GFX_FLAG, - DWORDADDR, FRACT, @@ -483,6 +481,9 @@ enum NodeType : unsigned { CONST_DATA_PTR, PC_ADD_REL_OFFSET, LDS, + FPTRUNC_ROUND_UPWARD, + FPTRUNC_ROUND_DOWNWARD, + DUMMY_CHAIN, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, LOAD_D16_HI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp new file mode 100644 index 000000000000..c9cdbc89f3a4 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -0,0 +1,457 @@ +//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Insert s_delay_alu instructions to avoid stalls on GFX11+. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/SetVector.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-insert-delay-alu" + +namespace { + +class AMDGPUInsertDelayAlu : public MachineFunctionPass { +public: + static char ID; + + const SIInstrInfo *SII; + const TargetRegisterInfo *TRI; + + TargetSchedModel SchedModel; + + AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + // Return true if MI waits for all outstanding VALU instructions to complete. + static bool instructionWaitsForVALU(const MachineInstr &MI) { + // These instruction types wait for VA_VDST==0 before issuing. + const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP | + SIInstrFlags::FLAT | SIInstrFlags::MIMG | + SIInstrFlags::MTBUF | SIInstrFlags::MUBUF; + if (MI.getDesc().TSFlags & VA_VDST_0) + return true; + if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 || + MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64) + return true; + if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + (MI.getOperand(0).getImm() & 0xf000) == 0) + return true; + return false; + } + + // Types of delay that can be encoded in an s_delay_alu instruction. + enum DelayType { VALU, TRANS, SALU, OTHER }; + + // Get the delay type for an instruction with the specified TSFlags. + static DelayType getDelayType(uint64_t TSFlags) { + if (TSFlags & SIInstrFlags::TRANS) + return TRANS; + if (TSFlags & SIInstrFlags::VALU) + return VALU; + if (TSFlags & SIInstrFlags::SALU) + return SALU; + return OTHER; + } + + // Information about the last instruction(s) that wrote to a particular + // regunit. In straight-line code there will only be one such instruction, but + // when control flow converges we merge the delay information from each path + // to represent the union of the worst-case delays of each type. + struct DelayInfo { + // One larger than the maximum number of (non-TRANS) VALU instructions we + // can encode in an s_delay_alu instruction. + static const unsigned VALU_MAX = 5; + + // One larger than the maximum number of TRANS instructions we can encode in + // an s_delay_alu instruction. 
+ static const unsigned TRANS_MAX = 4; + + // If it was written by a (non-TRANS) VALU, remember how many clock cycles + // are left until it completes, and how many other (non-TRANS) VALU we have + // seen since it was issued. + uint8_t VALUCycles = 0; + uint8_t VALUNum = VALU_MAX; + + // If it was written by a TRANS, remember how many clock cycles are left + // until it completes, and how many other TRANS we have seen since it was + // issued. + uint8_t TRANSCycles = 0; + uint8_t TRANSNum = TRANS_MAX; + // Also remember how many other (non-TRANS) VALU we have seen since it was + // issued. When an instruction depends on both a prior TRANS and a prior + // non-TRANS VALU, this is used to decide whether to encode a wait for just + // one or both of them. + uint8_t TRANSNumVALU = VALU_MAX; + + // If it was written by an SALU, remember how many clock cycles are left + // until it completes. + uint8_t SALUCycles = 0; + + DelayInfo() = default; + + DelayInfo(DelayType Type, unsigned Cycles) { + switch (Type) { + default: + llvm_unreachable("unexpected type"); + case VALU: + VALUCycles = Cycles; + VALUNum = 0; + break; + case TRANS: + TRANSCycles = Cycles; + TRANSNum = 0; + TRANSNumVALU = 0; + break; + case SALU: + SALUCycles = Cycles; + break; + } + } + + bool operator==(const DelayInfo &RHS) const { + return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum && + TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum && + TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles; + } + + bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); } + + // Merge another DelayInfo into this one, to represent the union of the + // worst-case delays of each type. + void merge(const DelayInfo &RHS) { + VALUCycles = std::max(VALUCycles, RHS.VALUCycles); + VALUNum = std::min(VALUNum, RHS.VALUNum); + TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles); + TRANSNum = std::min(TRANSNum, RHS.TRANSNum); + TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU); + SALUCycles = std::max(SALUCycles, RHS.SALUCycles); + } + + // Update this DelayInfo after issuing an instruction. IsVALU should be 1 + // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing + // a TRANS, else 0. Cycles is the number of cycles it takes to issue the + // instruction. Return true if there is no longer any useful delay info. + bool advance(DelayType Type, unsigned Cycles) { + bool Erase = true; + + VALUNum += (Type == VALU); + if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) { + // Forget about the VALU instruction. It was too far back or has + // definitely completed by now. + VALUNum = VALU_MAX; + VALUCycles = 0; + } else { + VALUCycles -= Cycles; + Erase = false; + } + + TRANSNum += (Type == TRANS); + TRANSNumVALU += (Type == VALU); + if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) { + // Forget about any TRANS instruction. It was too far back or has + // definitely completed by now. + TRANSNum = TRANS_MAX; + TRANSNumVALU = VALU_MAX; + TRANSCycles = 0; + } else { + TRANSCycles -= Cycles; + Erase = false; + } + + if (SALUCycles <= Cycles) { + // Forget about any SALU instruction. It has definitely completed by + // now. 
+ SALUCycles = 0; + } else { + SALUCycles -= Cycles; + Erase = false; + } + + return Erase; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() const { + if (VALUCycles) + dbgs() << " VALUCycles=" << (int)VALUCycles; + if (VALUNum < VALU_MAX) + dbgs() << " VALUNum=" << (int)VALUNum; + if (TRANSCycles) + dbgs() << " TRANSCycles=" << (int)TRANSCycles; + if (TRANSNum < TRANS_MAX) + dbgs() << " TRANSNum=" << (int)TRANSNum; + if (TRANSNumVALU < VALU_MAX) + dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU; + if (SALUCycles) + dbgs() << " SALUCycles=" << (int)SALUCycles; + } +#endif + }; + + // A map from regunits to the delay info for that regunit. + struct DelayState : DenseMap<unsigned, DelayInfo> { + // Merge another DelayState into this one by merging the delay info for each + // regunit. + void merge(const DelayState &RHS) { + for (const auto &KV : RHS) { + iterator It; + bool Inserted; + std::tie(It, Inserted) = insert(KV); + if (!Inserted) + It->second.merge(KV.second); + } + } + + // Advance the delay info for each regunit, erasing any that are no longer + // useful. + void advance(DelayType Type, unsigned Cycles) { + iterator Next; + for (auto I = begin(), E = end(); I != E; I = Next) { + Next = std::next(I); + if (I->second.advance(Type, Cycles)) + erase(I); + } + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump(const TargetRegisterInfo *TRI) const { + if (empty()) { + dbgs() << " empty\n"; + return; + } + + // Dump DelayInfo for each RegUnit in numerical order. + SmallVector<const_iterator, 8> Order; + Order.reserve(size()); + for (const_iterator I = begin(), E = end(); I != E; ++I) + Order.push_back(I); + llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) { + return A->first < B->first; + }); + for (const_iterator I : Order) { + dbgs() << " " << printRegUnit(I->first, TRI); + I->second.dump(); + dbgs() << "\n"; + } + } +#endif + }; + + // The saved delay state at the end of each basic block. + DenseMap<MachineBasicBlock *, DelayState> BlockState; + + // Emit an s_delay_alu instruction if necessary before MI. + MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay, + MachineInstr *LastDelayAlu) { + unsigned Imm = 0; + + // Wait for a TRANS instruction. + if (Delay.TRANSNum < DelayInfo::TRANS_MAX) + Imm |= 4 + Delay.TRANSNum; + + // Wait for a VALU instruction (if it's more recent than any TRANS + // instruction that we're also waiting for). + if (Delay.VALUNum < DelayInfo::VALU_MAX && + Delay.VALUNum <= Delay.TRANSNumVALU) { + if (Imm & 0xf) + Imm |= Delay.VALUNum << 7; + else + Imm |= Delay.VALUNum; + } + + // Wait for an SALU instruction. + if (Delay.SALUCycles) { + if (Imm & 0x780) { + // We have already encoded a VALU and a TRANS delay. There's no room in + // the encoding for an SALU delay as well, so just drop it. + } else if (Imm & 0xf) { + Imm |= (Delay.SALUCycles + 8) << 7; + } else { + Imm |= Delay.SALUCycles + 8; + } + } + + // Don't emit the s_delay_alu instruction if there's nothing to wait for. + if (!Imm) + return LastDelayAlu; + + // If we only need to wait for one instruction, try encoding it in the last + // s_delay_alu that we emitted. 
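For reference while reading emitDelayAlu: the immediate packs up to two dependencies plus a skip count, as implied by the masks used in this function (0xf for the first slot, 0x780, i.e. 0xf << 7, for the second, and the skip count shifted by 4). A sketch of that layout; the field names are mine, and the authoritative encoding and instid value assignments are in the GFX11 ISA documentation:

#include <cassert>

// Plausible layout of the s_delay_alu immediate built here:
//   bits [3:0]  instid0  - first dependency (1..4 = Nth-previous VALU,
//                          4+n = nth-previous TRANS, 8+c = c SALU cycles)
//   bits [6:4]  instskip - instructions between this wait and a second
//                          one (hence the Skip < 6 limit in the reuse path)
//   bits [10:7] instid1  - optional second dependency
static unsigned encodeDelayImm(unsigned InstId0, unsigned Skip,
                               unsigned InstId1) {
  assert(InstId0 < 16 && Skip < 8 && InstId1 < 16);
  return InstId0 | (Skip << 4) | (InstId1 << 7);
}

int main() {
  // Wait on the 2nd-previous VALU here and, one instruction later, on the
  // previous TRANS (encoded as 4 + 1).
  assert(encodeDelayImm(2, 1, 4 + 1) == (2 | (1 << 4) | (5 << 7)));
  return 0;
}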
+ if (!(Imm & 0x780) && LastDelayAlu) { + unsigned Skip = 0; + for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu), + E = MachineBasicBlock::instr_iterator(MI); + ++I != E;) { + if (!I->isBundle() && !I->isMetaInstruction()) + ++Skip; + } + if (Skip < 6) { + MachineOperand &Op = LastDelayAlu->getOperand(0); + unsigned LastImm = Op.getImm(); + assert((LastImm & ~0xf) == 0 && + "Remembered an s_delay_alu with no room for another delay!"); + LastImm |= Imm << 7 | Skip << 4; + Op.setImm(LastImm); + return nullptr; + } + } + + auto &MBB = *MI.getParent(); + MachineInstr *DelayAlu = + BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm); + // Remember the s_delay_alu for next time if there is still room in it to + // encode another delay. + return (Imm & 0x780) ? nullptr : DelayAlu; + } + + bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) { + DelayState State; + for (auto *Pred : MBB.predecessors()) + State.merge(BlockState[Pred]); + + LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB) + << "\n"; + State.dump(TRI);); + + bool Changed = false; + MachineInstr *LastDelayAlu = nullptr; + + // Iterate over the contents of bundles, but don't emit any instructions + // inside a bundle. + for (auto &MI : MBB.instrs()) { + if (MI.isBundle() || MI.isMetaInstruction()) + continue; + + // Ignore some more instructions that do not generate any code. + switch (MI.getOpcode()) { + case AMDGPU::SI_RETURN_TO_EPILOG: + continue; + } + + DelayType Type = getDelayType(MI.getDesc().TSFlags); + + if (instructionWaitsForVALU(MI)) { + // Forget about all outstanding VALU delays. + State = DelayState(); + } else if (Type != OTHER) { + DelayInfo Delay; + // TODO: Scan implicit uses too? + for (const auto &Op : MI.explicit_uses()) { + if (Op.isReg()) { + // One of the operands of the writelane is also the output operand. + // This creates the insertion of redundant delays. Hence, we have to + // ignore this operand. + if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied()) + continue; + for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) { + auto It = State.find(*UI); + if (It != State.end()) { + Delay.merge(It->second); + State.erase(*UI); + } + } + } + } + if (Emit && !MI.isBundledWithPred()) { + // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or + // just ignore them? + LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu); + } + } + + if (Type != OTHER) { + // TODO: Scan implicit defs too? + for (const auto &Op : MI.defs()) { + unsigned Latency = SchedModel.computeOperandLatency( + &MI, MI.getOperandNo(&Op), nullptr, 0); + for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) + State[*UI] = DelayInfo(Type, Latency); + } + } + + // Advance by the number of cycles it takes to issue this instruction. + // TODO: Use a more advanced model that accounts for instructions that + // take multiple cycles to issue on a particular pipeline. + unsigned Cycles = SIInstrInfo::getNumWaitStates(MI); + // TODO: In wave64 mode, double the number of cycles for VALU and VMEM + // instructions on the assumption that they will usually have to be issued + // twice? 
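Further down, runOnMachineFunction seeds a worklist with every block, re-runs runOnMachineBasicBlock in analysis mode until no block's incoming DelayState changes, and only then makes the single emitting pass. A generic sketch of that fixed-point pattern on a toy CFG, assuming (as DelayInfo's max/min merge guarantees) a monotone merge over a finite domain:

#include <set>
#include <vector>

// Stand-in for a block plus its DelayState. merge() must be monotone over
// a finite domain for the iteration to terminate; DelayInfo's max/min
// merge has that property.
struct Block {
  std::vector<int> Preds, Succs;
  unsigned State = 0;
};

// Toy transfer function: fold the block id into the state.
static unsigned transfer(int Id, unsigned In) { return In | (1u << Id); }

static void fixpoint(std::vector<Block> &CFG) {
  std::set<int> Work;
  for (int I = 0; I != (int)CFG.size(); ++I)
    Work.insert(I);
  while (!Work.empty()) {
    int B = *Work.begin();
    Work.erase(Work.begin());
    unsigned In = 0;
    for (int P : CFG[B].Preds)
      In |= CFG[P].State;        // DelayState::merge analogue
    unsigned Out = transfer(B, In);
    if (Out != CFG[B].State) {   // only revisit successors on a change
      CFG[B].State = Out;
      Work.insert(CFG[B].Succs.begin(), CFG[B].Succs.end());
    }
  }
}

int main() {
  // Diamond: 0 -> 1, 0 -> 2, 1 -> 3, 2 -> 3.
  std::vector<Block> CFG(4);
  CFG[0].Succs = {1, 2};
  CFG[1].Preds = {0}; CFG[1].Succs = {3};
  CFG[2].Preds = {0}; CFG[2].Succs = {3};
  CFG[3].Preds = {1, 2};
  fixpoint(CFG);
  // Block 3 converges to the union of both paths' states.
  return CFG[3].State == 0b1111 ? 0 : 1;
}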
+ State.advance(Type, Cycles); + + LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI);); + } + + if (Emit) { + assert(State == BlockState[&MBB] && + "Basic block state should not have changed on final pass!"); + } else if (State != BlockState[&MBB]) { + BlockState[&MBB] = std::move(State); + Changed = true; + } + return Changed; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(MF.getFunction())) + return false; + + LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() + << "\n"); + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (!ST.hasDelayAlu()) + return false; + + SII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + + SchedModel.init(&ST); + + // Calculate the delay state for each basic block, iterating until we reach + // a fixed point. + SetVector<MachineBasicBlock *> WorkList; + for (auto &MBB : reverse(MF)) + WorkList.insert(&MBB); + while (!WorkList.empty()) { + auto &MBB = *WorkList.pop_back_val(); + bool Changed = runOnMachineBasicBlock(MBB, false); + if (Changed) + WorkList.insert(MBB.succ_begin(), MBB.succ_end()); + } + + LLVM_DEBUG(dbgs() << "Final pass over all BBs\n"); + + // Make one last pass over all basic blocks to emit s_delay_alu + // instructions. + bool Changed = false; + for (auto &MBB : MF) + Changed |= runOnMachineBasicBlock(MBB, true); + return Changed; + } +}; + +} // namespace + +char AMDGPUInsertDelayAlu::ID = 0; + +char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID; + +INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU", + false, false) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 4f1d700bcd84..695093322a01 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -110,33 +110,42 @@ static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) { llvm_unreachable("Should never be called!"); } -/// Applies Function(II.Args, II.ArgTys) and replaces the intrinsic call with -/// the modified arguments. +/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with +/// modified arguments (based on OldIntr) and replaces InstToReplace with +/// this newly created intrinsic call. 
static Optional<Instruction *> modifyIntrinsicCall( - IntrinsicInst &II, unsigned NewIntr, InstCombiner &IC, + IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr, + InstCombiner &IC, std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)> Func) { SmallVector<Type *, 4> ArgTys; - if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys)) + if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys)) return None; - SmallVector<Value *, 8> Args(II.args()); + SmallVector<Value *, 8> Args(OldIntr.args()); // Modify arguments and types Func(Args, ArgTys); - Function *I = Intrinsic::getDeclaration(II.getModule(), NewIntr, ArgTys); + Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys); CallInst *NewCall = IC.Builder.CreateCall(I, Args); - NewCall->takeName(&II); - NewCall->copyMetadata(II); + NewCall->takeName(&OldIntr); + NewCall->copyMetadata(OldIntr); if (isa<FPMathOperator>(NewCall)) - NewCall->copyFastMathFlags(&II); + NewCall->copyFastMathFlags(&OldIntr); // Erase and replace uses - if (!II.getType()->isVoidTy()) - IC.replaceInstUsesWith(II, NewCall); - return IC.eraseInstFromFunction(II); + if (!InstToReplace.getType()->isVoidTy()) + IC.replaceInstUsesWith(InstToReplace, NewCall); + + bool RemoveOldIntr = &OldIntr != &InstToReplace; + + auto RetValue = IC.eraseInstFromFunction(InstToReplace); + if (RemoveOldIntr) + IC.eraseInstFromFunction(OldIntr); + + return RetValue; } static Optional<Instruction *> @@ -153,7 +162,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ, ImageDimIntr->Dim); return modifyIntrinsicCall( - II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { Args.erase(Args.begin() + ImageDimIntr->LodIndex); }); } @@ -170,7 +179,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP, ImageDimIntr->Dim); return modifyIntrinsicCall( - II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { Args.erase(Args.begin() + ImageDimIntr->MipIndex); }); } @@ -187,7 +196,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias, ImageDimIntr->Dim); return modifyIntrinsicCall( - II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { Args.erase(Args.begin() + ImageDimIntr->BiasIndex); ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg); }); @@ -205,13 +214,41 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, AMDGPU::getImageDimIntrinsicByBaseOpcode( OffsetMappingInfo->NoOffset, ImageDimIntr->Dim); return modifyIntrinsicCall( - II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { Args.erase(Args.begin() + ImageDimIntr->OffsetIndex); }); } } } + // Try to use D16 + if (ST->hasD16Images()) { + + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); + + if (BaseOpcode->HasD16) { + + // If the only use of image intrinsic is a fptrunc (with conversion to + // half) then both fptrunc and image intrinsic will be replaced with image + // intrinsic with D16 flag. 
+ if (II.hasOneUse()) { + Instruction *User = II.user_back(); + + if (User->getOpcode() == Instruction::FPTrunc && + User->getType()->getScalarType()->isHalfTy()) { + + return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC, + [&](auto &Args, auto &ArgTys) { + // Change return type of image intrinsic. + // Set it to return type of fptrunc. + ArgTys[0] = User->getType(); + }); + } + } + } + } + // Try to use A16 or G16 if (!ST->hasA16() && !ST->hasG16()) return None; @@ -263,7 +300,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, : Type::getInt16Ty(II.getContext()); return modifyIntrinsicCall( - II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) { + II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) { ArgTys[ImageDimIntr->GradientTyArg] = CoordType; if (!OnlyDerivatives) { ArgTys[ImageDimIntr->CoordTyArg] = CoordType; @@ -584,6 +621,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceInstUsesWith(II, RightShift); } case Intrinsic::amdgcn_exp: + case Intrinsic::amdgcn_exp_row: case Intrinsic::amdgcn_exp_compr: { ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1)); unsigned EnBits = En->getZExtValue(); @@ -882,6 +920,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType())); } + case Intrinsic::amdgcn_permlane64: + // A constant value is trivially uniform. + if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) { + return IC.replaceInstUsesWith(II, C); + } + break; case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: { // A constant value is trivially uniform. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 391dc8428539..23b8fcf75f16 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -355,11 +355,7 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone, def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] ->; - -def AMDGPUret_gfx_flag : SDNode<"AMDGPUISD::RET_GFX_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, +def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index b7d0f0580cda..3f242fdb6d8e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -80,8 +81,11 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg, RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); if (RC) { const LLT Ty = MRI.getType(Reg); - return RC->hasSuperClassEq(TRI.getBoolRC()) && - Ty.isValid() && Ty.getSizeInBits() == 1; + if (!Ty.isValid() || Ty.getSizeInBits() != 1) + return false; + // G_TRUNC s1 result is never vcc. 
+ return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC && + RC->hasSuperClassEq(TRI.getBoolRC()); } const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); @@ -91,7 +95,7 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg, bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const { MI.setDesc(TII.get(NewOpc)); - MI.RemoveOperand(1); // Remove intrinsic ID. + MI.removeOperand(1); // Remove intrinsic ID. MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); MachineOperand &Dst = MI.getOperand(0); @@ -216,7 +220,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { } const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); - DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI); + DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB); if (!DefRC) { LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); return false; @@ -454,6 +458,24 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( return true; } +bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( + MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; + + unsigned Opc; + if (Subtarget->getGeneration() == AMDGPUSubtarget::GFX11) + Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64 + : AMDGPU::V_MAD_I64_I32_gfx11_e64; + else + Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64; + I.setDesc(TII.get(Opc)); + I.addOperand(*MF, MachineOperand::CreateImm(0)); + I.addImplicitDefUseOperands(*MF); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + // TODO: We should probably legalize these to only using 32-bit results. 
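The selection just added maps G_AMDGPU_MAD_U64_U32 / G_AMDGPU_MAD_I64_I32 to v_mad_u64_u32 / v_mad_i64_i32 (GFX11-suffixed encodings on GFX11). As a reading aid, a scalar model of the arithmetic those mnemonics denote, a 32 x 32 -> 64 multiply plus a 64-bit addend; the instruction's carry-out operand is deliberately not modeled here:

#include <cassert>
#include <cstdint>

// d = a * b + c with a full 32x32->64 multiply, unsigned and signed.
static uint64_t mad_u64_u32(uint32_t a, uint32_t b, uint64_t c) {
  return uint64_t(a) * uint64_t(b) + c;
}

static int64_t mad_i64_i32(int32_t a, int32_t b, int64_t c) {
  return int64_t(a) * int64_t(b) + c;
}

int main() {
  assert(mad_u64_u32(0xffffffffu, 2u, 1u) == 0x1ffffffffull);
  assert(mad_i64_i32(-3, 4, 10) == -2);
  return 0;
}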
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); @@ -481,7 +503,7 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); const TargetRegisterClass *SrcRC = - TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); + TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank); if (!SrcRC) return false; unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32, @@ -514,7 +536,7 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); const unsigned DstSize = DstTy.getSizeInBits(); const TargetRegisterClass *DstRC = - TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); + TRI.getRegClassForSizeOnBank(DstSize, *DstBank); if (!DstRC) return false; @@ -556,7 +578,7 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); const TargetRegisterClass *SrcRC = - TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); + TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank); if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) return false; @@ -630,7 +652,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI); if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) { MI.setDesc(TII.get(AMDGPU::COPY)); - MI.RemoveOperand(2); + MI.removeOperand(2); return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) && RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI); } @@ -643,6 +665,8 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( // // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16) // => (S_PACK_HH_B32_B16 $src0, $src1) + // (build_vector_trunc (lshr_oneuse SReg_32:$src0, 16), $src1) + // => (S_PACK_HL_B32_B16 $src0, $src1) // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16)) // => (S_PACK_LH_B32_B16 $src0, $src1) // (build_vector_trunc $src0, $src1) @@ -662,14 +686,20 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( } else if (Shift1) { Opc = AMDGPU::S_PACK_LH_B32_B16; MI.getOperand(2).setReg(ShiftSrc1); - } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) { - // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 - auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) - .addReg(ShiftSrc0) - .addImm(16); + } else if (Shift0) { + if (ConstSrc1 && ConstSrc1->Value == 0) { + // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 + auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) + .addReg(ShiftSrc0) + .addImm(16); - MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + MI.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + } + if (STI.hasSPackHL()) { + Opc = AMDGPU::S_PACK_HL_B32_B16; + MI.getOperand(1).setReg(ShiftSrc0); + } } MI.setDesc(TII.get(Opc)); @@ -722,16 +752,16 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); const TargetRegisterClass *DstRC = - TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); + TRI.getRegClassForSizeOnBank(DstSize, *DstBank); if (!DstRC) return false; const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, 
*MRI, TRI); const TargetRegisterClass *Src0RC = - TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); + TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank); const TargetRegisterClass *Src1RC = - TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); + TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank); // Deal with weird cases where the class only partially supports the subreg // index. @@ -970,6 +1000,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { return selectGroupStaticSize(I); case Intrinsic::returnaddress: return selectReturnAddress(I); + case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: + case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: + case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: + case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: + return selectSMFMACIntrin(I); default: return selectImpl(I, *CoverageInfo); } @@ -1142,7 +1179,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { Optional<ValueAndVReg> Arg = getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI); - if (Arg.hasValue()) { + if (Arg) { const int64_t Value = Arg.getValue().Value.getSExtValue(); if (Value == 0) { unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; @@ -1164,8 +1201,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const { Register DstReg = I.getOperand(0).getReg(); const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); - const TargetRegisterClass *DstRC = - TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI); + const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank); if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) return false; @@ -1300,12 +1336,14 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF); unsigned Offset0 = OrderedCountIndex << 2; - unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | - (Instruction << 4); + unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4); if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) Offset1 |= (CountDw - 1) << 6; + if (STI.getGeneration() < AMDGPUSubtarget::GFX11) + Offset1 |= ShaderType << 2; + unsigned Offset = Offset0 | (Offset1 << 8); Register M0Val = MI.getOperand(2).getReg(); @@ -1424,23 +1462,7 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, if (HasVSrc) { Register VSrc = MI.getOperand(1).getReg(); - - if (STI.needsAlignedVGPRs()) { - // Add implicit aligned super-reg to force alignment on the data operand. 
- Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); - Register NewVR = - MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass); - BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR) - .addReg(VSrc, 0, MI.getOperand(1).getSubReg()) - .addImm(AMDGPU::sub0) - .addReg(Undef) - .addImm(AMDGPU::sub1); - MIB.addReg(NewVR, 0, AMDGPU::sub0); - MIB.addReg(NewVR, RegState::Implicit); - } else { - MIB.addReg(VSrc); - } + MIB.addReg(VSrc); if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) return false; @@ -1449,6 +1471,8 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, MIB.addImm(ImmOffset) .cloneMemRefs(MI); + TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0); + MI.eraseFromParent(); return true; } @@ -1523,6 +1547,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); + const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; @@ -1627,7 +1652,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( } // The legalizer preprocessed the intrinsic arguments. If we aren't using - // NSA, these should have beeen packed into a single value in the first + // NSA, these should have been packed into a single value in the first // address register const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs; if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) { @@ -1639,13 +1664,29 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( ++NumVDataDwords; int Opcode = -1; - if (IsGFX10Plus) { + if (IsGFX11Plus) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, + UseNSA ? AMDGPU::MIMGEncGfx11NSA + : AMDGPU::MIMGEncGfx11Default, + NumVDataDwords, NumVAddrDwords); + } else if (IsGFX10Plus) { Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, UseNSA ? AMDGPU::MIMGEncGfx10NSA : AMDGPU::MIMGEncGfx10Default, NumVDataDwords, NumVAddrDwords); } else { - if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (Subtarget->hasGFX90AInsts()) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, + NumVDataDwords, NumVAddrDwords); + if (Opcode == -1) { + LLVM_DEBUG( + dbgs() + << "requested image instruction is not supported on this GPU\n"); + return false; + } + } + if (Opcode == -1 && + STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, NumVDataDwords, NumVAddrDwords); if (Opcode == -1) @@ -1703,7 +1744,13 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( if (IsGFX10Plus) MIB.addImm(IsA16 ? -1 : 0); - MIB.addImm(TFE); // tfe + if (!Subtarget->hasGFX90AInsts()) { + MIB.addImm(TFE); // tfe + } else if (TFE) { + LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n"); + return false; + } + MIB.addImm(LWE); // lwe if (!IsGFX10Plus) MIB.addImm(DimInfo->DA ? 
-1 : 0); @@ -1743,7 +1790,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( } MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr); + return true; } bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( @@ -1770,10 +1819,22 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( return selectSBarrier(I); case Intrinsic::amdgcn_global_atomic_fadd: return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3)); - default: { - return selectImpl(I, *CoverageInfo); - } + case Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_struct_buffer_load_lds: + return selectBufferLoadLds(I); + case Intrinsic::amdgcn_global_load_lds: + return selectGlobalLoadLds(I); + case Intrinsic::amdgcn_exp_compr: + if (!STI.hasCompressedExport()) { + Function &F = I.getMF()->getFunction(); + DiagnosticInfoUnsupported NoFpRet( + F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error); + F.getContext().diagnose(NoFpRet); + return false; + } + break; } + return selectImpl(I, *CoverageInfo); } bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { @@ -1872,10 +1933,10 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { unsigned DstSize = DstTy.getSizeInBits(); unsigned SrcSize = SrcTy.getSizeInBits(); - const TargetRegisterClass *SrcRC - = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); - const TargetRegisterClass *DstRC - = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); + const TargetRegisterClass *SrcRC = + TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB); + const TargetRegisterClass *DstRC = + TRI.getRegClassForSizeOnBank(DstSize, *DstRB); if (!SrcRC || !DstRC) return false; @@ -2014,10 +2075,10 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { return selectCOPY(I); const TargetRegisterClass *SrcRC = - TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI); + TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank); const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); const TargetRegisterClass *DstRC = - TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); + TRI.getRegClassForSizeOnBank(DstSize, *DstBank); Register UndefReg = MRI->createVirtualRegister(SrcRC); BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); @@ -2384,65 +2445,6 @@ bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW( return selectImpl(I, *CoverageInfo); } -// TODO: No rtn optimization. -bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG( - MachineInstr &MI) const { - Register PtrReg = MI.getOperand(1).getReg(); - const LLT PtrTy = MRI->getType(PtrReg); - if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS || - STI.useFlatForGlobal()) - return selectImpl(MI, *CoverageInfo); - - Register DstReg = MI.getOperand(0).getReg(); - const LLT Ty = MRI->getType(DstReg); - const bool Is64 = Ty.getSizeInBits() == 64; - const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - Register TmpReg = MRI->createVirtualRegister( - Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); - - const DebugLoc &DL = MI.getDebugLoc(); - MachineBasicBlock *BB = MI.getParent(); - - Register VAddr, RSrcReg, SOffset; - int64_t Offset = 0; - - unsigned Opcode; - if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) { - Opcode = Is64 ? 
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN : - AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN; - } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr, - RSrcReg, SOffset, Offset)) { - Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN : - AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN; - } else - return selectImpl(MI, *CoverageInfo); - - auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg) - .addReg(MI.getOperand(2).getReg()); - - if (VAddr) - MIB.addReg(VAddr); - - MIB.addReg(RSrcReg); - if (SOffset) - MIB.addReg(SOffset); - else - MIB.addImm(0); - - MIB.addImm(Offset); - MIB.addImm(AMDGPU::CPol::GLC); - MIB.cloneMemRefs(MI); - - BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg) - .addReg(TmpReg, RegState::Kill, SubReg); - - MI.eraseFromParent(); - - MRI->setRegClass( - DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); -} - static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) { if (Reg.isPhysical()) return false; @@ -2551,7 +2553,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { // Try to avoid emitting a bit operation when we only need to touch half of // the 64-bit pointer. - APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); + APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zext(64); const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); @@ -2571,12 +2573,10 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { const TargetRegisterClass &RegRC = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; - const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, - *MRI); - const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, - *MRI); + const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB); + const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB); const TargetRegisterClass *MaskRC = - TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI); + TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB); if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || @@ -2689,10 +2689,10 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( if (IdxRB->getID() != AMDGPU::SGPRRegBankID) return false; - const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB, - *MRI); - const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, - *MRI); + const TargetRegisterClass *SrcRC = + TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB); + const TargetRegisterClass *DstRC = + TRI.getRegClassForTypeOnBank(DstTy, *DstRB); if (!SrcRC || !DstRC) return false; if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || @@ -2771,10 +2771,10 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( if (IdxRB->getID() != AMDGPU::SGPRRegBankID) return false; - const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, - *MRI); - const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, - *MRI); + const TargetRegisterClass *VecRC = + TRI.getRegClassForTypeOnBank(VecTy, *VecRB); + const TargetRegisterClass *ValRC = + TRI.getRegClassForTypeOnBank(ValTy, *ValRB); if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || @@ -2867,7 +2867,6 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( return false; assert(ShufMask.size() == 2); 
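The v2i16 shuffle cases below each reduce to fixed bit arithmetic on the 32-bit register that holds the vector, which is why plain AND/LSHR plus V_LSHL_OR_B32 sequences can stand in for the SDWA mov on subtargets without SDWA. A scalar model of the three non-trivial masks (element 0 is the low half; helper names are mine):

#include <cassert>
#include <cstdint>

// Mask <0,0>: broadcast the low half. Matches the V_AND_B32 0xFFFF +
// V_LSHL_OR_B32 sequence used when SDWA is unavailable.
static uint32_t broadcastLo(uint32_t V) {
  uint32_t T = V & 0xffffu;
  return (T << 16) | T;
}

// Mask <1,1>: broadcast the high half (V_LSHRREV_B32 16 + V_LSHL_OR_B32).
static uint32_t broadcastHi(uint32_t V) {
  uint32_t T = V >> 16;
  return (T << 16) | T;
}

// Mask <1,0>: swap the halves, i.e. rotate by 16; scalar code can now use
// S_PACK_HL_B32_B16 with the same register for both sources.
static uint32_t swapHalves(uint32_t V) {
  return (V >> 16) | (V << 16);
}

int main() {
  assert(broadcastLo(0xAAAA5555u) == 0x55555555u);
  assert(broadcastHi(0xAAAA5555u) == 0xAAAAAAAAu);
  assert(swapHalves(0xAAAA5555u) == 0x5555AAAAu);
  return 0;
}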
- assert(STI.hasSDWA() && "no target has VOP3P but not SDWA"); MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -2924,17 +2923,28 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( } } else if (Mask[0] == 0 && Mask[1] == 0) { if (IsVALU) { - // Write low half of the register into the high half. - MachineInstr *MovSDWA = - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) - .addImm(0) // $src0_modifiers - .addReg(SrcVec) // $src0 - .addImm(0) // $clamp - .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel - .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused - .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel - .addReg(SrcVec, RegState::Implicit); - MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); + if (STI.hasSDWA()) { + // Write low half of the register into the high half. + MachineInstr *MovSDWA = + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) + .addImm(0) // $src0_modifiers + .addReg(SrcVec) // $src0 + .addImm(0) // $clamp + .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel + .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused + .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel + .addReg(SrcVec, RegState::Implicit); + MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); + } else { + Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg) + .addImm(0xFFFF) + .addReg(SrcVec); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), DstReg) + .addReg(TmpReg) + .addImm(16) + .addReg(TmpReg); + } } else { BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) .addReg(SrcVec) @@ -2942,17 +2952,28 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( } } else if (Mask[0] == 1 && Mask[1] == 1) { if (IsVALU) { - // Write high half of the register into the low half. - MachineInstr *MovSDWA = - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) - .addImm(0) // $src0_modifiers - .addReg(SrcVec) // $src0 - .addImm(0) // $clamp - .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel - .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused - .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel - .addReg(SrcVec, RegState::Implicit); - MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); + if (STI.hasSDWA()) { + // Write high half of the register into the low half. 
+ MachineInstr *MovSDWA = + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) + .addImm(0) // $src0_modifiers + .addReg(SrcVec) // $src0 + .addImm(0) // $clamp + .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel + .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused + .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel + .addReg(SrcVec, RegState::Implicit); + MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); + } else { + Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) + .addImm(16) + .addReg(SrcVec); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), DstReg) + .addReg(TmpReg) + .addImm(16) + .addReg(TmpReg); + } } else { BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg) .addReg(SrcVec) @@ -2965,13 +2986,19 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( .addReg(SrcVec) .addImm(16); } else { - Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) - .addReg(SrcVec) - .addImm(16); - BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) - .addReg(TmpReg) - .addReg(SrcVec); + if (STI.hasSPackHL()) { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HL_B32_B16), DstReg) + .addReg(SrcVec) + .addReg(SrcVec); + } else { + Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) + .addReg(SrcVec) + .addImm(16); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) + .addReg(TmpReg) + .addReg(SrcVec); + } } } else llvm_unreachable("all shuffle masks should be handled"); @@ -2982,13 +3009,15 @@ bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( MachineInstr &MI) const { - if (STI.hasGFX90AInsts()) + const Register DefReg = MI.getOperand(0).getReg(); + LLT DefTy = MRI->getType(DefReg); + if (AMDGPU::hasAtomicFaddRtnForTy(STI, DefTy)) return selectImpl(MI, *CoverageInfo); MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { + if (!MRI->use_nodbg_empty(DefReg)) { Function &F = MBB->getParent()->getFunction(); DiagnosticInfoUnsupported NoFpRet(F, "return versions of fp atomics not supported", @@ -3105,9 +3134,236 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFadd( return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } +bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { + unsigned Opc; + unsigned Size = MI.getOperand(3).getImm(); + + // The struct intrinsic variants add one additional operand over raw. + const bool HasVIndex = MI.getNumOperands() == 9; + Register VIndex; + int OpOffset = 0; + if (HasVIndex) { + VIndex = MI.getOperand(4).getReg(); + OpOffset = 1; + } + + Register VOffset = MI.getOperand(4 + OpOffset).getReg(); + Optional<ValueAndVReg> MaybeVOffset = + getIConstantVRegValWithLookThrough(VOffset, *MRI); + const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue(); + + switch (Size) { + default: + return false; + case 1: + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET; + break; + case 2: + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN + : HasVOffset ? 
AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET; + break; + case 4: + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; + break; + } + + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) + .add(MI.getOperand(2)); + + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)); + + if (HasVIndex && HasVOffset) { + Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class()); + BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) + .addReg(VIndex) + .addImm(AMDGPU::sub0) + .addReg(VOffset) + .addImm(AMDGPU::sub1); + + MIB.addReg(IdxReg); + } else if (HasVIndex) { + MIB.addReg(VIndex); + } else if (HasVOffset) { + MIB.addReg(VOffset); + } + + MIB.add(MI.getOperand(1)); // rsrc + MIB.add(MI.getOperand(5 + OpOffset)); // soffset + MIB.add(MI.getOperand(6 + OpOffset)); // imm offset + unsigned Aux = MI.getOperand(7 + OpOffset).getImm(); + MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol + MIB.addImm((Aux >> 3) & 1); // swz + + MachineMemOperand *LoadMMO = *MI.memoperands_begin(); + MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); + LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm(); + MachinePointerInfo StorePtrI = LoadPtrI; + StorePtrI.V = nullptr; + StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; + + auto F = LoadMMO->getFlags() & + ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); + LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, + Size, LoadMMO->getBaseAlign()); + + MachineMemOperand *StoreMMO = + MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, + sizeof(int32_t), LoadMMO->getBaseAlign()); + + MIB.setMemRefs({LoadMMO, StoreMMO}); + + MI.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); +} + +/// Match a zero extend from a 32-bit value to 64-bits. +static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { + Register ZExtSrc; + if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc)))) + return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register(); + + // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) + const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); + if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) + return false; + + if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { + return Def->getOperand(1).getReg(); + } + + return Register(); +} + +bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ + unsigned Opc; + unsigned Size = MI.getOperand(3).getImm(); + + switch (Size) { + default: + return false; + case 1: + Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE; + break; + case 2: + Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT; + break; + case 4: + Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; + break; + } + + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) + .add(MI.getOperand(2)); + + Register Addr = MI.getOperand(1).getReg(); + Register VOffset; + // Try to split SAddr and VOffset. Global and LDS pointers share the same + // immediate offset, so we cannot use a regular SelectGlobalSAddr(). 
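// Illustrative MIR sketch of the split performed below (register names
// hypothetical), assuming a 32-bit offset zero-extended onto a uniform base:
//   %voff:_(s32) = ...                          ; VGPR offset
//   %zext:_(s64) = G_ZEXT %voff                 ; or G_MERGE_VALUES %voff, 0
//   %addr:_(p1)  = G_PTR_ADD %sbase, %zext
// matchZeroExtendFromS32() recovers %voff from either form, letting the code
// below select the saddr variant with %sbase as the scalar base.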
+ if (!isSGPR(Addr)) { + auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); + if (isSGPR(AddrDef->Reg)) { + Addr = AddrDef->Reg; + } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { + Register SAddr = + getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); + if (SAddr && isSGPR(SAddr)) { + Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); + if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { + Addr = SAddr; + VOffset = Off; + } + } + } + } + + if (isSGPR(Addr)) { + Opc = AMDGPU::getGlobalSaddrOp(Opc); + if (!VOffset) { + VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset) + .addImm(0); + } + } + + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) + .addReg(Addr); + + if (isSGPR(Addr)) + MIB.addReg(VOffset); + + MIB.add(MI.getOperand(4)) // offset + .add(MI.getOperand(5)); // cpol + + MachineMemOperand *LoadMMO = *MI.memoperands_begin(); + MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); + LoadPtrI.Offset = MI.getOperand(4).getImm(); + MachinePointerInfo StorePtrI = LoadPtrI; + LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; + StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; + auto F = LoadMMO->getFlags() & + ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); + LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, + Size, LoadMMO->getBaseAlign()); + MachineMemOperand *StoreMMO = + MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, + sizeof(int32_t), Align(4)); + + MIB.setMemRefs({LoadMMO, StoreMMO}); + + MI.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); +} + bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{ MI.setDesc(TII.get(MI.getOperand(1).getImm())); - MI.RemoveOperand(1); + MI.removeOperand(1); + MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + return true; +} + +bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { + unsigned Opc; + switch (MI.getIntrinsicID()) { + case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: + Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: + Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: + Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64; + break; + case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: + Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64; + break; + case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: + Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64; + break; + case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: + Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64; + break; + default: + llvm_unreachable("unhandled smfmac intrinsic"); + } + + auto VDst_In = MI.getOperand(4); + + MI.setDesc(TII.get(Opc)); + MI.removeOperand(4); // VDst_In + MI.removeOperand(1); // Intrinsic ID + MI.addOperand(VDst_In); // Readd VDst_In to the end MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); return true; } @@ -3166,6 +3422,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_UADDE: case TargetOpcode::G_USUBE: return selectG_UADDO_USUBO_UADDE_USUBE(I); + case AMDGPU::G_AMDGPU_MAD_U64_U32: + case AMDGPU::G_AMDGPU_MAD_I64_I32: + return selectG_AMDGPU_MAD_64_32(I); case TargetOpcode::G_INTTOPTR: case TargetOpcode::G_BITCAST: case TargetOpcode::G_PTRTOINT: @@ -3226,8 +3485,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case AMDGPU::G_AMDGPU_ATOMIC_FMIN: case 
AMDGPU::G_AMDGPU_ATOMIC_FMAX: return selectG_LOAD_STORE_ATOMICRMW(I); - case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: - return selectG_AMDGPU_ATOMIC_CMPXCHG(I); case TargetOpcode::G_SELECT: return selectG_SELECT(I); case TargetOpcode::G_TRUNC: @@ -3286,9 +3543,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { } -std::pair<Register, unsigned> -AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root, - bool AllowAbs) const { +std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl( + MachineOperand &Root, bool AllowAbs, bool OpSel, bool ForceVGPR) const { Register Src = Root.getReg(); Register OrigSrc = Src; unsigned Mods = 0; @@ -3305,7 +3561,10 @@ AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root, Mods |= SISrcMods::ABS; } - if (Mods != 0 && + if (OpSel) + Mods |= SISrcMods::OP_SEL_0; + + if ((Mods != 0 || ForceVGPR) && RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { MachineInstr *UseMI = Root.getParent(); @@ -3407,7 +3666,7 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl( - Register Src, const MachineRegisterInfo &MRI) const { + Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const { unsigned Mods = 0; MachineInstr *MI = MRI.getVRegDef(Src); @@ -3421,6 +3680,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl( } // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. + (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard() // Packed instructions do not have abs modifiers. Mods |= SISrcMods::OP_SEL_1; @@ -3444,6 +3704,50 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { + MachineRegisterInfo &MRI + = Root.getParent()->getParent()->getParent()->getRegInfo(); + + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const { + // Literal i1 value set in intrinsic, represents SrcMods for the next operand. + // Value is in Imm operand as i1 sign extended to int64_t. + // 1(-1) promotes packed values to signed, 0 treats them as unsigned. 
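// For example (illustrative, for sudot4/sudot8-style intrinsics):
//   Root.getImm() == -1  ->  Mods = OP_SEL_1 ^ NEG  (sources treated signed)
//   Root.getImm() ==  0  ->  Mods = OP_SEL_1        (sources treated unsigned)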
+ assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && + "expected i1 value"); + unsigned Mods = SISrcMods::OP_SEL_1; + if (Root.getImm() == -1) + Mods ^= SISrcMods::NEG; + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( + MachineOperand &Root) const { + assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && + "expected i1 value"); + unsigned Mods = SISrcMods::OP_SEL_1; + if (Root.getImm() != 0) + Mods |= SISrcMods::OP_SEL_0; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { Register Src; unsigned Mods; @@ -3467,6 +3771,36 @@ AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root, + /* AllowAbs */ false, + /* OpSel */ false, + /* ForceVGPR */ true); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root, + /* AllowAbs */ false, + /* OpSel */ true, + /* ForceVGPR */ true); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods + }}; +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { SmallVector<GEPInfo, 4> AddrInfo; getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); @@ -3594,24 +3928,6 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const { }}; } -/// Match a zero extend from a 32-bit value to 64-bits. -static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { - Register ZExtSrc; - if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc)))) - return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register(); - - // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) - const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); - if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) - return false; - - if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { - return Def->getOperand(1).getReg(); - } - - return Register(); -} - // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { @@ -3631,9 +3947,6 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { ImmOffset = ConstOffset; } else { auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI); - if (!PtrBaseDef) - return None; - if (isSGPR(PtrBaseDef->Reg)) { if (ConstOffset > 0) { // Offset is too large. @@ -3679,11 +3992,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { } } - auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); - if (!AddrDef) - return None; - // Match the variable offset. + auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { // Look through the SGPR->VGPR copy. 
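// E.g. (illustrative):
//   %sbase:sgpr(p1) = ...
//   %vbase:vgpr(p1) = COPY %sbase
//   %addr:vgpr(p1)  = G_PTR_ADD %vbase, %voff
// getSrcRegIgnoringCopies() returns %sbase here, so the base can still be
// placed in the saddr operand while only %voff occupies a VGPR.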
Register SAddr = @@ -3749,9 +4059,6 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { } auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); - if (!AddrDef) - return None; - if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { int FI = AddrDef->MI->getOperand(1).getIndex(); return {{ @@ -3768,8 +4075,7 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI); - if (LHSDef && RHSDef && - LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX && + if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX && isSGPR(RHSDef->Reg)) { int FI = LHSDef->MI->getOperand(1).getIndex(); MachineInstr &I = *Root.getParent(); @@ -3792,6 +4098,74 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { }}; } +// Check whether the flat scratch SVS swizzle bug affects this access. +bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug( + Register VAddr, Register SAddr, uint64_t ImmOffset) const { + if (!Subtarget->hasFlatScratchSVSSwizzleBug()) + return false; + + // The bug affects the swizzling of SVS accesses if there is any carry out + // from the two low order bits (i.e. from bit 1 into bit 2) when adding + // voffset to (soffset + inst_offset). + auto VKnown = KnownBits->getKnownBits(VAddr); + auto SKnown = KnownBits::computeForAddSub( + true, false, KnownBits->getKnownBits(SAddr), + KnownBits::makeConstant(APInt(32, ImmOffset))); + uint64_t VMax = VKnown.getMaxValue().getZExtValue(); + uint64_t SMax = SKnown.getMaxValue().getZExtValue(); + return (VMax & 3) + (SMax & 3) >= 4; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { + Register Addr = Root.getReg(); + Register PtrBase; + int64_t ConstOffset; + int64_t ImmOffset = 0; + + // Match the immediate offset first, which canonically is moved as low as + // possible. 
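// E.g. (illustrative): %addr = G_PTR_ADD %base, 16 yields PtrBase = %base
// and ConstOffset = 16, and the constant is folded into the immediate field
// when isLegalFLATOffset() accepts it for the private address space.
//
// Worked example for checkFlatScratchSVSSwizzleBug() above: if the known
// bits give (VMax & 3) == 2 and (SMax & 3) == 2, then 2 + 2 >= 4, so the
// add of voffset to (soffset + inst_offset) may carry from bit 1 into
// bit 2 and the SVS form must be rejected.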
+ std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + + if (ConstOffset != 0 && + TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) { + Addr = PtrBase; + ImmOffset = ConstOffset; + } + + auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); + if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) + return None; + + Register RHS = AddrDef->MI->getOperand(2).getReg(); + if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) + return None; + + Register LHS = AddrDef->MI->getOperand(1).getReg(); + auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); + + if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) + return None; + + if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { + int FI = LHSDef->MI->getOperand(1).getIndex(); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr + [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset + }}; + } + + if (!isSGPR(LHS)) + return None; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr + [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); @@ -3856,7 +4230,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { MIB.addReg(Info->getScratchRSrcReg()); }, [=](MachineInstrBuilder &MIB) { // vaddr - if (FI.hasValue()) + if (FI) MIB.addFrameIndex(FI.getValue()); else MIB.addReg(VAddr); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 42095332d11a..22672ba59e76 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -97,6 +97,7 @@ private: bool selectG_AND_OR_XOR(MachineInstr &I) const; bool selectG_ADD_SUB(MachineInstr &I) const; bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const; + bool selectG_AMDGPU_MAD_64_32(MachineInstr &I) const; bool selectG_EXTRACT(MachineInstr &I) const; bool selectG_MERGE_VALUES(MachineInstr &I) const; bool selectG_UNMERGE_VALUES(MachineInstr &I) const; @@ -133,7 +134,6 @@ private: void initM0(MachineInstr &I) const; bool selectG_LOAD_STORE_ATOMICRMW(MachineInstr &I) const; - bool selectG_AMDGPU_ATOMIC_CMPXCHG(MachineInstr &I) const; bool selectG_SELECT(MachineInstr &I) const; bool selectG_BRCOND(MachineInstr &I) const; bool selectG_GLOBAL_VALUE(MachineInstr &I) const; @@ -144,11 +144,15 @@ private: bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const; bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp, MachineOperand &DataOp) const; + bool selectBufferLoadLds(MachineInstr &MI) const; + bool selectGlobalLoadLds(MachineInstr &MI) const; bool selectBVHIntrinsic(MachineInstr &I) const; + bool selectSMFMACIntrin(MachineInstr &I) const; bool selectWaveAddress(MachineInstr &I) const; - std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root, - bool AllowAbs = true) const; + std::pair<Register, unsigned> + selectVOP3ModsImpl(MachineOperand &Root, bool AllowAbs = true, + bool OpSel = false, bool ForceVGPR = false) const; InstructionSelector::ComplexRendererFns selectVCSRC(MachineOperand &Root) const; @@ -173,15 +177,30 @@ private: selectVOP3Mods_nnan(MachineOperand &Root) const; 
std::pair<Register, unsigned> - selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI) const; + selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI, + bool IsDOT = false) const; InstructionSelector::ComplexRendererFns selectVOP3PMods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectVOP3PModsDOT(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectDotIUVOP3PMods(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectVINTERPMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVINTERPModsHi(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns selectSmrdImm(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectSmrdImm32(MachineOperand &Root) const; @@ -203,6 +222,10 @@ private: InstructionSelector::ComplexRendererFns selectScratchSAddr(MachineOperand &Root) const; + bool checkFlatScratchSVSSwizzleBug(Register VAddr, Register SAddr, + uint64_t ImmOffset) const; + InstructionSelector::ComplexRendererFns + selectScratchSVAddr(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectMUBUFScratchOffen(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 7d3dbfd7e851..31012915457b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -40,7 +40,7 @@ class AMDGPUInst <dag outs, dag ins, string asm = "", // instructions to not match without killing the whole decode process. It is // mainly used for ARM, but Tablegen expects this field to exist or it fails // to build the decode table. 
- field bits<64> SoftFail = 0; + field bits<96> SoftFail = 0; let DecoderNamespace = Namespace; @@ -87,6 +87,17 @@ class PredConcat<list<Predicate> lst, Predicate pred> { !listconcat([pred], !filter(item, lst, !ne(item, pred))); } +// Add a Register to the list if it does not already exist +class RegAppend<list<Register> lst, Register reg> { + list<Register> ret = + !listconcat([reg], !filter(item, lst, !ne(item, reg))); +} +// Get the union of two Register lists +class RegListUnion<list<Register> lstA, list<Register> lstB> { + list<Register> ret = + !foldl(lstA, lstB, temp, item, RegAppend<temp, item>.ret); +} + class PredicateControl { Predicate SubtargetPredicate = TruePredicate; Predicate AssemblerPredicate = TruePredicate; @@ -444,34 +455,28 @@ def load_#as : PatFrag<(ops node:$ptr), (unindexedload node:$ptr)> { let IsNonExtLoad = 1; } -def extloadi8_#as : PatFrag<(ops node:$ptr), (extload node:$ptr)> { +def extloadi8_#as : PatFrag<(ops node:$ptr), (extloadi8 node:$ptr)> { let IsLoad = 1; - let MemoryVT = i8; } -def extloadi16_#as : PatFrag<(ops node:$ptr), (extload node:$ptr)> { +def extloadi16_#as : PatFrag<(ops node:$ptr), (extloadi16 node:$ptr)> { let IsLoad = 1; - let MemoryVT = i16; } -def sextloadi8_#as : PatFrag<(ops node:$ptr), (sextload node:$ptr)> { +def sextloadi8_#as : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr)> { let IsLoad = 1; - let MemoryVT = i8; } -def sextloadi16_#as : PatFrag<(ops node:$ptr), (sextload node:$ptr)> { +def sextloadi16_#as : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr)> { let IsLoad = 1; - let MemoryVT = i16; } -def zextloadi8_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> { +def zextloadi8_#as : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr)> { let IsLoad = 1; - let MemoryVT = i8; } -def zextloadi16_#as : PatFrag<(ops node:$ptr), (zextload node:$ptr)> { +def zextloadi16_#as : PatFrag<(ops node:$ptr), (zextloadi16 node:$ptr)> { let IsLoad = 1; - let MemoryVT = i16; } def atomic_load_8_#as : PatFrag<(ops node:$ptr), (atomic_load_8 node:$ptr)> { @@ -498,17 +503,15 @@ def atomic_load_64_#as : PatFrag<(ops node:$ptr), (atomic_load_64 node:$ptr)> { foreach as = [ "global", "flat", "local", "private", "region" ] in { -let AddressSpaces = !cast<AddressSpaceList>("StoreAddress_"#as).AddrSpaces in { +let IsStore = 1, AddressSpaces = !cast<AddressSpaceList>("StoreAddress_"#as).AddrSpaces in { def store_#as : PatFrag<(ops node:$val, node:$ptr), (unindexedstore node:$val, node:$ptr)> { - let IsStore = 1; let IsTruncStore = 0; } // truncstore fragments. def truncstore_#as : PatFrag<(ops node:$val, node:$ptr), (unindexedstore node:$val, node:$ptr)> { - let IsStore = 1; let IsTruncStore = 1; } @@ -517,90 +520,133 @@ def truncstore_#as : PatFrag<(ops node:$val, node:$ptr), // unnecessary check that the memory size is less than the value type // in the generated matcher table. def truncstorei8_#as : PatFrag<(ops node:$val, node:$ptr), - (truncstore node:$val, node:$ptr)> { - let IsStore = 1; - let MemoryVT = i8; -} - + (truncstorei8 node:$val, node:$ptr)>; def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr), - (truncstore node:$val, node:$ptr)> { - let IsStore = 1; - let MemoryVT = i16; -} + (truncstorei16 node:$val, node:$ptr)>; def store_hi16_#as : StoreHi16 <truncstorei16, i16>; def truncstorei8_hi16_#as : StoreHi16<truncstorei8, i8>; def truncstorei16_hi16_#as : StoreHi16<truncstorei16, i16>; -defm atomic_store_#as : binary_atomic_op<atomic_store>; +} // End let IsStore = 1, AddressSpaces = ...
-} // End let AddressSpaces +let IsAtomic = 1, AddressSpaces = !cast<AddressSpaceList>("StoreAddress_"#as).AddrSpaces in { +def atomic_store_8_#as : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_8 node:$ptr, node:$val)>; +def atomic_store_16_#as : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_16 node:$ptr, node:$val)>; +def atomic_store_32_#as : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_32 node:$ptr, node:$val)>; +def atomic_store_64_#as : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_64 node:$ptr, node:$val)>; +} } // End foreach as +// TODO: Add GISelPredicateCode for the ret and noret PatFrags once +// GlobalISelEmitter allows pattern matches where src and dst def count +// mismatch. + +multiclass ret_noret_op { + let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }], + GISelPredicateCode = [{ return true; }] in { + def "_ret" : PatFrag<(ops node:$ptr, node:$data), + (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>; + } + + let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }], + GISelPredicateCode = [{ return false; }] in { + def "_noret" : PatFrag<(ops node:$ptr, node:$data), + (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>; + } +} + +defm int_amdgcn_flat_atomic_fadd : ret_noret_op; +defm int_amdgcn_flat_atomic_fadd_v2bf16 : ret_noret_op; +defm int_amdgcn_flat_atomic_fmin : ret_noret_op; +defm int_amdgcn_flat_atomic_fmax : ret_noret_op; +defm int_amdgcn_global_atomic_fadd : ret_noret_op; +defm int_amdgcn_global_atomic_fadd_v2bf16 : ret_noret_op; +defm int_amdgcn_global_atomic_fmin : ret_noret_op; +defm int_amdgcn_global_atomic_fmax : ret_noret_op; +defm int_amdgcn_ds_fadd_v2bf16 : ret_noret_op; multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { + let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }], + GISelPredicateCode = [{ return false; }] in { + defm "_noret" : binary_atomic_op<atomic_op, IsInt>; + } + + let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }], + GISelPredicateCode = [{ return true; }] in { + defm "_ret" : binary_atomic_op<atomic_op, IsInt>; + } +} + +multiclass ret_noret_ternary_atomic_op<SDNode atomic_op> { + let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }], + GISelPredicateCode = [{ return false; }] in { + defm "_noret" : ternary_atomic_op<atomic_op>; + } + + let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }], + GISelPredicateCode = [{ return true; }] in { + defm "_ret" : ternary_atomic_op<atomic_op>; + } +} + +multiclass binary_atomic_op_all_as<SDNode atomic_op, bit IsInt = 1> { foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in { defm "_"#as : binary_atomic_op<atomic_op, IsInt>; - - let PredicateCode = [{return (SDValue(N, 0).use_empty());}] in { - defm "_"#as#"_noret" : binary_atomic_op<atomic_op, IsInt>; - } - - let PredicateCode = [{return !(SDValue(N, 0).use_empty());}] in { - defm "_"#as#"_ret" : binary_atomic_op<atomic_op, IsInt>; - } + defm "_"#as : ret_noret_binary_atomic_op<atomic_op, IsInt>; } } } -defm atomic_swap : ret_noret_binary_atomic_op<atomic_swap>; -defm atomic_load_add : ret_noret_binary_atomic_op<atomic_load_add>; -defm atomic_load_and : ret_noret_binary_atomic_op<atomic_load_and>; -defm atomic_load_max : ret_noret_binary_atomic_op<atomic_load_max>; -defm atomic_load_min : ret_noret_binary_atomic_op<atomic_load_min>; -defm atomic_load_or : ret_noret_binary_atomic_op<atomic_load_or>; -defm atomic_load_sub : 
ret_noret_binary_atomic_op<atomic_load_sub>; -defm atomic_load_umax : ret_noret_binary_atomic_op<atomic_load_umax>; -defm atomic_load_umin : ret_noret_binary_atomic_op<atomic_load_umin>; -defm atomic_load_xor : ret_noret_binary_atomic_op<atomic_load_xor>; -defm atomic_load_fadd : ret_noret_binary_atomic_op<atomic_load_fadd, 0>; +defm atomic_swap : binary_atomic_op_all_as<atomic_swap>; +defm atomic_load_add : binary_atomic_op_all_as<atomic_load_add>; +defm atomic_load_and : binary_atomic_op_all_as<atomic_load_and>; +defm atomic_load_max : binary_atomic_op_all_as<atomic_load_max>; +defm atomic_load_min : binary_atomic_op_all_as<atomic_load_min>; +defm atomic_load_or : binary_atomic_op_all_as<atomic_load_or>; +defm atomic_load_sub : binary_atomic_op_all_as<atomic_load_sub>; +defm atomic_load_umax : binary_atomic_op_all_as<atomic_load_umax>; +defm atomic_load_umin : binary_atomic_op_all_as<atomic_load_umin>; +defm atomic_load_xor : binary_atomic_op_all_as<atomic_load_xor>; +defm atomic_load_fadd : binary_atomic_op_all_as<atomic_load_fadd, 0>; let MemoryVT = v2f16 in -defm atomic_load_fadd_v2f16 : ret_noret_binary_atomic_op<atomic_load_fadd, 0>; -defm AMDGPUatomic_cmp_swap : ret_noret_binary_atomic_op<AMDGPUatomic_cmp_swap>; +defm atomic_load_fadd_v2f16 : binary_atomic_op_all_as<atomic_load_fadd, 0>; +defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>; def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>, - Aligned<8> { + Aligned<8> { let IsLoad = 1; - let IsNonExtLoad = 1; } def load_align16_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>, Aligned<16> { let IsLoad = 1; - let IsNonExtLoad = 1; } def store_align8_local: PatFrag<(ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)>, Aligned<8> { let IsStore = 1; - let IsTruncStore = 0; } def store_align16_local: PatFrag<(ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)>, Aligned<16> { let IsStore = 1; - let IsTruncStore = 0; } let AddressSpaces = StoreAddress_local.AddrSpaces in { defm atomic_cmp_swap_local : ternary_atomic_op<atomic_cmp_swap>; -defm atomic_cmp_swap_local_m0 : ternary_atomic_op<atomic_cmp_swap_glue>; +defm atomic_cmp_swap_local : ret_noret_ternary_atomic_op<atomic_cmp_swap>; +defm atomic_cmp_swap_local_m0 : ret_noret_ternary_atomic_op<atomic_cmp_swap_glue>; } let AddressSpaces = StoreAddress_region.AddrSpaces in { -defm atomic_cmp_swap_region : ternary_atomic_op<atomic_cmp_swap>; -defm atomic_cmp_swap_region_m0 : ternary_atomic_op<atomic_cmp_swap_glue>; +defm atomic_cmp_swap_region : ret_noret_ternary_atomic_op<atomic_cmp_swap>; +defm atomic_cmp_swap_region_m0 : ret_noret_ternary_atomic_op<atomic_cmp_swap_glue>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 645d05aa9238..01a3e78ea48c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -26,6 +26,7 @@ #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsR600.h" #define DEBUG_TYPE "amdgpu-legalinfo" @@ -134,7 +135,6 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { static LLT getBitcastRegisterType(const LLT Ty) { const unsigned Size = Ty.getSizeInBits(); - LLT CoercedTy; if (Size <= 32) { // <2 x s8> -> s16 // <4 x s8> -> s32 @@ -530,13 +530,22 @@ 
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { // Full set of gfx9 features. - getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + getActionDefinitionsBuilder({G_ADD, G_SUB}) .legalFor({S32, S16, V2S16}) + .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) .minScalar(0, S16) + .widenScalarToNextMultipleOf(0, 32) + .maxScalar(0, S32); + + getActionDefinitionsBuilder(G_MUL) + .legalFor({S32, S16, V2S16}) .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) + .minScalar(0, S16) .widenScalarToNextMultipleOf(0, 32) - .maxScalar(0, S32) - .scalarize(0); + .custom(); + assert(ST.hasMad64_32()); getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) .legalFor({S32, S16, V2S16}) // Clamp modifier @@ -546,13 +555,21 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0, 32) .lower(); } else if (ST.has16BitInsts()) { - getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + getActionDefinitionsBuilder({G_ADD, G_SUB}) .legalFor({S32, S16}) .minScalar(0, S16) .widenScalarToNextMultipleOf(0, 32) .maxScalar(0, S32) .scalarize(0); + getActionDefinitionsBuilder(G_MUL) + .legalFor({S32, S16}) + .scalarize(0) + .minScalar(0, S16) + .widenScalarToNextMultipleOf(0, 32) + .custom(); + assert(ST.hasMad64_32()); + // Technically the saturating operations require clamp bit support, but this // was introduced at the same time as 16-bit operations. getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) @@ -569,12 +586,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .lower(); } else { - getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + getActionDefinitionsBuilder({G_ADD, G_SUB}) .legalFor({S32}) .widenScalarToNextMultipleOf(0, 32) .clampScalar(0, S32, S32) .scalarize(0); + auto &Mul = getActionDefinitionsBuilder(G_MUL) + .legalFor({S32}) + .scalarize(0) + .minScalar(0, S32) + .widenScalarToNextMultipleOf(0, 32); + + if (ST.hasMad64_32()) + Mul.custom(); + else + Mul.maxScalar(0, S32); + if (ST.hasIntClamp()) { getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) .legalFor({S32}) // Clamp modifier. @@ -632,7 +660,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) .legalFor({{S32, S1}, {S32, S32}}) .minScalar(0, S32) - // TODO: .scalarize(0) + .scalarize(0) .lower(); getActionDefinitionsBuilder(G_BITCAST) @@ -767,13 +795,24 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) .scalarize(0); - getActionDefinitionsBuilder(G_FSUB) + auto &FSubActions = getActionDefinitionsBuilder(G_FSUB); + if (ST.has16BitInsts()) { + FSubActions + // Use actual fsub instruction + .legalFor({S32, S16}) + // Must use fadd + fneg + .lowerFor({S64, V2S16}); + } else { + FSubActions // Use actual fsub instruction .legalFor({S32}) // Must use fadd + fneg - .lowerFor({S64, S16, V2S16}) - .scalarize(0) - .clampScalar(0, S32, S64); + .lowerFor({S64, S16, V2S16}); + } + + FSubActions + .scalarize(0) + .clampScalar(0, S32, S64); // Whether this is legal depends on the floating point mode for the function. 
auto &FMad = getActionDefinitionsBuilder(G_FMAD); @@ -839,6 +878,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .lower(); + getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) + .customFor({S16, S32}) + .scalarize(0) + .lower(); + // Lower roundeven into G_FRINT getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) .scalarize(0) @@ -1292,6 +1336,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); if (ST.hasGFX90AInsts()) Atomic.legalFor({{S64, LocalPtr}}); + if (ST.hasGFX940Insts()) + Atomic.legalFor({{V2S16, LocalPtr}}); } if (ST.hasAtomicFaddInsts()) Atomic.legalFor({{S32, GlobalPtr}}); @@ -1505,7 +1551,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampMaxNumElements(1, S16, 2) // TODO: Make 4? .clampMaxNumElements(0, S16, 64); - // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse + // TODO: Don't fully scalarize v2s16 pieces? Or combine out those // pre-legalize. if (ST.hasVOP3PInsts()) { getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) @@ -1756,9 +1802,13 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, return legalizeFFloor(MI, MRI, B); case TargetOpcode::G_BUILD_VECTOR: return legalizeBuildVector(MI, MRI, B); + case TargetOpcode::G_MUL: + return legalizeMul(Helper, MI); case TargetOpcode::G_CTLZ: case TargetOpcode::G_CTTZ: return legalizeCTLZ_CTTZ(MI, MRI, B); + case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: + return legalizeFPTruncRound(MI, B); default: return false; } @@ -1801,6 +1851,39 @@ Register AMDGPULegalizerInfo::getSegmentAperture( return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); } + // TODO: can we be smarter about machine pointer info? + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + Register LoadAddr = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); + // For code object version 5, private_base and shared_base are passed through + // implicit kernargs. + if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + AMDGPUTargetLowering::ImplicitParameter Param = + AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE + : AMDGPUTargetLowering::PRIVATE_BASE; + uint64_t Offset = + ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); + + Register KernargPtrReg = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); + + if (!loadInputValue(KernargPtrReg, B, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) + return Register(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + LLT::scalar(32), commonAlignment(Align(64), Offset)); + + // Pointer address + B.buildPtrAdd(LoadAddr, KernargPtrReg, + B.buildConstant(LLT::scalar(64), Offset).getReg(0)); + // Load address + return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); + } + Register QueuePtr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); @@ -1811,17 +1894,14 @@ Register AMDGPULegalizerInfo::getSegmentAperture( // private_segment_aperture_base_hi. uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; - // TODO: can we be smarter about machine pointer info? 
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, LLT::scalar(32), commonAlignment(Align(64), StructOffset)); - Register LoadAddr; - - B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); + B.buildPtrAdd(LoadAddr, QueuePtr, + B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); } @@ -1872,31 +1952,9 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( return true; } - if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { - // Truncate. - B.buildExtract(Dst, Src, 0); - MI.eraseFromParent(); - return true; - } - - if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { - const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - uint32_t AddrHiVal = Info->get32BitAddressHighBits(); - - // FIXME: This is a bit ugly due to creating a merge of 2 pointers to - // another. Merge operands are required to be the same type, but creating an - // extra ptrtoint would be kind of pointless. - auto HighAddr = B.buildConstant( - LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); - B.buildMerge(Dst, {Src, HighAddr}); - MI.eraseFromParent(); - return true; - } - - if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { - assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || - DestAS == AMDGPUAS::PRIVATE_ADDRESS); - + if (SrcAS == AMDGPUAS::FLAT_ADDRESS && + (DestAS == AMDGPUAS::LOCAL_ADDRESS || + DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { if (isKnownNonNull(Src, MRI, TM, SrcAS)) { // Extract low 32-bits of the pointer. B.buildExtract(Dst, Src, 0); @@ -1920,37 +1978,70 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( return true; } - if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) - return false; + if (DestAS == AMDGPUAS::FLAT_ADDRESS && + (SrcAS == AMDGPUAS::LOCAL_ADDRESS || + SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { + if (!ST.hasFlatAddressSpace()) + return false; - if (!ST.hasFlatAddressSpace()) - return false; + Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); + if (!ApertureReg.isValid()) + return false; - Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); - if (!ApertureReg.isValid()) - return false; + // Coerce the type of the low half of the result so we can use merge_values. + Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); + + // TODO: Should we allow mismatched types but matching sizes in merges to + // avoid the ptrtoint? + auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); + + if (isKnownNonNull(Src, MRI, TM, SrcAS)) { + B.buildCopy(Dst, BuildPtr); + MI.eraseFromParent(); + return true; + } + + auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); + auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); - // Coerce the type of the low half of the result so we can use merge_values. - Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); + auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, + SegmentNull.getReg(0)); - // TODO: Should we allow mismatched types but matching sizes in merges to - // avoid the ptrtoint? 
- auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); + B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); - if (isKnownNonNull(Src, MRI, TM, SrcAS)) { - B.buildCopy(Dst, BuildPtr); MI.eraseFromParent(); return true; } - auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); - auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); + if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && + SrcTy.getSizeInBits() == 64) { + // Truncate. + B.buildExtract(Dst, Src, 0); + MI.eraseFromParent(); + return true; + } + + if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && + DstTy.getSizeInBits() == 64) { + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + uint32_t AddrHiVal = Info->get32BitAddressHighBits(); - auto CmpRes = - B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); + // FIXME: This is a bit ugly due to creating a merge of 2 pointers to + // another. Merge operands are required to be the same type, but creating an + // extra ptrtoint would be kind of pointless. + auto HighAddr = B.buildConstant( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); + B.buildMerge(Dst, {Src, HighAddr}); + MI.eraseFromParent(); + return true; + } - B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); + DiagnosticInfoUnsupported InvalidAddrSpaceCast( + MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); + LLVMContext &Ctx = MF.getFunction().getContext(); + Ctx.diagnose(InvalidAddrSpaceCast); + B.buildUndef(Dst); MI.eraseFromParent(); return true; } @@ -2811,6 +2902,298 @@ bool AMDGPULegalizerInfo::legalizeBuildVector( return true; } +// Build a big integer multiply or multiply-add using MAD_64_32 instructions. +// +// Source and accumulation registers must all be 32-bits. +// +// TODO: When the multiply is uniform, we should produce a code sequence +// that is better suited to instruction selection on the SALU. Instead of +// the outer loop going over parts of the result, the outer loop should go +// over parts of one of the factors. This should result in instruction +// selection that makes full use of S_ADDC_U32 instructions. +void AMDGPULegalizerInfo::buildMultiply( + LegalizerHelper &Helper, MutableArrayRef<Register> Accum, + ArrayRef<Register> Src0, ArrayRef<Register> Src1, + bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const { + // Use (possibly empty) vectors of S1 registers to represent the set of + // carries from one pair of positions to the next. + using Carry = SmallVector<Register, 2>; + + MachineIRBuilder &B = Helper.MIRBuilder; + + const LLT S1 = LLT::scalar(1); + const LLT S32 = LLT::scalar(32); + const LLT S64 = LLT::scalar(64); + + Register Zero32; + Register Zero64; + + auto getZero32 = [&]() -> Register { + if (!Zero32) + Zero32 = B.buildConstant(S32, 0).getReg(0); + return Zero32; + }; + auto getZero64 = [&]() -> Register { + if (!Zero64) + Zero64 = B.buildConstant(S64, 0).getReg(0); + return Zero64; + }; + + // Merge the given carries into the 32-bit LocalAccum, which is modified + // in-place. + // + // Returns the carry-out, which is a single S1 register or null. 
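// E.g. (illustrative): merging two carries c0 and c1 into LocalAccum a emits
//   %t:_(s32)             = G_ZEXT %c0
//   %a2:_(s32), %co:_(s1) = G_UADDE %t, %a, %c1
// folding both carries into one 32-bit add chain, with at most a single
// carry-out surviving into the next column.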
+ auto mergeCarry = + [&](Register &LocalAccum, const Carry &CarryIn) -> Register { + if (CarryIn.empty()) + return Register(); + + bool HaveCarryOut = true; + Register CarryAccum; + if (CarryIn.size() == 1) { + if (!LocalAccum) { + LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); + return Register(); + } + + CarryAccum = getZero32(); + } else { + CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); + for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { + CarryAccum = + B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) + .getReg(0); + } + + if (!LocalAccum) { + LocalAccum = getZero32(); + HaveCarryOut = false; + } + } + + auto Add = + B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); + LocalAccum = Add.getReg(0); + return HaveCarryOut ? Add.getReg(1) : Register(); + }; + + // Build a multiply-add chain to compute + // + // LocalAccum + (partial products at DstIndex) + // + (opportunistic subset of CarryIn) + // + // LocalAccum is an array of one or two 32-bit registers that are updated + // in-place. The incoming registers may be null. + // + // In some edge cases, carry-ins can be consumed "for free". In that case, + // the consumed carry bits are removed from CarryIn in-place. + auto buildMadChain = + [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) + -> Carry { + assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || + (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); + + Carry CarryOut; + unsigned j0 = 0; + + // Use plain 32-bit multiplication for the most significant part of the + // result by default. + if (LocalAccum.size() == 1 && + (!UsePartialMad64_32 || !CarryIn.empty())) { + do { + unsigned j1 = DstIndex - j0; + auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); + if (!LocalAccum[0]) { + LocalAccum[0] = Mul.getReg(0); + } else { + if (CarryIn.empty()) { + LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); + } else { + LocalAccum[0] = + B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) + .getReg(0); + CarryIn.pop_back(); + } + } + ++j0; + } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); + } + + // Build full 64-bit multiplies. + if (j0 <= DstIndex) { + bool HaveSmallAccum = false; + Register Tmp; + + if (LocalAccum[0]) { + if (LocalAccum.size() == 1) { + Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); + HaveSmallAccum = true; + } else if (LocalAccum[1]) { + Tmp = B.buildMerge(S64, LocalAccum).getReg(0); + HaveSmallAccum = false; + } else { + Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); + HaveSmallAccum = true; + } + } else { + assert(LocalAccum.size() == 1 || !LocalAccum[1]); + Tmp = getZero64(); + HaveSmallAccum = true; + } + + do { + unsigned j1 = DstIndex - j0; + auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, + {Src0[j0], Src1[j1], Tmp}); + Tmp = Mad.getReg(0); + if (!HaveSmallAccum) + CarryOut.push_back(Mad.getReg(1)); + HaveSmallAccum = false; + ++j0; + } while (j0 <= DstIndex); + + auto Unmerge = B.buildUnmerge(S32, Tmp); + LocalAccum[0] = Unmerge.getReg(0); + if (LocalAccum.size() > 1) + LocalAccum[1] = Unmerge.getReg(1); + } + + return CarryOut; + }; + + // Outer multiply loop, iterating over destination parts from least + // significant to most significant parts. + // + // The columns of the following diagram correspond to the destination parts + // affected by one iteration of the outer loop (ignoring boundary + // conditions). 
+ // + // Dest index relative to 2 * i: 1 0 -1 + // ------ + // Carries from previous iteration: e o + // Even-aligned partial product sum: E E . + // Odd-aligned partial product sum: O O + // + // 'o' is OddCarry, 'e' is EvenCarry. + // EE and OO are computed from partial products via buildMadChain and use + // accumulation where possible and appropriate. + // + Register SeparateOddCarry; + Carry EvenCarry; + Carry OddCarry; + + for (unsigned i = 0; i <= Accum.size() / 2; ++i) { + Carry OddCarryIn = std::move(OddCarry); + Carry EvenCarryIn = std::move(EvenCarry); + OddCarry.clear(); + EvenCarry.clear(); + + // Partial products at offset 2 * i. + if (2 * i < Accum.size()) { + auto LocalAccum = Accum.drop_front(2 * i).take_front(2); + EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); + } + + // Partial products at offset 2 * i - 1. + if (i > 0) { + if (!SeparateOddAlignedProducts) { + auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); + OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); + } else { + bool IsHighest = 2 * i >= Accum.size(); + Register SeparateOddOut[2]; + auto LocalAccum = makeMutableArrayRef(SeparateOddOut) + .take_front(IsHighest ? 1 : 2); + OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); + + MachineInstr *Lo; + + if (i == 1) { + if (!IsHighest) + Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); + else + Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); + } else { + Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], + SeparateOddCarry); + } + Accum[2 * i - 1] = Lo->getOperand(0).getReg(); + + if (!IsHighest) { + auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], + Lo->getOperand(1).getReg()); + Accum[2 * i] = Hi.getReg(0); + SeparateOddCarry = Hi.getReg(1); + } + } + } + + // Add in the carries from the previous iteration + if (i > 0) { + if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) + EvenCarryIn.push_back(CarryOut); + + if (2 * i < Accum.size()) { + if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) + OddCarry.push_back(CarryOut); + } + } + } +} + +// Custom narrowing of wide multiplies using wide multiply-add instructions. +// +// TODO: If the multiply is followed by an addition, we should attempt to +// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. +bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, + MachineInstr &MI) const { + assert(ST.hasMad64_32()); + assert(MI.getOpcode() == TargetOpcode::G_MUL); + + MachineIRBuilder &B = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *B.getMRI(); + + Register DstReg = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); + + LLT Ty = MRI.getType(DstReg); + assert(Ty.isScalar()); + + unsigned Size = Ty.getSizeInBits(); + unsigned NumParts = Size / 32; + assert((Size % 32) == 0); + assert(NumParts >= 2); + + // Whether to use MAD_64_32 for partial products whose high half is + // discarded. This avoids some ADD instructions but risks false dependency + // stalls on some subtargets in some cases. + const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; + + // Whether to compute odd-aligned partial products separately. This is + // advisable on subtargets where the accumulator of MAD_64_32 must be placed + // in an even-aligned VGPR. 
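// Worked example (illustrative): a 64 x 64 -> 64 bit G_MUL with 32-bit limbs
// Src0 = (a1, a0), Src1 = (b1, b0) and result Accum = (d1, d0) computes
//   (d1:d0) = mad_u64_u32(a0, b0, 0)   // column 0 as a single 64-bit mad
//   d1 += a0 * b1 + a1 * b0            // column 1; carries out of d1 discarded
// Depending on UsePartialMad64_32, the column-1 products are emitted as plain
// 32-bit multiplies or as further mads whose high halves go unused.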
+ const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); + + LLT S32 = LLT::scalar(32); + SmallVector<Register, 2> Src0Parts, Src1Parts; + for (unsigned i = 0; i < NumParts; ++i) { + Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); + Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); + } + B.buildUnmerge(Src0Parts, Src0); + B.buildUnmerge(Src1Parts, Src1); + + SmallVector<Register, 2> AccumRegs(NumParts); + buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, + SeparateOddAlignedProducts); + + B.buildMerge(DstReg, AccumRegs); + MI.eraseFromParent(); + return true; + +} + // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input // case with a single min instruction instead of a compare+select. @@ -2954,6 +3337,89 @@ bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( return true; } +static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, + int64_t C) { + B.buildConstant(MI.getOperand(0).getReg(), C); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, + unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { + unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); + if (MaxID == 0) + return replaceWithConstant(B, MI, 0); + + const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + const ArgDescriptor *Arg; + const TargetRegisterClass *ArgRC; + LLT ArgTy; + std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); + + Register DstReg = MI.getOperand(0).getReg(); + if (!Arg) { + // It's undefined behavior if a function marked with the amdgpu-no-* + // attributes uses the corresponding intrinsic. + B.buildUndef(DstReg); + MI.eraseFromParent(); + return true; + } + + if (Arg->isMasked()) { + // Don't bother inserting AssertZext for packed IDs since we're emitting the + // masking operations anyway. + // + // TODO: We could assert the top bit is 0 for the source copy. + if (!loadInputValue(DstReg, B, ArgType)) + return false; + } else { + Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); + if (!loadInputValue(TmpReg, B, ArgType)) + return false; + B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID)); + } + + MI.eraseFromParent(); + return true; +} + +Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, + int64_t Offset) const { + LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); + + // TODO: If we passed in the base kernel offset we could have a better + // alignment than 4, but we don't really need it. + if (!loadInputValue(KernArgReg, B, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) + llvm_unreachable("failed to find kernarg segment ptr"); + + auto COffset = B.buildConstant(LLT::scalar(64), Offset); + // TODO: Should get nuw + return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); +} + +/// Legalize a value that's loaded from kernel arguments. This is only used by +/// legacy intrinsics. 
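// A minimal sketch of the MIR this emits (virtual register names
// hypothetical):
//   %kernarg:_(p4) = <KERNARG_SEGMENT_PTR input>
//   %off:_(s64)    = G_CONSTANT i64 Offset
//   %ptr:_(p4)     = G_PTR_ADD %kernarg, %off
//   %dst:_(s32)    = G_LOAD %ptr :: (dereferenceable invariant load (s32))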
+bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, + MachineIRBuilder &B, + uint64_t Offset, + Align Alignment) const { + Register DstReg = MI.getOperand(0).getReg(); + + assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && + "unexpected kernarg parameter type"); + + Register Ptr = getKernargParameterPtr(B, Offset); + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -3688,9 +4154,9 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, // The remaining operands were used to set fields in the MemOperand on // construction. for (int I = 6; I > 3; --I) - MI.RemoveOperand(I); + MI.removeOperand(I); - MI.RemoveOperand(1); // Remove the intrinsic ID. + MI.removeOperand(1); // Remove the intrinsic ID. Observer.changedInstr(MI); return true; } @@ -4359,7 +4825,7 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, /// /// We don't want to directly select image instructions just yet, but also want /// to exposes all register repacking to the legalizer/combiners. We also don't -/// want a selected instrution entering RegBankSelect. In order to avoid +/// want a selected instruction entering RegBankSelect. In order to avoid /// defining a multitude of intermediate image instructions, directly hack on /// the intrinsic's arguments. In cases like a16 addresses, this requires /// padding now unnecessary arguments with $noreg. @@ -4508,6 +4974,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( // // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. + // + // TODO: we can actually allow partial NSA where the final register is a + // contiguous set of the remaining addresses. + // This could help where there are more addresses than supported. const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && CorrectedNumVAddrs <= ST.getNSAMaxSize(); @@ -4607,7 +5077,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( return false; // TODO: Make sure the TFE operand bit is set. - MI.RemoveOperand(1); + MI.removeOperand(1); // Handle the easy case that requires no repack instructions. if (Ty == S32) { @@ -4737,7 +5207,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( // should be fixed to have a memory operand. Since it's readnone, we're not // allowed to add one. MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); - MI.RemoveOperand(1); // Remove intrinsic ID + MI.removeOperand(1); // Remove intrinsic ID // FIXME: When intrinsic definition is fixed, this should have an MMO already. // TODO: Should this use datalayout alignment? @@ -4797,6 +5267,47 @@ bool AMDGPULegalizerInfo::legalizeTrapEndpgm( bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { + MachineFunction &MF = B.getMF(); + const LLT S64 = LLT::scalar(64); + + Register SGPR01(AMDGPU::SGPR0_SGPR1); + // For code object version 5, queue_ptr is passed through implicit kernarg. 
+ if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + AMDGPUTargetLowering::ImplicitParameter Param = + AMDGPUTargetLowering::QUEUE_PTR; + uint64_t Offset = + ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); + + Register KernargPtrReg = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); + + if (!loadInputValue(KernargPtrReg, B, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) + return false; + + // TODO: can we be smarter about machine pointer info? + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + LLT::scalar(64), commonAlignment(Align(64), Offset)); + + // Pointer address + Register LoadAddr = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); + B.buildPtrAdd(LoadAddr, KernargPtrReg, + B.buildConstant(LLT::scalar(64), Offset).getReg(0)); + // Load address + Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); + B.buildCopy(SGPR01, Temp); + B.buildInstr(AMDGPU::S_TRAP) + .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) + .addReg(SGPR01, RegState::Implicit); + MI.eraseFromParent(); + return true; + } + // Pass queue pointer to trap handler as input, and insert trap instruction // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi Register LiveIn = @@ -4804,7 +5315,6 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) return false; - Register SGPR01(AMDGPU::SGPR0_SGPR1); B.buildCopy(SGPR01, LiveIn); B.buildInstr(AMDGPU::S_TRAP) .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) @@ -4848,6 +5358,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI = *B.getMRI(); const LLT S16 = LLT::scalar(16); const LLT S32 = LLT::scalar(32); + const LLT V2S16 = LLT::fixed_vector(2, 16); + const LLT V3S32 = LLT::fixed_vector(3, 32); Register DstReg = MI.getOperand(0).getReg(); Register NodePtr = MI.getOperand(2).getReg(); @@ -4865,61 +5377,98 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, return false; } + const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; const unsigned NumVDataDwords = 4; const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); - const bool UseNSA = - ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); + const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; + const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); const unsigned BaseOpcodes[2][2] = { {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; int Opcode; if (UseNSA) { - Opcode = - AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, - NumVDataDwords, NumVAddrDwords); - } else { Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], - AMDGPU::MIMGEncGfx10Default, NumVDataDwords, - PowerOf2Ceil(NumVAddrDwords)); + IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA + : AMDGPU::MIMGEncGfx10NSA, + NumVDataDwords, NumVAddrDwords); + } else { + Opcode = AMDGPU::getMIMGOpcode( + BaseOpcodes[Is64][IsA16], + IsGFX11Plus ? 
AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default, + NumVDataDwords, PowerOf2Ceil(NumVAddrDwords)); } assert(Opcode != -1); SmallVector<Register, 12> Ops; - if (Is64) { - auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); - Ops.push_back(Unmerge.getReg(0)); - Ops.push_back(Unmerge.getReg(1)); - } else { - Ops.push_back(NodePtr); - } - Ops.push_back(RayExtent); + if (UseNSA && IsGFX11Plus) { + auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { + auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); + auto Merged = B.buildMerge( + V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); + Ops.push_back(Merged.getReg(0)); + }; - auto packLanes = [&Ops, &S32, &B](Register Src) { - auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); - Ops.push_back(Unmerge.getReg(0)); - Ops.push_back(Unmerge.getReg(1)); - Ops.push_back(Unmerge.getReg(2)); - }; + Ops.push_back(NodePtr); + Ops.push_back(RayExtent); + packLanes(RayOrigin); - packLanes(RayOrigin); - if (IsA16) { - auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); - auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); - Register R1 = MRI.createGenericVirtualRegister(S32); - Register R2 = MRI.createGenericVirtualRegister(S32); - Register R3 = MRI.createGenericVirtualRegister(S32); - B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); - B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); - B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); - Ops.push_back(R1); - Ops.push_back(R2); - Ops.push_back(R3); + if (IsA16) { + auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); + auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); + auto MergedDir = B.buildMerge( + V3S32, + {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0), + UnmergeRayDir.getReg(0)})) + .getReg(0), + B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1), + UnmergeRayDir.getReg(1)})) + .getReg(0), + B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2), + UnmergeRayDir.getReg(2)})) + .getReg(0)}); + Ops.push_back(MergedDir.getReg(0)); + } else { + packLanes(RayDir); + packLanes(RayInvDir); + } } else { - packLanes(RayDir); - packLanes(RayInvDir); + if (Is64) { + auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); + Ops.push_back(Unmerge.getReg(0)); + Ops.push_back(Unmerge.getReg(1)); + } else { + Ops.push_back(NodePtr); + } + Ops.push_back(RayExtent); + + auto packLanes = [&Ops, &S32, &B](Register Src) { + auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); + Ops.push_back(Unmerge.getReg(0)); + Ops.push_back(Unmerge.getReg(1)); + Ops.push_back(Unmerge.getReg(2)); + }; + + packLanes(RayOrigin); + if (IsA16) { + auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); + auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); + Register R1 = MRI.createGenericVirtualRegister(S32); + Register R2 = MRI.createGenericVirtualRegister(S32); + Register R3 = MRI.createGenericVirtualRegister(S32); + B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); + B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); + B.buildMerge(R3, + {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); + Ops.push_back(R1); + Ops.push_back(R2); + Ops.push_back(R3); + } else { + packLanes(RayDir); + packLanes(RayInvDir); + } } if (!UseNSA) { @@ -4946,9 +5495,24 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, return true; } -static bool 
replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) { - B.buildConstant(MI.getOperand(0).getReg(), C); +bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, + MachineIRBuilder &B) const { + unsigned Opc; + int RoundMode = MI.getOperand(2).getImm(); + + if (RoundMode == (int)RoundingMode::TowardPositive) + Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD; + else if (RoundMode == (int)RoundingMode::TowardNegative) + Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD; + else + return false; + + B.buildInstr(Opc) + .addDef(MI.getOperand(0).getReg()) + .addUse(MI.getOperand(1).getReg()); + MI.eraseFromParent(); + return true; } @@ -5055,22 +5619,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_implicitarg_ptr: return legalizeImplicitArgPtr(MI, MRI, B); case Intrinsic::amdgcn_workitem_id_x: - if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0) - return replaceWithConstant(B, MI, 0); - return legalizePreloadedArgIntrin(MI, MRI, B, - AMDGPUFunctionArgInfo::WORKITEM_ID_X); + return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0, + AMDGPUFunctionArgInfo::WORKITEM_ID_X); case Intrinsic::amdgcn_workitem_id_y: - if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0) - return replaceWithConstant(B, MI, 0); - - return legalizePreloadedArgIntrin(MI, MRI, B, - AMDGPUFunctionArgInfo::WORKITEM_ID_Y); + return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1, + AMDGPUFunctionArgInfo::WORKITEM_ID_Y); case Intrinsic::amdgcn_workitem_id_z: - if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0) - return replaceWithConstant(B, MI, 0); - - return legalizePreloadedArgIntrin(MI, MRI, B, - AMDGPUFunctionArgInfo::WORKITEM_ID_Z); + return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2, + AMDGPUFunctionArgInfo::WORKITEM_ID_Z); case Intrinsic::amdgcn_workgroup_id_x: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_X); @@ -5092,6 +5648,31 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_dispatch_id: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::DISPATCH_ID); + case Intrinsic::r600_read_ngroups_x: + // TODO: Emit error for hsa + return legalizeKernargMemParameter(MI, B, + SI::KernelInputOffsets::NGROUPS_X); + case Intrinsic::r600_read_ngroups_y: + return legalizeKernargMemParameter(MI, B, + SI::KernelInputOffsets::NGROUPS_Y); + case Intrinsic::r600_read_ngroups_z: + return legalizeKernargMemParameter(MI, B, + SI::KernelInputOffsets::NGROUPS_Z); + case Intrinsic::r600_read_local_size_x: + // TODO: Could insert G_ASSERT_ZEXT from s16 + return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X); + case Intrinsic::r600_read_local_size_y: + // TODO: Could insert G_ASSERT_ZEXT from s16 + return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y); + // TODO: Could insert G_ASSERT_ZEXT from s16 + case Intrinsic::r600_read_local_size_z: + return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z); + case Intrinsic::r600_read_global_size_x: + return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X); + case Intrinsic::r600_read_global_size_y: + return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y); + case Intrinsic::r600_read_global_size_z: + return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z); case Intrinsic::amdgcn_fdiv_fast: return legalizeFDIVFastIntrin(MI, MRI, B); case Intrinsic::amdgcn_is_shared: @@ -5157,7 +5738,8 @@ bool 
AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: { Register DstReg = MI.getOperand(0).getReg(); - if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) { + if (!MRI.use_empty(DstReg) && + !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) { Function &F = B.getMF().getFunction(); DiagnosticInfoUnsupported NoFpRet( F, "return versions of fp atomics not supported", B.getDebugLoc(), diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 964a41d3d740..cee533aa34ec 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -88,6 +88,12 @@ public: bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + + void buildMultiply(LegalizerHelper &Helper, MutableArrayRef<Register> Accum, + ArrayRef<Register> Src0, ArrayRef<Register> Src1, + bool UsePartialMad64_32, + bool SeparateOddAlignedProducts) const; + bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const; bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; @@ -96,9 +102,18 @@ public: const TargetRegisterClass *ArgRC, LLT ArgTy) const; bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + bool legalizePreloadedArgIntrin( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + bool legalizeWorkitemIDIntrinsic( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, + unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + + Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const; + bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, + uint64_t Offset, + Align Alignment = Align(4)) const; bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; @@ -169,6 +184,8 @@ public: bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeImageIntrinsic( MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index bbbadfdfd444..78e092b2e872 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -1593,8 +1593,9 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { // max vector size is 16, and sincos will generate two results. double DVal0[16], DVal1[16]; + int FuncVecSize = getVecSize(FInfo); bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS); - if (getVecSize(FInfo) == 1) { + if (FuncVecSize == 1) { if (!evaluateScalarMathFunc(FInfo, DVal0[0], DVal1[0], copr0, copr1, copr2)) { return false; @@ -1603,7 +1604,7 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(copr0); ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(copr1); ConstantDataVector *CDV2 = dyn_cast_or_null<ConstantDataVector>(copr2); - for (int i=0; i < getVecSize(FInfo); ++i) { + for (int i = 0; i < FuncVecSize; ++i) { Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr; Constant *celt1 = CDV1 ? 
CDV1->getElementAsConstant(i) : nullptr; Constant *celt2 = CDV2 ? CDV2->getElementAsConstant(i) : nullptr; @@ -1616,19 +1617,19 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { LLVMContext &context = CI->getParent()->getParent()->getContext(); Constant *nval0, *nval1; - if (getVecSize(FInfo) == 1) { + if (FuncVecSize == 1) { nval0 = ConstantFP::get(CI->getType(), DVal0[0]); if (hasTwoResults) nval1 = ConstantFP::get(CI->getType(), DVal1[0]); } else { if (getArgType(FInfo) == AMDGPULibFunc::F32) { SmallVector <float, 0> FVal0, FVal1; - for (int i=0; i < getVecSize(FInfo); ++i) + for (int i = 0; i < FuncVecSize; ++i) FVal0.push_back((float)DVal0[i]); ArrayRef<float> tmp0(FVal0); nval0 = ConstantDataVector::get(context, tmp0); if (hasTwoResults) { - for (int i=0; i < getVecSize(FInfo); ++i) + for (int i = 0; i < FuncVecSize; ++i) FVal1.push_back((float)DVal1[i]); ArrayRef<float> tmp1(FVal1); nval1 = ConstantDataVector::get(context, tmp1); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h index dc0ac72016f3..bf0fda25b2c0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h @@ -324,8 +324,8 @@ public: class AMDGPULibFuncImpl : public AMDGPULibFuncBase { public: - AMDGPULibFuncImpl() {} - virtual ~AMDGPULibFuncImpl() {} + AMDGPULibFuncImpl() = default; + virtual ~AMDGPULibFuncImpl() = default; /// Get unmangled name for mangled library function and name for unmangled /// library function. diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp index b700dd5aa301..93d1eed2cf63 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -13,7 +13,6 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" @@ -156,11 +155,8 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) { Changed = true; break; - case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::r600_read_tidig_x: - case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::r600_read_tidig_y: - case Intrinsic::amdgcn_workitem_id_z: case Intrinsic::r600_read_tidig_z: case Intrinsic::r600_read_local_size_x: case Intrinsic::r600_read_local_size_y: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index c34c12ab9fec..2e5c35f1f571 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -73,7 +73,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F); Align MaxAlign; - // FIXME: Alignment is broken broken with explicit arg offset.; + // FIXME: Alignment is broken with explicit arg offset. const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign); if (TotalKernArgSize == 0) return false; @@ -92,9 +92,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { for (Argument &Arg : F.args()) { const bool IsByRef = Arg.hasByRefAttr(); Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); - MaybeAlign ABITypeAlign = IsByRef ? Arg.getParamAlign() : None; - if (!ABITypeAlign) - ABITypeAlign = DL.getABITypeAlign(ArgTy); + MaybeAlign ParamAlign = IsByRef ? 
Arg.getParamAlign() : None; + Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy); uint64_t Size = DL.getTypeSizeInBits(ArgTy); uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index 08a1b970648d..f5903b3afb81 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -163,39 +163,29 @@ static bool processUse(CallInst *CI) { if (!GroupSize || !GridSize) continue; + using namespace llvm::PatternMatch; + auto GroupIDIntrin = + I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() + : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() + : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>()); + for (User *U : GroupSize->users()) { auto *ZextGroupSize = dyn_cast<ZExtInst>(U); if (!ZextGroupSize) continue; - for (User *ZextUser : ZextGroupSize->users()) { - auto *SI = dyn_cast<SelectInst>(ZextUser); - if (!SI) - continue; - - using namespace llvm::PatternMatch; - auto GroupIDIntrin = I == 0 ? - m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() : - (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() : - m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>()); - - auto SubExpr = m_Sub(m_Specific(GridSize), - m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))); - - ICmpInst::Predicate Pred; - if (match(SI, - m_Select(m_ICmp(Pred, SubExpr, m_Specific(ZextGroupSize)), - SubExpr, - m_Specific(ZextGroupSize))) && - Pred == ICmpInst::ICMP_ULT) { + for (User *UMin : ZextGroupSize->users()) { + if (match(UMin, + m_UMin(m_Sub(m_Specific(GridSize), + m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))), + m_Specific(ZextGroupSize)))) { if (HasReqdWorkGroupSize) { ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I)); - SI->replaceAllUsesWith(ConstantExpr::getIntegerCast(KnownSize, - SI->getType(), - false)); + UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast( + KnownSize, UMin->getType(), false)); } else { - SI->replaceAllUsesWith(ZextGroupSize); + UMin->replaceAllUsesWith(ZextGroupSize); } MadeChange = true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 6e2b5dc471bc..35922341de26 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -14,7 +14,7 @@ // known address. AMDGPUMachineFunction allocates the LDS global. // // Local variables with constant annotation or non-undef initializer are passed -// through unchanged for simplication or error diagnostics in later passes. +// through unchanged for simplification or error diagnostics in later passes. // // To reduce the memory overhead variables that are only used by kernels are // excluded from this transform. 
The analysis to determine whether a variable @@ -28,8 +28,9 @@ #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" -#include "Utils/AMDGPULDSUtils.h" +#include "Utils/AMDGPUMemoryUtils.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" @@ -163,9 +164,10 @@ public: } bool runOnModule(Module &M) override { + CallGraph CG = CallGraph(M); UsedList = getUsedList(M); bool Changed = superAlignLDSGlobals(M); - Changed |= processUsedLDS(M); + Changed |= processUsedLDS(CG, M); for (Function &F : M.functions()) { if (F.isDeclaration()) @@ -174,7 +176,7 @@ public: // Only lower compute kernels' LDS. if (!AMDGPU::isKernel(F.getCallingConv())) continue; - Changed |= processUsedLDS(M, &F); + Changed |= processUsedLDS(CG, M, &F); } UsedList.clear(); @@ -226,7 +228,7 @@ private: return Changed; } - bool processUsedLDS(Module &M, Function *F = nullptr) { + bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) { LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); @@ -374,7 +376,20 @@ private: IRBuilder<> Builder(Ctx); for (Function &Func : M.functions()) { if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { - markUsedByKernel(Builder, &Func, SGV); + const CallGraphNode *N = CG[&Func]; + const bool CalleesRequireModuleLDS = N->size() > 0; + + if (CalleesRequireModuleLDS) { + // If a function this kernel might call requires module LDS, + // annotate the kernel to let later passes know it will allocate + // this structure, even if not apparent from the IR. + markUsedByKernel(Builder, &Func, SGV); + } else { + // However if we are certain this kernel cannot call a function that + // requires module LDS, annotate the kernel so the backend can elide + // the allocation without repeating callgraph walks. + Func.addFnAttr("amdgpu-elide-module-lds"); + } } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 3fad7e192195..ed6ddbf426fd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -120,8 +120,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We // need to select it to the subtarget specific version, and there's no way to // do that with a single pseudo source operation. 
- if (Opcode == AMDGPU::S_SETPC_B64_return || - Opcode == AMDGPU::S_SETPC_B64_return_gfx) + if (Opcode == AMDGPU::S_SETPC_B64_return) Opcode = AMDGPU::S_SETPC_B64; else if (Opcode == AMDGPU::SI_CALL) { // SI_CALL is just S_SWAPPC_B64 with an additional operand to track the @@ -208,6 +207,16 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { return; } + if (MI->getOpcode() == AMDGPU::SCHED_BARRIER) { + if (isVerbose()) { + std::string HexString; + raw_string_ostream HexStream(HexString); + HexStream << format_hex(MI->getOperand(0).getImm(), 10, true); + OutStreamer->emitRawComment(" sched_barrier mask(" + HexString + ")"); + } + return; + } + if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) { if (isVerbose()) OutStreamer->emitRawComment(" divergent unreachable"); @@ -240,7 +249,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { raw_svector_ostream CodeStream(CodeBytes); std::unique_ptr<MCCodeEmitter> InstEmitter(createSIMCCodeEmitter( - *STI.getInstrInfo(), *OutContext.getRegisterInfo(), OutContext)); + *STI.getInstrInfo(), OutContext)); InstEmitter->encodeInstruction(TmpInst, CodeStream, Fixups, STI); assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h index 0e43b4fe9461..5c656f158e71 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h @@ -1,4 +1,4 @@ -//===- AMDGPUMCInstLower.h - Lower AMDGPU MachineInstr to an MCInst -------===// +//===- AMDGPUMCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*--===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp index c3441f81a78e..0712466a0e88 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.cpp @@ -21,17 +21,18 @@ bool AMDGPUMIRFormatter::parseCustomPseudoSourceValue( StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS, const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SIInstrInfo &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo(); + const AMDGPUTargetMachine &TM = + static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); if (Src == "BufferResource") { - PSV = MFI->getBufferPSV(TII); + PSV = MFI->getBufferPSV(TM); return false; } if (Src == "ImageResource") { - PSV = MFI->getImagePSV(TII); + PSV = MFI->getImagePSV(TM); return false; } if (Src == "GWSResource") { - PSV = MFI->getGWSPSV(TII); + PSV = MFI->getGWSPSV(TM); return false; } llvm_unreachable("unknown MIR custom pseudo source value"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h index 47faa6c72481..753f7edc9385 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRFormatter.h @@ -25,7 +25,7 @@ struct PerFunctionMIParsingState; class AMDGPUMIRFormatter final : public MIRFormatter { public: - AMDGPUMIRFormatter() {} + AMDGPUMIRFormatter() = default; virtual ~AMDGPUMIRFormatter() = default; /// Implement target specific parsing of target custom pseudo source value. 
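A side note on the SCHED_BARRIER comment emitted above: `format_hex(Imm, 10, true)` renders the mask as `0x` followed by eight upper-case hex digits. A standalone approximation of the comment text (`printSchedBarrierComment` is an illustrative helper, not AsmPrinter code):

```cpp
#include <cstdint>
#include <cstdio>

// Reproduce the shape of the emitted raw comment, e.g. a mask of 0x7
// prints as "; sched_barrier mask(0x00000007)".
static void printSchedBarrierComment(uint32_t Mask) {
  std::printf("; sched_barrier mask(0x%08X)\n", Mask);
}

int main() {
  printSchedBarrierComment(0x7);
  return 0;
}
```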
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 4e2f98d2a5db..d837f8cb2f60 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -1295,7 +1295,7 @@ static void fixRegionTerminator(RegionMRT *Region) { } } -// If a region region is just a sequence of regions (and the exit +// If a region is just a sequence of regions (and the exit // block in the case of the top level region), we can simply skip // linearizing it, because it is already linear bool regionIsSequence(RegionMRT *Region) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 593388a4d819..b461c3c4bfdc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMachineFunction.h" +#include "AMDGPU.h" #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -32,6 +33,15 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter"); WaveLimiter = WaveLimitAttr.getValueAsBool(); + // FIXME: How is this attribute supposed to interact with statically known + // global sizes? + StringRef S = F.getFnAttribute("amdgpu-gds-size").getValueAsString(); + if (!S.empty()) + S.consumeInteger(0, GDSSize); + + // Assume the attribute allocates before any known GDS globals. + StaticGDSSize = GDSSize; + CallingConv::ID CC = F.getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign); @@ -46,25 +56,43 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, Align Alignment = DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType()); - /// TODO: We should sort these to minimize wasted space due to alignment - /// padding. Currently the padding is decided by the first encountered use - /// during lowering. - unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment); + unsigned Offset; + if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + /// TODO: We should sort these to minimize wasted space due to alignment + /// padding. Currently the padding is decided by the first encountered use + /// during lowering. + Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment); - Entry.first->second = Offset; - StaticLDSSize += DL.getTypeAllocSize(GV.getValueType()); + StaticLDSSize += DL.getTypeAllocSize(GV.getValueType()); - // Update the LDS size considering the padding to align the dynamic shared - // memory. - LDSSize = alignTo(StaticLDSSize, DynLDSAlign); + // Update the LDS size considering the padding to align the dynamic shared + // memory. 
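The LDS and GDS branches above share one allocation scheme: round the running static size up to the global's alignment, hand that out as the offset, then advance by the allocation size. A host-side sketch of that bookkeeping (the local `alignTo` reimplements the LLVM helper for power-of-two alignments):

```cpp
#include <cassert>
#include <cstdint>

// alignTo for power-of-two alignments, as used by the allocation above.
static uint32_t alignTo(uint32_t Value, uint32_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

// Place one global: the offset is the aligned running size, which then
// grows by the global's allocation size (the Offset = Size = alignTo(...)
// idiom matches the code above).
static uint32_t allocateGlobal(uint32_t &StaticSize, uint32_t Alignment,
                               uint32_t AllocSize) {
  uint32_t Offset = StaticSize = alignTo(StaticSize, Alignment);
  StaticSize += AllocSize;
  return Offset;
}

int main() {
  uint32_t StaticLDSSize = 0;
  assert(allocateGlobal(StaticLDSSize, 1, 6) == 0); // 6-byte blob at 0
  assert(allocateGlobal(StaticLDSSize, 4, 4) == 8); // padded up to offset 8
  assert(StaticLDSSize == 12);
  return 0;
}
```

The two bytes of padding in the example are exactly the waste the sorting TODO above is about.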
+ LDSSize = alignTo(StaticLDSSize, DynLDSAlign); + } else { + assert(GV.getAddressSpace() == AMDGPUAS::REGION_ADDRESS && + "expected region address space"); + Offset = StaticGDSSize = alignTo(StaticGDSSize, Alignment); + StaticGDSSize += DL.getTypeAllocSize(GV.getValueType()); + + // FIXME: Apply alignment of dynamic GDS + GDSSize = StaticGDSSize; + } + + Entry.first->second = Offset; return Offset; } -void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) { +// This kernel calls no functions that require the module lds struct +static bool canElideModuleLDS(const Function &F) { + return F.hasFnAttribute("amdgpu-elide-module-lds"); +} + +void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) { + const Module *M = F.getParent(); if (isModuleEntryFunction()) { const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds"); - if (GV) { + if (GV && !canElideModuleLDS(F)) { unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV); (void)Offset; assert(Offset == 0 && diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 48cf46b5f871..df62c2314617 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -12,6 +12,10 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Function.h" namespace llvm { @@ -25,11 +29,13 @@ protected: Align MaxKernArgAlign; // Cache for this. /// Number of bytes in the LDS that are being used. - unsigned LDSSize = 0; + uint32_t LDSSize = 0; + uint32_t GDSSize = 0; /// Number of bytes in the LDS allocated statically. This field is only used /// in the instruction selector and not part of the machine function info. - unsigned StaticLDSSize = 0; + uint32_t StaticLDSSize = 0; + uint32_t StaticGDSSize = 0; /// Align for dynamic shared memory if any. Dynamic shared memory is /// allocated directly after the static one, i.e., LDSSize. 
Need to pad @@ -63,12 +69,16 @@ public: return ExplicitKernArgSize; } - unsigned getMaxKernArgAlign() const { return MaxKernArgAlign.value(); } + Align getMaxKernArgAlign() const { return MaxKernArgAlign; } - unsigned getLDSSize() const { + uint32_t getLDSSize() const { return LDSSize; } + uint32_t getGDSSize() const { + return GDSSize; + } + AMDGPU::SIModeRegisterDefaults getMode() const { return Mode; } @@ -92,7 +102,7 @@ public: } unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV); - void allocateModuleLDSGlobal(const Module *M); + void allocateModuleLDSGlobal(const Function &F); Align getDynLDSAlign() const { return DynLDSAlign; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp index 6646cce8186b..2d48be9ea542 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMachineModuleInfo.h" +#include "llvm/MC/MCSymbol.h" namespace llvm { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index 5a5a5d213a1a..fb7709d66c76 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -34,6 +34,7 @@ #include "AMDGPU.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallString.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" @@ -71,7 +72,7 @@ ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() { return new AMDGPUOpenCLEnqueuedBlockLowering(); } -/// Collect direct or indrect callers of \p F and save them +/// Collect direct or indirect callers of \p F and save them /// to \p Callers. 
static void collectCallers(Function *F, DenseSet<Function *> &Callers) { for (auto U : F->users()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 8ad344816ad2..09dbd2150db6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -116,7 +116,6 @@ private: bool isGlobalAddr(const Value *V) const; bool isLocalAddr(const Value *V) const; - bool isConstantAddr(const Value *V) const; }; static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType( @@ -153,7 +152,7 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { if (auto LD = dyn_cast<LoadInst>(V)) { auto M = LD->getPointerOperand(); - if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) { + if (isGlobalAddr(M)) { LLVM_DEBUG(dbgs() << " is IA\n"); return true; } @@ -267,19 +266,23 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) { << " LSMInst cost: " << Info->LSMInstCost << '\n' << " TotalInst cost: " << Info->InstCost << '\n'); + bool Changed = false; + if (isMemBound(*Info)) { LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n"); NumMemBound++; F.addFnAttr("amdgpu-memory-bound", "true"); + Changed = true; } if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) { LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n"); NumLimitWave++; F.addFnAttr("amdgpu-wave-limiter", "true"); + Changed = true; } - return true; + return Changed; } bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { @@ -332,15 +335,6 @@ AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const { return MAI; } -bool AMDGPUPerfHint::isConstantAddr(const Value *V) const { - if (auto PT = dyn_cast<PointerType>(V->getType())) { - unsigned As = PT->getAddressSpace(); - return As == AMDGPUAS::CONSTANT_ADDRESS || - As == AMDGPUAS::CONSTANT_ADDRESS_32BIT; - } - return false; -} - bool AMDGPUPerfHint::MemAccessInfo::isLargeStride( MemAccessInfo &Reference) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index c029046ab65f..bfe2e9b66ed4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -16,6 +16,7 @@ #include "AMDGPULegalizerInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" @@ -125,7 +126,6 @@ void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16( LLT::scalar(64)); const LLT S32 = LLT::scalar(32); - B.setMBB(*MI.getParent()); B.setInstrAndDebugLoc(MI); auto Unmerge = B.buildUnmerge(S32, Src); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index f91f31508ad2..1db7c18e4598 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -19,6 +19,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "llvm/ADT/Triple.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Dominators.h" @@ -66,7 +67,7 @@ private: Value *simplify(Instruction *I, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return 
SimplifyInstruction(I, {*TD, TLI, DT}); + return simplifyInstruction(I, {*TD, TLI, DT}); } const DataLayout *TD; @@ -562,15 +563,6 @@ bool AMDGPUPrintfRuntimeBindingImpl::run(Module &M) { if (Printfs.empty()) return false; - if (auto HostcallFunction = M.getFunction("__ockl_hostcall_internal")) { - for (auto &U : HostcallFunction->uses()) { - if (auto *CI = dyn_cast<CallInst>(U.getUser())) { - M.getContext().emitError( - CI, "Cannot use both printf and hostcall in the same module"); - } - } - } - TD = &M.getDataLayout(); return lowerPrintfForGpu(M); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 99b7ffb33884..5a4426ba8113 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -334,86 +334,49 @@ static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) { ArrayTy->getNumElements()); } -static Value *stripBitcasts(Value *V) { - while (Instruction *I = dyn_cast<Instruction>(V)) { - if (I->getOpcode() != Instruction::BitCast) - break; - V = I->getOperand(0); - } - return V; -} - static Value * calculateVectorIndex(Value *Ptr, const std::map<GetElementPtrInst *, Value *> &GEPIdx) { - GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr)); + auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()); if (!GEP) - return nullptr; + return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext())); auto I = GEPIdx.find(GEP); - return I == GEPIdx.end() ? nullptr : I->second; + assert(I != GEPIdx.end() && "Must have entry for GEP!"); + return I->second; } -static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { - // FIXME we only support simple cases - if (GEP->getNumOperands() != 3) +static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, + Type *VecElemTy, const DataLayout &DL) { + // TODO: Extracting a "multiple of X" from a GEP might be a useful generic + // helper. + unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType()); + MapVector<Value *, APInt> VarOffsets; + APInt ConstOffset(BW, 0); + if (GEP->getPointerOperand()->stripPointerCasts() != Alloca || + !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset)) return nullptr; - ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1)); - if (!I0 || !I0->isZero()) + unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy); + if (VarOffsets.size() > 1) return nullptr; - return GEP->getOperand(2); -} - -// Not an instruction handled below to turn into a vector. -// -// TODO: Check isTriviallyVectorizable for calls and handle other -// instructions. -static bool canVectorizeInst(Instruction *Inst, User *User, - const DataLayout &DL) { - switch (Inst->getOpcode()) { - case Instruction::Load: { - // Currently only handle the case where the Pointer Operand is a GEP. - // Also we could not vectorize volatile or atomic loads. - LoadInst *LI = cast<LoadInst>(Inst); - if (isa<AllocaInst>(User) && - LI->getPointerOperandType() == User->getType() && - isa<VectorType>(LI->getType())) - return true; - - Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand()); - if (!PtrInst) - return false; - - return (PtrInst->getOpcode() == Instruction::GetElementPtr || - PtrInst->getOpcode() == Instruction::BitCast) && - LI->isSimple(); + if (VarOffsets.size() == 1) { + // Only handle cases where we don't need to insert extra arithmetic + // instructions. 
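Concretely, the only GEPs accepted are those whose byte offset is either one variable scaled by exactly the element size (the variable becomes the index) or a constant that divides evenly by it. A sketch of the constant case, which the code below performs with `APInt::udivrem` (`constantLane` is an illustrative name):

```cpp
#include <cassert>
#include <cstdint>
#include <optional>

// A constant byte offset into the alloca maps to a vector lane only if it
// is an exact multiple of the element size; an offset that straddles an
// element blocks the promotion.
static std::optional<uint64_t> constantLane(uint64_t ByteOffset,
                                            uint64_t ElemSize) {
  if (ByteOffset % ElemSize != 0)
    return std::nullopt;
  return ByteOffset / ElemSize;
}

int main() {
  // For 4-byte elements (e.g. a <4 x float> alloca): offset 12 is lane 3,
  // while offset 6 fails the remainder check and the GEP is rejected.
  assert(constantLane(12, 4).value() == 3u);
  assert(!constantLane(6, 4).has_value());
  return 0;
}
```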
+ const auto &VarOffset = VarOffsets.front(); + if (!ConstOffset.isZero() || VarOffset.second != VecElemSize) + return nullptr; + return VarOffset.first; } - case Instruction::BitCast: - return true; - case Instruction::Store: { - // Must be the stored pointer operand, not a stored value, plus - // since it should be canonical form, the User should be a GEP. - // Also we could not vectorize volatile or atomic stores. - StoreInst *SI = cast<StoreInst>(Inst); - if (isa<AllocaInst>(User) && - SI->getPointerOperandType() == User->getType() && - isa<VectorType>(SI->getValueOperand()->getType())) - return true; - Instruction *UserInst = dyn_cast<Instruction>(User); - if (!UserInst) - return false; + APInt Quot; + uint64_t Rem; + APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem); + if (Rem != 0) + return nullptr; - return (SI->getPointerOperand() == User) && - (UserInst->getOpcode() == Instruction::GetElementPtr || - UserInst->getOpcode() == Instruction::BitCast) && - SI->isSimple(); - } - default: - return false; - } + return ConstantInt::get(GEP->getContext(), Quot); } static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, @@ -455,73 +418,87 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, } std::map<GetElementPtrInst*, Value*> GEPVectorIdx; - std::vector<Value *> WorkList; - SmallVector<User *, 8> Users(Alloca->users()); - SmallVector<User *, 8> UseUsers(Users.size(), Alloca); + SmallVector<Instruction *> WorkList; + SmallVector<Use *, 8> Uses; + for (Use &U : Alloca->uses()) + Uses.push_back(&U); + Type *VecEltTy = VectorTy->getElementType(); - while (!Users.empty()) { - User *AllocaUser = Users.pop_back_val(); - User *UseUser = UseUsers.pop_back_val(); - Instruction *Inst = dyn_cast<Instruction>(AllocaUser); + while (!Uses.empty()) { + Use *U = Uses.pop_back_val(); + Instruction *Inst = dyn_cast<Instruction>(U->getUser()); - GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser); - if (!GEP) { - if (!canVectorizeInst(Inst, UseUser, DL)) + if (Value *Ptr = getLoadStorePointerOperand(Inst)) { + // This is a store of the pointer, not to the pointer. + if (isa<StoreInst>(Inst) && + U->getOperandNo() != StoreInst::getPointerOperandIndex()) return false; - if (Inst->getOpcode() == Instruction::BitCast) { - Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType(); - Type *ToTy = Inst->getType()->getPointerElementType(); - if (FromTy->isAggregateType() || ToTy->isAggregateType() || - DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy)) - continue; - - for (User *CastUser : Inst->users()) { - if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser))) - continue; - Users.push_back(CastUser); - UseUsers.push_back(Inst); - } + Type *AccessTy = getLoadStoreType(Inst); + Ptr = Ptr->stripPointerCasts(); + // Alloca already accessed as vector, leave alone. + if (Ptr == Alloca && DL.getTypeStoreSize(Alloca->getAllocatedType()) == + DL.getTypeStoreSize(AccessTy)) continue; - } - WorkList.push_back(AllocaUser); + // Check that this is a simple access of a vector element. + bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple() + : cast<StoreInst>(Inst)->isSimple(); + if (!IsSimple || + !CastInst::isBitOrNoopPointerCastable(VecEltTy, AccessTy, DL)) + return false; + + WorkList.push_back(Inst); continue; } - Value *Index = GEPToVectorIndex(GEP); + if (isa<BitCastInst>(Inst)) { + // Look through bitcasts. 
+ for (Use &U : Inst->uses()) + Uses.push_back(&U); + continue; + } - // If we can't compute a vector index from this GEP, then we can't - // promote this alloca to vector. - if (!Index) { - LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP - << '\n'); - return false; + if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) { + // If we can't compute a vector index from this GEP, then we can't + // promote this alloca to vector. + Value *Index = GEPToVectorIndex(GEP, Alloca, VecEltTy, DL); + if (!Index) { + LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP + << '\n'); + return false; + } + + GEPVectorIdx[GEP] = Index; + for (Use &U : Inst->uses()) + Uses.push_back(&U); + continue; } - GEPVectorIdx[GEP] = Index; - Users.append(GEP->user_begin(), GEP->user_end()); - UseUsers.append(GEP->getNumUses(), GEP); + // Ignore assume-like intrinsics and comparisons used in assumes. + if (isAssumeLikeIntrinsic(Inst)) + continue; + + if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) { + return isAssumeLikeIntrinsic(cast<Instruction>(U)); + })) + continue; + + // Unknown user. + return false; } LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " << *VectorTy << '\n'); - for (Value *V : WorkList) { - Instruction *Inst = cast<Instruction>(V); + for (Instruction *Inst : WorkList) { IRBuilder<> Builder(Inst); switch (Inst->getOpcode()) { case Instruction::Load: { - if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy()) - break; - Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand(); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - if (!Index) - break; - - Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); + Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace()); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(VectorTy, BitCast); Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); @@ -533,16 +510,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, } case Instruction::Store: { StoreInst *SI = cast<StoreInst>(Inst); - if (SI->getValueOperand()->getType() == AllocaTy || - SI->getValueOperand()->getType()->isVectorTy()) - break; - Value *Ptr = SI->getPointerOperand(); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - if (!Index) - break; - - Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); + Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace()); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(VectorTy, BitCast); Value *Elt = SI->getValueOperand(); @@ -808,10 +778,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { // // FIXME: We should really do something to fix the addresses to a more optimal // value instead - llvm::sort(AllocatedSizes, [](std::pair<uint64_t, Align> LHS, - std::pair<uint64_t, Align> RHS) { - return LHS.second < RHS.second; - }); + llvm::sort(AllocatedSizes, llvm::less_second()); // Check how much local memory is being used by global objects CurrentLocalMemUsage = 0; @@ -917,7 +884,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { // usage order. // // FIXME: It is also possible that if we're allowed to use all of the memory - // could could end up using more than the maximum due to alignment padding. + // could end up using more than the maximum due to alignment padding. 
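Looking back at the WorkList rewrite above: once the alloca is promoted, every scalar access becomes an access to the whole vector, with the lane picked out by an extract or insert. A semantic model of the rewritten loads and stores, with a plain array standing in for the promoted vector:

```cpp
#include <array>
#include <cassert>
#include <cstddef>

// What the CreateExtractElement / CreateInsertElement rewrite above does,
// modeled on the host.
template <typename T, std::size_t N>
static T loadLane(const std::array<T, N> &Vec, std::size_t Idx) {
  std::array<T, N> Whole = Vec; // load the whole promoted vector
  return Whole[Idx];            // extractelement
}

template <typename T, std::size_t N>
static void storeLane(std::array<T, N> &Vec, std::size_t Idx, T Val) {
  std::array<T, N> Whole = Vec; // load the whole vector
  Whole[Idx] = Val;             // insertelement
  Vec = Whole;                  // store the whole vector back
}

int main() {
  std::array<float, 4> V{0.f, 1.f, 2.f, 3.f};
  storeLane(V, 2, 9.f);
  assert(loadLane(V, 2) == 9.f);
  return 0;
}
```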
uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment); uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp index 01d03d17ec47..ed450f59e4b3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp @@ -16,7 +16,9 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "Utils/AMDGPUMemoryUtils.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/IR/IRBuilder.h" #include "llvm/InitializePasses.h" @@ -30,6 +32,8 @@ namespace { class AMDGPUPromoteKernelArguments : public FunctionPass { MemorySSA *MSSA; + AliasAnalysis *AA; + Instruction *ArgCastInsertPt; SmallVector<Value *> Ptrs; @@ -38,16 +42,19 @@ class AMDGPUPromoteKernelArguments : public FunctionPass { bool promotePointer(Value *Ptr); + bool promoteLoad(LoadInst *LI); + public: static char ID; AMDGPUPromoteKernelArguments() : FunctionPass(ID) {} - bool run(Function &F, MemorySSA &MSSA); + bool run(Function &F, MemorySSA &MSSA, AliasAnalysis &AA); bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MemorySSAWrapperPass>(); AU.setPreservesAll(); } @@ -68,17 +75,10 @@ void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) { break; case Instruction::Load: { LoadInst *LD = cast<LoadInst>(U); - PointerType *PT = dyn_cast<PointerType>(LD->getType()); - if (!PT || - (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS && - PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS && - PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) || - LD->getPointerOperand()->stripInBoundsOffsets() != Ptr) - break; - const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(LD); - // TODO: This load poprobably can be promoted to constant address space. - if (MSSA->isLiveOnEntryDef(MA)) + if (LD->getPointerOperand()->stripInBoundsOffsets() == Ptr && + !AMDGPU::isClobberedInFunction(LD, MSSA, AA)) Ptrs.push_back(LD); + break; } case Instruction::GetElementPtr: @@ -92,15 +92,26 @@ void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) { } bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) { - enqueueUsers(Ptr); + bool Changed = false; + + LoadInst *LI = dyn_cast<LoadInst>(Ptr); + if (LI) + Changed |= promoteLoad(LI); + + PointerType *PT = dyn_cast<PointerType>(Ptr->getType()); + if (!PT) + return Changed; + + if (PT->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS || + PT->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) + enqueueUsers(Ptr); - PointerType *PT = cast<PointerType>(Ptr->getType()); if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) - return false; + return Changed; - bool IsArg = isa<Argument>(Ptr); - IRBuilder<> B(IsArg ? ArgCastInsertPt - : &*std::next(cast<Instruction>(Ptr)->getIterator())); + IRBuilder<> B(LI ? &*std::next(cast<Instruction>(Ptr)->getIterator()) + : ArgCastInsertPt); // Cast pointer to global address space and back to flat and let // Infer Address Spaces pass to do all necessary rewriting. 
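That cast round trip deserves a sketch: the two addrspacecasts are a no-op by themselves, but they plant a global-address-space fact that InferAddressSpaces can fold and propagate to downstream memory operations. A minimal sketch against the typed-pointer IR API of this era (`castToGlobalAndBack` is an illustrative helper, not the pass's actual code):

```cpp
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Build `flat -> global -> flat` casts for a pointer proven to address
// global memory; uses of the original flat pointer are then moved to the
// returned value so InferAddressSpaces can rewrite them.
static Value *castToGlobalAndBack(IRBuilder<> &B, Value *FlatPtr,
                                  unsigned GlobalAS) {
  auto *FlatTy = cast<PointerType>(FlatPtr->getType());
  auto *GlobalTy = PointerType::getWithSamePointeeType(FlatTy, GlobalAS);
  Value *GlobalPtr = B.CreateAddrSpaceCast(FlatPtr, GlobalTy);
  return B.CreateAddrSpaceCast(GlobalPtr, FlatTy);
}
```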
@@ -116,6 +127,14 @@ bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) { return true; } +bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) { + if (!LI->isSimple()) + return false; + + LI->setMetadata("amdgpu.noclobber", MDNode::get(LI->getContext(), {})); + return true; +} + // skip allocas static BasicBlock::iterator getInsertPt(BasicBlock &BB) { BasicBlock::iterator InsPt = BB.getFirstInsertionPt(); @@ -131,7 +150,8 @@ static BasicBlock::iterator getInsertPt(BasicBlock &BB) { return InsPt; } -bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) { +bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA, + AliasAnalysis &AA) { if (skipFunction(F)) return false; @@ -141,6 +161,7 @@ bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) { ArgCastInsertPt = &*getInsertPt(*F.begin()); this->MSSA = &MSSA; + this->AA = &AA; for (Argument &Arg : F.args()) { if (Arg.use_empty()) @@ -166,11 +187,13 @@ bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) { bool AMDGPUPromoteKernelArguments::runOnFunction(Function &F) { MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA(); - return run(F, MSSA); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + return run(F, MSSA, AA); } INITIALIZE_PASS_BEGIN(AMDGPUPromoteKernelArguments, DEBUG_TYPE, "AMDGPU Promote Kernel Arguments", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_END(AMDGPUPromoteKernelArguments, DEBUG_TYPE, "AMDGPU Promote Kernel Arguments", false, false) @@ -185,7 +208,8 @@ PreservedAnalyses AMDGPUPromoteKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) { MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA(); - if (AMDGPUPromoteKernelArguments().run(F, MSSA)) { + AliasAnalysis &AA = AM.getResult<AAManager>(F); + if (AMDGPUPromoteKernelArguments().run(F, MSSA, AA)) { PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); PA.preserve<MemorySSAAnalysis>(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index de2dccef804a..0830cbd919a0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -76,10 +76,11 @@ #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/RegisterBank.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #define GET_TARGET_REGBANK_IMPL @@ -193,9 +194,7 @@ public: } AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) - : AMDGPUGenRegisterBankInfo(), - Subtarget(ST), - TRI(Subtarget.getRegisterInfo()), + : Subtarget(ST), TRI(Subtarget.getRegisterInfo()), TII(Subtarget.getInstrInfo()) { // HACK: Until this is fully tablegen'd. @@ -428,11 +427,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( } } -static bool memOpHasNoClobbered(const MachineMemOperand *MMO) { - const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue()); - return I && I->getMetadata("amdgpu.noclobber"); -} - // FIXME: Returns uniform if there's no source value information. This is // probably wrong. 
static bool isScalarLoadLegal(const MachineInstr &MI) { @@ -451,7 +445,7 @@ static bool isScalarLoadLegal(const MachineInstr &MI) { // spaces. (IsConst || !MMO->isVolatile()) && // Memory must be known constant, or not written before this load. - (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) && + (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) && AMDGPUInstrInfo::isUniformMMO(MMO); } @@ -684,6 +678,62 @@ static LLT getHalfSizedType(LLT Ty) { return LLT::scalar(Ty.getScalarSizeInBits() / 2); } +// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector +// source value into a scalar register. +Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B, + MachineRegisterInfo &MRI, + Register Src) const { + LLT Ty = MRI.getType(Src); + const RegisterBank *Bank = getRegBank(Src, MRI, *TRI); + + if (Bank == &AMDGPU::SGPRRegBank) + return Src; + + unsigned Bits = Ty.getSizeInBits(); + assert(Bits % 32 == 0); + + if (Bank != &AMDGPU::VGPRRegBank) { + // We need to copy from AGPR to VGPR + Src = B.buildCopy(Ty, Src).getReg(0); + MRI.setRegBank(Src, AMDGPU::VGPRRegBank); + } + + LLT S32 = LLT::scalar(32); + unsigned NumParts = Bits / 32; + SmallVector<Register, 8> SrcParts; + SmallVector<Register, 8> DstParts; + + if (Bits == 32) { + SrcParts.push_back(Src); + } else { + auto Unmerge = B.buildUnmerge(S32, Src); + for (unsigned i = 0; i < NumParts; ++i) + SrcParts.push_back(Unmerge.getReg(i)); + } + + for (unsigned i = 0; i < NumParts; ++i) { + Register SrcPart = SrcParts[i]; + Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + MRI.setType(DstPart, NumParts == 1 ? Ty : S32); + + const TargetRegisterClass *Constrained = + constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI); + (void)Constrained; + assert(Constrained && "Failed to constrain readfirstlane src reg"); + + B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart}); + + DstParts.push_back(DstPart); + } + + if (Bits == 32) + return DstParts[0]; + + Register Dst = B.buildMerge(Ty, DstParts).getReg(0); + MRI.setRegBank(Dst, AMDGPU::SGPRRegBank); + return Dst; +} + /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If /// any of the required SGPR operands are VGPRs, perform a waterfall loop to /// execute the instruction for each unique combination of values in all lanes @@ -716,8 +766,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MachineFunction *MF = &B.getMF(); const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); - const unsigned WaveAndOpc = Subtarget.isWave32() ? - AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; const unsigned MovExecOpc = Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; const unsigned MovExecTermOpc = @@ -747,16 +795,19 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( // To insert the loop we need to split the block. Move everything before this // point to a new block, and insert a new empty block before this instruction. 
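// (Editor's sketch, not part of the patch.) The blocks created below form,
// roughly, the following waterfall structure; this patch's change is the
// LoopBB/BodyBB split:
//
//   MBB -> LoopBB -> BodyBB -> RestoreExecBB -> RemainderBB
//             ^         |
//             +---------+
//
// LoopBB holds the readfirstlane/compare sequence that narrows EXEC to the
// lanes matching the current value; BodyBB holds the waterfalled instruction
// itself and branches back to LoopBB while any lane is still pending.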
MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); + MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock(); MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); MachineFunction::iterator MBBI(MBB); ++MBBI; MF->insert(MBBI, LoopBB); + MF->insert(MBBI, BodyBB); MF->insert(MBBI, RestoreExecBB); MF->insert(MBBI, RemainderBB); - LoopBB->addSuccessor(RestoreExecBB); - LoopBB->addSuccessor(LoopBB); + LoopBB->addSuccessor(BodyBB); + BodyBB->addSuccessor(RestoreExecBB); + BodyBB->addSuccessor(LoopBB); // Move the rest of the block into a new block. RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); @@ -768,27 +819,27 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.setInsertPt(*LoopBB, LoopBB->end()); B.buildInstr(TargetOpcode::PHI) - .addDef(PhiExec) - .addReg(InitSaveExecReg) - .addMBB(&MBB) - .addReg(NewExec) - .addMBB(LoopBB); + .addDef(PhiExec) + .addReg(InitSaveExecReg) + .addMBB(&MBB) + .addReg(NewExec) + .addMBB(BodyBB); const DebugLoc &DL = B.getDL(); MachineInstr &FirstInst = *Range.begin(); - // Move the instruction into the loop. Note we moved everything after + // Move the instruction into the loop body. Note we moved everything after // Range.end() already into a new block, so Range.end() is no longer valid. - LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end()); + BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end()); // Figure out the iterator range after splicing the instructions. MachineBasicBlock::iterator NewBegin = FirstInst.getIterator(); - auto NewEnd = LoopBB->end(); + auto NewEnd = BodyBB->end(); - MachineBasicBlock::iterator I = Range.begin(); - B.setInsertPt(*LoopBB, I); + B.setMBB(*LoopBB); + LLT S1 = LLT::scalar(1); Register CondReg; assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); @@ -819,164 +870,62 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.setMBB(MBB); OpReg = B.buildCopy(OpTy, OpReg).getReg(0); MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank); - B.setInstr(*I); + B.setMBB(*LoopBB); } - unsigned OpSize = OpTy.getSizeInBits(); - - // Can only do a readlane of 32-bit pieces. - if (OpSize == 32) { - // Avoid extra copies in the simple case of one 32-bit register. - Register CurrentLaneOpReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - MRI.setType(CurrentLaneOpReg, OpTy); - - constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI); - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpReg) - .addReg(OpReg); + Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg); - Register NewCondReg = MRI.createVirtualRegister(WaveRC); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; - - // Compare the just read M0 value to all possible Idx values. - B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(OpReg); - Op.setReg(CurrentLaneOpReg); - - if (!First) { - Register AndReg = MRI.createVirtualRegister(WaveRC); + // Build the comparison(s). + unsigned OpSize = OpTy.getSizeInBits(); + bool Is64 = OpSize % 64 == 0; + unsigned PartSize = Is64 ? 64 : 32; + LLT PartTy = LLT::scalar(PartSize); + unsigned NumParts = OpSize / PartSize; + SmallVector<Register, 8> OpParts; + SmallVector<Register, 8> CurrentLaneParts; - // If there are multiple operands to consider, and the conditions. 
- B.buildInstr(WaveAndOpc) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; - } + if (NumParts == 1) { + OpParts.push_back(OpReg); + CurrentLaneParts.push_back(CurrentLaneReg); } else { - LLT S32 = LLT::scalar(32); - SmallVector<Register, 8> ReadlanePieces; - - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. - - bool Is64 = OpSize % 64 == 0; - - unsigned UnmergeTySize = Is64 ? 64 : 32; - unsigned CmpOp = - Is64 ? AMDGPU::V_CMP_EQ_U64_e64 : AMDGPU::V_CMP_EQ_U32_e64; - - // Insert the unmerge before the loop. - - B.setMBB(MBB); - unsigned NumPieces = OpSize / UnmergeTySize; - SmallVector<Register, 8> UnmergePieces; - if (NumPieces == 1) { - UnmergePieces.push_back(OpReg); - } else { - LLT UnmergeTy = LLT::scalar(UnmergeTySize); - MachineInstrBuilder Unmerge = B.buildUnmerge(UnmergeTy, OpReg); - for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) - UnmergePieces.push_back(Unmerge.getReg(PieceIdx)); + auto UnmergeOp = B.buildUnmerge(PartTy, OpReg); + auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg); + for (unsigned i = 0; i < NumParts; ++i) { + OpParts.push_back(UnmergeOp.getReg(i)); + CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i)); + MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank); + MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank); } - B.setInstr(*I); - - for (Register UnmergePiece : UnmergePieces) { - Register CurrentLaneOpReg; - if (Is64) { - Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); - Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); - - MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); - MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); - MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); - - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegLo) - .addReg(UnmergePiece, 0, AMDGPU::sub0); - - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegHi) - .addReg(UnmergePiece, 0, AMDGPU::sub1); - - CurrentLaneOpReg = - B.buildMerge(LLT::scalar(64), - {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) - .getReg(0); - - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); - - if (OpTy.getScalarSizeInBits() == 64) { - // If we need to produce a 64-bit element vector, so use the - // merged pieces - ReadlanePieces.push_back(CurrentLaneOpReg); - } else { - // 32-bit element type. - ReadlanePieces.push_back(CurrentLaneOpRegLo); - ReadlanePieces.push_back(CurrentLaneOpRegHi); - } - } else { - CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32); - MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); - - // Read the next variant <- also loop target. 
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpReg) - .addReg(UnmergePiece); - ReadlanePieces.push_back(CurrentLaneOpReg); - } - - Register NewCondReg = MRI.createVirtualRegister(WaveRC); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; - - B.buildInstr(CmpOp) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(UnmergePiece); + } - if (!First) { - Register AndReg = MRI.createVirtualRegister(WaveRC); + for (unsigned i = 0; i < NumParts; ++i) { + auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i], + OpParts[i]).getReg(0); + MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank); - // If there are multiple operands to consider, and the conditions. - B.buildInstr(WaveAndOpc) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; - } - } - - // FIXME: Build merge seems to switch to CONCAT_VECTORS but not - // BUILD_VECTOR - if (OpTy.isVector()) { - auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); - MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank); - } else if (ReadlanePieces.size() > 1) { - auto Merge = B.buildMerge(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); - MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank); + if (!CondReg) { + CondReg = CmpReg; } else { - Op.setReg(ReadlanePieces[0]); + CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0); + MRI.setRegBank(CondReg, AMDGPU::VCCRegBank); } } + Op.setReg(CurrentLaneReg); + // Make sure we don't re-process this register again. WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg())); } } + // The ballot becomes a no-op during instruction selection. + CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot, + {LLT::scalar(Subtarget.isWave32() ? 32 : 64)}, + false) + .addReg(CondReg) + .getReg(0); + MRI.setRegClass(CondReg, WaveRC); + // Update EXEC, save the original EXEC value to VCC. B.buildInstr(AndSaveExecOpc) .addDef(NewExec) @@ -984,7 +933,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MRI.setSimpleHint(NewExec, CondReg); - B.setInsertPt(*LoopBB, LoopBB->end()); + B.setInsertPt(*BodyBB, BodyBB->end()); // Update EXEC, switch all done bits to 0 and all todo bits to 1. B.buildInstr(XorTermOpc) @@ -1064,28 +1013,10 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( if (Bank == &AMDGPU::SGPRRegBank) return; - LLT Ty = MRI.getType(Reg); MachineIRBuilder B(MI); - if (Bank != &AMDGPU::VGPRRegBank) { - // We need to copy from AGPR to VGPR - Reg = B.buildCopy(Ty, Reg).getReg(0); - MRI.setRegBank(Reg, AMDGPU::VGPRRegBank); - } - - Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - B.buildInstr(AMDGPU::V_READFIRSTLANE_B32) - .addDef(SGPR) - .addReg(Reg); - - MRI.setType(SGPR, Ty); - - const TargetRegisterClass *Constrained = - constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI); - (void)Constrained; - assert(Constrained && "Failed to constrain readfirstlane src reg"); - - MI.getOperand(OpIdx).setReg(SGPR); + Reg = buildReadFirstLane(B, MRI, Reg); + MI.getOperand(OpIdx).setReg(Reg); } /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the @@ -1624,6 +1555,157 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper, return true; } +bool AMDGPURegisterBankInfo::applyMappingMAD_64_32( + const OperandsMapper &OpdMapper) const { + MachineInstr &MI = OpdMapper.getMI(); + MachineRegisterInfo &MRI = OpdMapper.getMRI(); + + // Insert basic copies. 
+ applyDefaultMapping(OpdMapper); + + Register Dst0 = MI.getOperand(0).getReg(); + Register Dst1 = MI.getOperand(1).getReg(); + Register Src0 = MI.getOperand(2).getReg(); + Register Src1 = MI.getOperand(3).getReg(); + Register Src2 = MI.getOperand(4).getReg(); + + if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank) + return true; + + bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; + LLT S1 = LLT::scalar(1); + LLT S32 = LLT::scalar(32); + + bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank; + bool Accumulate = true; + + if (!DstOnValu) { + if (mi_match(Src2, MRI, m_ZeroInt())) + Accumulate = false; + } + + // Keep the multiplication on the SALU. + MachineIRBuilder B(MI); + + Register DstHi; + Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0); + bool MulHiInVgpr = false; + + MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank); + + if (Subtarget.hasSMulHi()) { + DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0) + : B.buildSMulH(S32, Src0, Src1).getReg(0); + MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank); + } else { + Register VSrc0 = B.buildCopy(S32, Src0).getReg(0); + Register VSrc1 = B.buildCopy(S32, Src1).getReg(0); + + MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank); + MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank); + + DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0) + : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0); + MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); + + if (!DstOnValu) { + DstHi = buildReadFirstLane(B, MRI, DstHi); + } else { + MulHiInVgpr = true; + } + } + + // Accumulate and produce the "carry-out" bit. + // + // The "carry-out" is defined as bit 64 of the result when computed as a + // big integer. For unsigned multiply-add, this matches the usual definition + // of carry-out. For signed multiply-add, bit 64 is the sign bit of the + // result, which is determined as: + // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add + LLT CarryType = DstOnValu ? S1 : S32; + const RegisterBank &CarryBank = + DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; + const RegisterBank &DstBank = + DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank; + Register Carry; + Register Zero; + + if (!IsUnsigned) { + Zero = B.buildConstant(S32, 0).getReg(0); + MRI.setRegBank(Zero, + MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank); + + Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero) + .getReg(0); + MRI.setRegBank(Carry, MulHiInVgpr ? 
AMDGPU::VCCRegBank + : AMDGPU::SGPRRegBank); + + if (DstOnValu && !MulHiInVgpr) { + Carry = B.buildTrunc(S1, Carry).getReg(0); + MRI.setRegBank(Carry, AMDGPU::VCCRegBank); + } + } + + if (Accumulate) { + if (DstOnValu) { + DstLo = B.buildCopy(S32, DstLo).getReg(0); + DstHi = B.buildCopy(S32, DstHi).getReg(0); + MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank); + MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank); + } + + auto Unmerge = B.buildUnmerge(S32, Src2); + Register Src2Lo = Unmerge.getReg(0); + Register Src2Hi = Unmerge.getReg(1); + MRI.setRegBank(Src2Lo, DstBank); + MRI.setRegBank(Src2Hi, DstBank); + + if (!IsUnsigned) { + auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero); + MRI.setRegBank(Src2Sign.getReg(0), CarryBank); + + Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0); + MRI.setRegBank(Carry, CarryBank); + } + + auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo); + DstLo = AddLo.getReg(0); + Register CarryLo = AddLo.getReg(1); + MRI.setRegBank(DstLo, DstBank); + MRI.setRegBank(CarryLo, CarryBank); + + auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo); + DstHi = AddHi.getReg(0); + MRI.setRegBank(DstHi, DstBank); + + Register CarryHi = AddHi.getReg(1); + MRI.setRegBank(CarryHi, CarryBank); + + if (IsUnsigned) { + Carry = CarryHi; + } else { + Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0); + MRI.setRegBank(Carry, CarryBank); + } + } else { + if (IsUnsigned) { + Carry = B.buildConstant(CarryType, 0).getReg(0); + MRI.setRegBank(Carry, CarryBank); + } + } + + B.buildMerge(Dst0, {DstLo, DstHi}); + + if (DstOnValu) { + B.buildCopy(Dst1, Carry); + } else { + B.buildTrunc(Dst1, Carry); + } + + MI.eraseFromParent(); + return true; +} + // Return a suitable opcode for extending the operands of Opc when widening. static unsigned getExtendOp(unsigned Opc) { switch (Opc) { @@ -1794,7 +1876,7 @@ bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, } /// Utility function for pushing dynamic vector indexes with a constant offset -/// into waterwall loops. +/// into waterfall loops. static void reinsertVectorIndexAdd(MachineIRBuilder &B, MachineInstr &IdxUseInstr, unsigned OpIdx, @@ -1857,7 +1939,7 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( unsigned NumElem = VecTy.getNumElements(); if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, - IsDivergentIdx)) + IsDivergentIdx, &Subtarget)) return false; MachineIRBuilder B(MI); @@ -1955,7 +2037,7 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( unsigned NumElem = VecTy.getNumElements(); if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, - IsDivergentIdx)) + IsDivergentIdx, &Subtarget)) return false; MachineIRBuilder B(MI); @@ -2926,7 +3008,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_interp_p2: case Intrinsic::amdgcn_interp_mov: case Intrinsic::amdgcn_interp_p1_f16: - case Intrinsic::amdgcn_interp_p2_f16: { + case Intrinsic::amdgcn_interp_p2_f16: + case Intrinsic::amdgcn_lds_param_load: { applyDefaultMapping(OpdMapper); // Readlane for m0 value, which is always the last operand. 
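The signed carry-out identity used by applyMappingMAD_64_32 above (sign(Src0*Src1) XOR sign(Src2) XOR the unsigned add's carry-out, built as the XOR chain over Carry) is compact enough to check in isolation. A small reference model, editor's sketch only (madRef is a made-up name, nothing here is from the patch):

#include <cassert>
#include <cstdint>

// Reference semantics of G_AMDGPU_MAD_U64_U32 / G_AMDGPU_MAD_I64_I32:
// 32x32 multiply, 64-bit accumulate, plus "bit 64" of the wide result.
static void madRef(bool IsUnsigned, uint32_t Src0, uint32_t Src1,
                   uint64_t Src2, uint64_t &Dst, bool &Carry) {
  uint64_t Mul = IsUnsigned
                     ? (uint64_t)Src0 * Src1
                     : (uint64_t)((int64_t)(int32_t)Src0 * (int32_t)Src1);
  Dst = Mul + Src2;
  bool CarryOut = Dst < Mul; // carry-out of the unsigned 64-bit add
  // Unsigned: the carry is just the add's carry-out. Signed: bit 64 of the
  // sign-extended 128-bit sum is the XOR of the two operand sign bits with
  // that carry-out -- exactly the XOR chain the lowering builds.
  Carry = IsUnsigned ? CarryOut
                     : (((int64_t)Mul < 0) ^ ((int64_t)Src2 < 0) ^ CarryOut);
}

int main() {
  uint64_t D;
  bool C;
  madRef(/*IsUnsigned=*/false, (uint32_t)-1, 1, /*Src2=*/1, D, C);
  assert(D == 0 && !C); // (-1 * 1) + 1 == 0: low 64 bits zero, bit 64 clear
  madRef(/*IsUnsigned=*/true, ~0u, ~0u, ~0ull, D, C);
  assert(C); // 0xFFFFFFFE00000001 + (2^64 - 1) overflows 64 bits
  return 0;
}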
@@ -2934,6 +3017,12 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index return; } + case Intrinsic::amdgcn_interp_inreg_p10: + case Intrinsic::amdgcn_interp_inreg_p2: + case Intrinsic::amdgcn_interp_inreg_p10_f16: + case Intrinsic::amdgcn_interp_inreg_p2_f16: + applyDefaultMapping(OpdMapper); + return; case Intrinsic::amdgcn_permlane16: case Intrinsic::amdgcn_permlanex16: { // Doing a waterfall loop over these wouldn't make any sense. @@ -3015,6 +3104,35 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(MI, MRI, 2); return; } + case Intrinsic::amdgcn_raw_buffer_load_lds: { + applyDefaultMapping(OpdMapper); + constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc + constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + constrainOpWithReadfirstlane(MI, MRI, 5); // soffset + return; + } + case Intrinsic::amdgcn_struct_buffer_load_lds: { + applyDefaultMapping(OpdMapper); + constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc + constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + constrainOpWithReadfirstlane(MI, MRI, 6); // soffset + return; + } + case Intrinsic::amdgcn_global_load_lds: { + applyDefaultMapping(OpdMapper); + constrainOpWithReadfirstlane(MI, MRI, 2); + return; + } + case Intrinsic::amdgcn_lds_direct_load: { + applyDefaultMapping(OpdMapper); + // Readlane for m0 value, which is always the last operand. + constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index + return; + } + case Intrinsic::amdgcn_exp_row: + applyDefaultMapping(OpdMapper); + constrainOpWithReadfirstlane(MI, MRI, 8); // M0 + return; default: { if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID)) { @@ -3143,6 +3261,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case AMDGPU::G_UBFX: applyMappingBFE(OpdMapper, /*Signed*/ false); return; + case AMDGPU::G_AMDGPU_MAD_U64_U32: + case AMDGPU::G_AMDGPU_MAD_I64_I32: + applyMappingMAD_64_32(OpdMapper); + return; default: break; } @@ -3668,6 +3790,48 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); } + case AMDGPU::G_AMDGPU_MAD_U64_U32: + case AMDGPU::G_AMDGPU_MAD_I64_I32: { + // Three possible mappings: + // + // - Default SOP + // - Default VOP + // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP. + // + // This allows instruction selection to keep the multiplication part of the + // instruction on the SALU. + bool AllSalu = true; + bool MulSalu = true; + for (unsigned i = 0; i < 5; ++i) { + Register Reg = MI.getOperand(i).getReg(); + if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { + if (Bank->getID() != AMDGPU::SGPRRegBankID) { + AllSalu = false; + if (i == 2 || i == 3) { + MulSalu = false; + break; + } + } + } + } + + if (AllSalu) + return getDefaultMappingSOP(MI); + + // If the multiply-add is full-rate in VALU, use that even if the + // multiplication part is scalar. Accumulating separately on the VALU would + // take two instructions. + if (!MulSalu || Subtarget.hasFullRate64Ops()) + return getDefaultMappingVOP(MI); + + // Keep the multiplication on the SALU, then accumulate on the VALU. 
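// (Editor's note, not part of the patch.) Spelled out, the partial mapping
// chosen below is:
//   operand:  dst0     dst1 (carry)  src0     src1     src2
//   bank:     VGPR,64  VCC,1         SGPR,32  SGPR,32  VGPR,64
// i.e. only the 32x32 multiply stays on the SALU; the 64-bit accumulate and
// the carry bit are produced on the VALU.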
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); + break; + } case AMDGPU::G_IMPLICIT_DEF: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); @@ -3828,10 +3992,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case AMDGPU::G_FCMP: { unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); - unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); OpdsMapping[1] = nullptr; // Predicate Operand. - OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); break; } @@ -4102,6 +4265,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_udot4: case Intrinsic::amdgcn_sdot8: case Intrinsic::amdgcn_udot8: + case Intrinsic::amdgcn_fdot2_bf16_bf16: + case Intrinsic::amdgcn_fdot2_f16_f16: + case Intrinsic::amdgcn_fdot2_f32_bf16: + case Intrinsic::amdgcn_sudot4: + case Intrinsic::amdgcn_sudot8: + case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: + case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16: + case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: + case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: + case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_sbfe: case Intrinsic::amdgcn_ubfe: @@ -4120,6 +4294,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_wqm: case Intrinsic::amdgcn_softwqm: case Intrinsic::amdgcn_set_inactive: + case Intrinsic::amdgcn_permlane64: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_kernarg_segment_ptr: case Intrinsic::amdgcn_s_getpc: @@ -4247,24 +4422,50 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: case Intrinsic::amdgcn_mfma_f64_16x16x4f64: - case Intrinsic::amdgcn_mfma_f64_4x4x4f64: { + case Intrinsic::amdgcn_mfma_f64_4x4x4f64: + case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: + case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: + case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32: + case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: { // Default for MAI intrinsics. // srcC can also be an immediate which can be folded later. // FIXME: Should we eventually add an alternative mapping with AGPR src // for srcA/srcB? // // vdst, srcA, srcB, srcC + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + OpdsMapping[0] = + Info->mayNeedAGPRs() + ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[4] = + Info->mayNeedAGPRs() + ? 
getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: + case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: + case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: + case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: { + // vdst, srcA, srcB, srcC, idx OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); break; } case Intrinsic::amdgcn_interp_p1: case Intrinsic::amdgcn_interp_p2: case Intrinsic::amdgcn_interp_mov: case Intrinsic::amdgcn_interp_p1_f16: - case Intrinsic::amdgcn_interp_p2_f16: { + case Intrinsic::amdgcn_interp_p2_f16: + case Intrinsic::amdgcn_lds_param_load: { const int M0Idx = MI.getNumOperands() - 1; Register M0Reg = MI.getOperand(M0Idx).getReg(); unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); @@ -4279,6 +4480,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); break; } + case Intrinsic::amdgcn_interp_inreg_p10: + case Intrinsic::amdgcn_interp_inreg_p2: + case Intrinsic::amdgcn_interp_inreg_p10_f16: + case Intrinsic::amdgcn_interp_inreg_p2_f16: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + break; + } case Intrinsic::amdgcn_ballot: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); @@ -4314,8 +4526,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); } else { // NSA form - for (unsigned I = 2; I < N; ++I) - OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + for (unsigned I = 2; I < N; ++I) { + unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits(); + OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + } } break; } @@ -4325,7 +4539,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_s_getreg: case Intrinsic::amdgcn_s_memtime: case Intrinsic::amdgcn_s_memrealtime: - case Intrinsic::amdgcn_s_get_waveid_in_workgroup: { + case Intrinsic::amdgcn_s_get_waveid_in_workgroup: + case Intrinsic::amdgcn_s_sendmsg_rtn: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; @@ -4337,6 +4552,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: + case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: 
case Intrinsic::amdgcn_ds_ordered_swap: { @@ -4366,6 +4583,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); break; + case Intrinsic::amdgcn_exp_row: + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); + break; case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // This must be an SGPR, but accept a VGPR. @@ -4412,6 +4636,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_raw_buffer_load_lds: { + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); + break; + } case Intrinsic::amdgcn_raw_buffer_store: case Intrinsic::amdgcn_raw_buffer_store_format: case Intrinsic::amdgcn_raw_tbuffer_store: { @@ -4430,6 +4661,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_struct_buffer_load_lds: { + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); + OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI); + break; + } case Intrinsic::amdgcn_struct_buffer_store: case Intrinsic::amdgcn_struct_tbuffer_store: { OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); @@ -4464,6 +4703,31 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } + case Intrinsic::amdgcn_global_load_lds: { + OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; + } + case Intrinsic::amdgcn_lds_direct_load: { + const int M0Idx = MI.getNumOperands() - 1; + Register M0Reg = MI.getOperand(M0Idx).getReg(); + unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) + OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + + // Must be SGPR, but we must take whatever the original bank is and fix it + // later. 
+ OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); + break; + } + case Intrinsic::amdgcn_ds_add_gs_reg_rtn: + case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; default: return getInvalidInstructionMapping(); } @@ -4568,6 +4832,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); break; } + case AMDGPU::G_FPTRUNC_ROUND_UPWARD: + case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD: + return getDefaultMappingVOP(MI); } return getInstructionMapping(/*ID*/1, /*Cost*/1, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index 2b9d0923ab49..c9741c2202e6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -16,7 +16,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/Register.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #define GET_REGBANK_DECLARATIONS #include "AMDGPUGenRegisterBank.inc" @@ -59,6 +59,9 @@ public: SmallSet<Register, 4> &SGPROperandRegs, MachineRegisterInfo &MRI) const; + Register buildReadFirstLane(MachineIRBuilder &B, MachineRegisterInfo &MRI, + Register Src) const; + bool executeInWaterfallLoop(MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, @@ -83,6 +86,8 @@ public: bool applyMappingBFE(const OperandsMapper &OpdMapper, bool Signed) const; + bool applyMappingMAD_64_32(const OperandsMapper &OpdMapper) const; + Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp new file mode 100644 index 000000000000..a86871a4a653 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp @@ -0,0 +1,140 @@ +//===- AMDGPUReleaseVGPRs.cpp - Automatically release vgprs on GFX11+ -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Insert S_SENDMSG instructions to release vgprs on GFX11+. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineOperand.h" +using namespace llvm; + +#define DEBUG_TYPE "release-vgprs" + +namespace { + +class AMDGPUReleaseVGPRs : public MachineFunctionPass { +public: + static char ID; + + const SIInstrInfo *SII; + const SIRegisterInfo *TRI; + + AMDGPUReleaseVGPRs() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + // Used to cache the result of isLastVGPRUseVMEMStore for each block + using BlockVMEMStoreType = DenseMap<MachineBasicBlock *, bool>; + BlockVMEMStoreType BlockVMEMStore; + + // Return true if the last instruction referencing a vgpr in this MBB + // is a VMEM store, otherwise return false. + // Visit previous basic blocks to find this last instruction if needed. + // Because this pass is late in the pipeline, it is expected that the + // last vgpr use will likely be a vmem store, ds, or exp instruction. + // Loads and other vgpr operations would have been + // deleted by this point, except for complex control flow involving loops. + // This is why we are just testing the type of instructions rather + // than the operands. + bool isLastVGPRUseVMEMStore(MachineBasicBlock &MBB) { + // Use the cache to break infinite loops and save some time. Initialize to + // false in case we have a cycle. + BlockVMEMStoreType::iterator It; + bool Inserted; + std::tie(It, Inserted) = BlockVMEMStore.insert({&MBB, false}); + bool &CacheEntry = It->second; + if (!Inserted) + return CacheEntry; + + for (auto &MI : reverse(MBB.instrs())) { + // If it's a VMEM store, a vgpr will be used, return true. + if ((SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI)) && MI.mayStore()) + return CacheEntry = true; + + // If it's referencing a VGPR but is not a VMEM store, return false. + if (SIInstrInfo::isDS(MI) || SIInstrInfo::isEXP(MI) || + SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI) || + SIInstrInfo::isVALU(MI)) + return CacheEntry = false; + } + + // Recursive call into parent blocks. Look into predecessors if there is no + // vgpr used in this block. + return CacheEntry = llvm::any_of(MBB.predecessors(), + [this](MachineBasicBlock *Parent) { + return isLastVGPRUseVMEMStore(*Parent); + }); + } + + bool runOnMachineBasicBlock(MachineBasicBlock &MBB) { + + bool Changed = false; + + for (auto &MI : MBB.terminators()) { + // Look for S_ENDPGM instructions + if (MI.getOpcode() == AMDGPU::S_ENDPGM || + MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + // If the last instruction using a VGPR in the block is a VMEM store, + // release VGPRs.
The VGPR release will be placed just before ending + // the program. + if (isLastVGPRUseVMEMStore(MBB)) { + BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_SENDMSG)) + .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus); + Changed = true; + } + } + } + + return Changed; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + Function &F = MF.getFunction(); + if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv())) + return false; + + // This pass only runs on GFX11+ + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (ST.getGeneration() < AMDGPUSubtarget::GFX11) + return false; + + LLVM_DEBUG(dbgs() << "AMDGPUReleaseVGPRs running on " << MF.getName() + << "\n"); + + SII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + + bool Changed = false; + for (auto &MBB : MF) { + Changed |= runOnMachineBasicBlock(MBB); + } + + BlockVMEMStore.clear(); + + return Changed; + } +}; + +} // namespace + +char AMDGPUReleaseVGPRs::ID = 0; + +char &llvm::AMDGPUReleaseVGPRsID = AMDGPUReleaseVGPRs::ID; + +INITIALIZE_PASS(AMDGPUReleaseVGPRs, DEBUG_TYPE, "Release VGPRs", false, false) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp index 2475b44b42a3..4d7a3f4028e8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp @@ -83,7 +83,7 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "Utils/AMDGPULDSUtils.h" +#include "Utils/AMDGPUMemoryUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" @@ -442,7 +442,7 @@ class CollectReachableCallees { continue; for (const auto &GI : *CGN) { - auto *RCB = cast<CallBase>(GI.first.getValue()); + auto *RCB = cast<CallBase>(*GI.first); auto *RCGN = GI.second; if (auto *DCallee = RCGN->getFunction()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index cb511e5e3483..f7f93c75c870 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -27,7 +27,9 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/CallGraph.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" @@ -87,9 +89,7 @@ int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs( int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs( const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const { - if (ST.hasGFX90AInsts() && ArgNumAGPR) - return alignTo(ArgNumVGPR, 4) + ArgNumAGPR; - return std::max(ArgNumVGPR, ArgNumAGPR); + return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR); } int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs( @@ -97,28 +97,31 @@ int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs( return getTotalNumVGPRs(ST, NumAGPR, NumVGPR); } -bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) { +bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) { auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); if (!TPC) return false; + MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); const TargetMachine &TM
= TPC->getTM<TargetMachine>(); bool HasIndirectCall = false; - for (CallGraphNode *I : SCC) { - Function *F = I->getFunction(); + CallGraph CG = CallGraph(M); + auto End = po_end(&CG); + + for (auto IT = po_begin(&CG); IT != End; ++IT) { + Function *F = IT->getFunction(); if (!F || F->isDeclaration()) continue; - MachineModuleInfo &MMI = - getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); - MachineFunction &MF = MMI.getOrCreateMachineFunction(*F); + MachineFunction *MF = MMI.getMachineFunction(*F); + assert(MF && "function must have been generated already"); auto CI = CallGraphResourceInfo.insert( - std::make_pair(&MF.getFunction(), SIFunctionResourceInfo())); + std::make_pair(F, SIFunctionResourceInfo())); SIFunctionResourceInfo &Info = CI.first->second; assert(CI.second && "should only be called once per function"); - Info = analyzeResourceUsage(MF, TM); + Info = analyzeResourceUsage(*MF, TM); HasIndirectCall |= Info.HasIndirectCall; } @@ -246,6 +249,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( case AMDGPU::SRC_PRIVATE_BASE: case AMDGPU::SRC_PRIVATE_LIMIT: case AMDGPU::SGPR_NULL: + case AMDGPU::SGPR_NULL64: case AMDGPU::MODE: continue; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h index b0a2d3bffc62..df0789e471c1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h @@ -24,7 +24,7 @@ class GCNSubtarget; class MachineFunction; class TargetMachine; -struct AMDGPUResourceUsageAnalysis : public CallGraphSCCPass { +struct AMDGPUResourceUsageAnalysis : public ModulePass { static char ID; public: @@ -50,15 +50,15 @@ public: int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const; }; - AMDGPUResourceUsageAnalysis() : CallGraphSCCPass(ID) {} + AMDGPUResourceUsageAnalysis() : ModulePass(ID) {} - bool runOnSCC(CallGraphSCC &SCC) override; - - bool doInitialization(CallGraph &CG) override { + bool doInitialization(Module &M) override { CallGraphResourceInfo.clear(); - return CallGraphSCCPass::doInitialization(CG); + return ModulePass::doInitialization(M); } + bool runOnModule(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineModuleInfoWrapperPass>(); AU.setPreservesAll(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index 1c6c63dd5b25..4f8a61a77097 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -83,12 +83,8 @@ private: const DataLayout *DL = nullptr; MemoryDependenceResults *MDA = nullptr; - bool checkArgumentUses(Value &Arg) const; - bool isOutArgumentCandidate(Argument &Arg) const; - -#ifndef NDEBUG - bool isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const; -#endif + Type *getStoredType(Value &Arg) const; + Type *getOutArgumentType(Argument &Arg) const; public: static char ID; @@ -114,72 +110,61 @@ INITIALIZE_PASS_END(AMDGPURewriteOutArguments, DEBUG_TYPE, char AMDGPURewriteOutArguments::ID = 0; -bool AMDGPURewriteOutArguments::checkArgumentUses(Value &Arg) const { +Type *AMDGPURewriteOutArguments::getStoredType(Value &Arg) const { const int MaxUses = 10; int UseCount = 0; - for (Use &U : Arg.uses()) { - StoreInst *SI = dyn_cast<StoreInst>(U.getUser()); - if (UseCount > MaxUses) - return false; - - if (!SI) { - auto *BCI = dyn_cast<BitCastInst>(U.getUser()); - if (!BCI || !BCI->hasOneUse()) - return false; + SmallVector<Use *> Worklist; 
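// (Editor's note, not part of the patch.) The loop below is a small worklist
// walk over the argument's transitive users: bitcasts are looked through by
// enqueueing their uses, a simple store *through* the pointer records the
// type of the stored value, and any other user -- or a second, distinct
// stored type -- rejects the argument by returning nullptr.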
+ for (Use &U : Arg.uses()) + Worklist.push_back(&U); - // We don't handle multiple stores currently, so stores to aggregate - // pointers aren't worth the trouble since they are canonically split up. - Type *DestEltTy = BCI->getType()->getPointerElementType(); - if (DestEltTy->isAggregateType()) - return false; + Type *StoredType = nullptr; + while (!Worklist.empty()) { + Use *U = Worklist.pop_back_val(); - // We could handle these if we had a convenient way to bitcast between - // them. - Type *SrcEltTy = Arg.getType()->getPointerElementType(); - if (SrcEltTy->isArrayTy()) - return false; + if (auto *BCI = dyn_cast<BitCastInst>(U->getUser())) { + for (Use &U : BCI->uses()) + Worklist.push_back(&U); + continue; + } - // Special case handle structs with single members. It is useful to handle - // some casts between structs and non-structs, but we can't bitcast - // directly between them. Blender uses some casts that look like - // { <3 x float> }* to <4 x float>* - if ((SrcEltTy->isStructTy() && (SrcEltTy->getStructNumElements() != 1))) - return false; + if (auto *SI = dyn_cast<StoreInst>(U->getUser())) { + if (UseCount++ > MaxUses) + return nullptr; - // Clang emits OpenCL 3-vector type accesses with a bitcast to the - // equivalent 4-element vector and accesses that, and we're looking for - // this pointer cast. - if (DL->getTypeAllocSize(SrcEltTy) != DL->getTypeAllocSize(DestEltTy)) - return false; + if (!SI->isSimple() || + U->getOperandNo() != StoreInst::getPointerOperandIndex()) + return nullptr; - return checkArgumentUses(*BCI); + if (StoredType && StoredType != SI->getValueOperand()->getType()) + return nullptr; // More than one type. + StoredType = SI->getValueOperand()->getType(); + continue; } - if (!SI->isSimple() || - U.getOperandNo() != StoreInst::getPointerOperandIndex()) - return false; - - ++UseCount; + // Unsupported user. + return nullptr; } - // Skip unused arguments. - return UseCount > 0; + return StoredType; } -bool AMDGPURewriteOutArguments::isOutArgumentCandidate(Argument &Arg) const { +Type *AMDGPURewriteOutArguments::getOutArgumentType(Argument &Arg) const { const unsigned MaxOutArgSizeBytes = 4 * MaxNumRetRegs; PointerType *ArgTy = dyn_cast<PointerType>(Arg.getType()); // TODO: It might be useful for any out arguments, not just privates. 
if (!ArgTy || (ArgTy->getAddressSpace() != DL->getAllocaAddrSpace() && !AnyAddressSpace) || - Arg.hasByValAttr() || Arg.hasStructRetAttr() || - DL->getTypeStoreSize(ArgTy->getPointerElementType()) > MaxOutArgSizeBytes) { - return false; + Arg.hasByValAttr() || Arg.hasStructRetAttr()) { + return nullptr; } - return checkArgumentUses(Arg); + Type *StoredType = getStoredType(Arg); + if (!StoredType || DL->getTypeStoreSize(StoredType) > MaxOutArgSizeBytes) + return nullptr; + + return StoredType; } bool AMDGPURewriteOutArguments::doInitialization(Module &M) { @@ -187,22 +172,6 @@ bool AMDGPURewriteOutArguments::doInitialization(Module &M) { return false; } -#ifndef NDEBUG -bool AMDGPURewriteOutArguments::isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const { - auto *VT0 = dyn_cast<FixedVectorType>(Ty0); - auto *VT1 = dyn_cast<FixedVectorType>(Ty1); - if (!VT0 || !VT1) - return false; - - if (VT0->getNumElements() != 3 || - VT1->getNumElements() != 4) - return false; - - return DL->getTypeSizeInBits(VT0->getElementType()) == - DL->getTypeSizeInBits(VT1->getElementType()); -} -#endif - bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (skipFunction(F)) return false; @@ -215,7 +184,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { MDA = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); unsigned ReturnNumRegs = 0; - SmallSet<int, 4> OutArgIndexes; + SmallDenseMap<int, Type *, 4> OutArgIndexes; SmallVector<Type *, 4> ReturnTypes; Type *RetTy = F.getReturnType(); if (!RetTy->isVoidTy()) { @@ -227,12 +196,12 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { ReturnTypes.push_back(RetTy); } - SmallVector<Argument *, 4> OutArgs; + SmallVector<std::pair<Argument *, Type *>, 4> OutArgs; for (Argument &Arg : F.args()) { - if (isOutArgumentCandidate(Arg)) { + if (Type *Ty = getOutArgumentType(Arg)) { LLVM_DEBUG(dbgs() << "Found possible out argument " << Arg << " in function " << F.getName() << '\n'); - OutArgs.push_back(&Arg); + OutArgs.push_back({&Arg, Ty}); } } @@ -264,11 +233,12 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { // first. On the second iteration we've removed that out clobbering argument // (by effectively moving it into another function) and will find the second // argument is OK to move. - for (Argument *OutArg : OutArgs) { + for (const auto &Pair : OutArgs) { bool ThisReplaceable = true; SmallVector<std::pair<ReturnInst *, StoreInst *>, 4> ReplaceableStores; - Type *ArgTy = OutArg->getType()->getPointerElementType(); + Argument *OutArg = Pair.first; + Type *ArgTy = Pair.second; // Skip this argument if converting it will push us over the register // count to return limit. 
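For readers skimming the hunks, the overall transform this pass performs is the classic out-parameter-to-return-value rewrite. In C++ terms (editor's illustration only; the pass operates on LLVM IR):

// Before: the result leaves the function through a pointer "out" argument.
static void computeBefore(float *Out) { *Out = 42.0f; }

// After: the stored value is returned instead. getStoredType() above is what
// proves that a single type (here float) is ever stored through the argument,
// so the new return slot has a well-defined type.
static float computeAfter() { return 42.0f; }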
@@ -324,7 +294,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (ThisReplaceable) { ReturnTypes.push_back(ArgTy); - OutArgIndexes.insert(OutArg->getArgNo()); + OutArgIndexes.insert({OutArg->getArgNo(), ArgTy}); ++NumOutArgumentsReplaced; Changing = true; } @@ -376,32 +346,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (RetVal) NewRetVal = B.CreateInsertValue(NewRetVal, RetVal, RetIdx++); - for (std::pair<Argument *, Value *> ReturnPoint : Replacement.second) { - Argument *Arg = ReturnPoint.first; - Value *Val = ReturnPoint.second; - Type *EltTy = Arg->getType()->getPointerElementType(); - if (Val->getType() != EltTy) { - Type *EffectiveEltTy = EltTy; - if (StructType *CT = dyn_cast<StructType>(EltTy)) { - assert(CT->getNumElements() == 1); - EffectiveEltTy = CT->getElementType(0); - } - - if (DL->getTypeSizeInBits(EffectiveEltTy) != - DL->getTypeSizeInBits(Val->getType())) { - assert(isVec3ToVec4Shuffle(EffectiveEltTy, Val->getType())); - Val = B.CreateShuffleVector(Val, ArrayRef<int>{0, 1, 2}); - } - - Val = B.CreateBitCast(Val, EffectiveEltTy); - - // Re-create single element composite. - if (EltTy != EffectiveEltTy) - Val = B.CreateInsertValue(UndefValue::get(EltTy), Val, 0); - } - - NewRetVal = B.CreateInsertValue(NewRetVal, Val, RetIdx++); - } + for (std::pair<Argument *, Value *> ReturnPoint : Replacement.second) + NewRetVal = B.CreateInsertValue(NewRetVal, ReturnPoint.second, RetIdx++); if (RetVal) RI->setOperand(0, NewRetVal); @@ -433,7 +379,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { PointerType *ArgType = cast<PointerType>(Arg.getType()); - auto *EltTy = ArgType->getPointerElementType(); + Type *EltTy = OutArgIndexes[Arg.getArgNo()]; const auto Align = DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index afe016731395..8297635d7bb2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -39,7 +39,8 @@ class GcnBufferFormatBase<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bi } class Gfx9BufferFormat<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> : GcnBufferFormatBase<f, bpc, numc, nfmt, dfmt>; -class Gfx10PlusBufferFormat<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> : GcnBufferFormatBase<f, bpc, numc, nfmt, dfmt>; +class Gfx10BufferFormat<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> : GcnBufferFormatBase<f, bpc, numc, nfmt, dfmt>; +class Gfx11PlusBufferFormat<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> : GcnBufferFormatBase<f, bpc, numc, nfmt, dfmt>; class GcnBufferFormatTable : GenericTable { let CppTypeName = "GcnBufferFormatInfo"; @@ -51,17 +52,25 @@ def Gfx9BufferFormat : GcnBufferFormatTable { let FilterClass = "Gfx9BufferFormat"; let PrimaryKeyName = "getGfx9BufferFormatInfo"; } -def Gfx10PlusBufferFormat : GcnBufferFormatTable { - let FilterClass = "Gfx10PlusBufferFormat"; - let PrimaryKeyName = "getGfx10PlusBufferFormatInfo"; +def Gfx10BufferFormat : GcnBufferFormatTable { + let FilterClass = "Gfx10BufferFormat"; + let PrimaryKeyName = "getGfx10BufferFormatInfo"; +} +def Gfx11PlusBufferFormat : GcnBufferFormatTable { + let FilterClass = "Gfx11PlusBufferFormat"; + let PrimaryKeyName = "getGfx11PlusBufferFormatInfo"; } def getGfx9BufferFormatInfo : SearchIndex { let Table = Gfx9BufferFormat; let Key = ["Format"]; } -def 
getGfx10PlusBufferFormatInfo : SearchIndex { - let Table = Gfx10PlusBufferFormat; +def getGfx10BufferFormatInfo : SearchIndex { + let Table = Gfx10BufferFormat; + let Key = ["Format"]; +} +def getGfx11PlusBufferFormatInfo : SearchIndex { + let Table = Gfx11PlusBufferFormat; let Key = ["Format"]; } @@ -119,57 +128,87 @@ def : Gfx9BufferFormat< /*FORMAT_32_32_32_32_SINT*/ 0x5E, 32, 4, /*NUM_FORMA def : Gfx9BufferFormat< /*FORMAT_32_32_32_32_FLOAT*/ 0x7E, 32, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32_32*/ 14>; // Buffer formats with equal component sizes (GFX10 and later) -def : Gfx10PlusBufferFormat< /*FORMAT_8_UNORM*/ 0x01, 8, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8*/ 1>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_SNORM*/ 0x02, 8, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8*/ 1>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_USCALED*/ 0x03, 8, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8*/ 1>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_SSCALED*/ 0x04, 8, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8*/ 1>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_UINT*/ 0x05, 8, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8*/ 1>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_SINT*/ 0x06, 8, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8*/ 1>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_UNORM*/ 0x07, 16, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16*/ 2>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_SNORM*/ 0x08, 16, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16*/ 2>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_USCALED*/ 0x09, 16, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16*/ 2>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_SSCALED*/ 0x0A, 16, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16*/ 2>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_UINT*/ 0x0B, 16, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16*/ 2>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_SINT*/ 0x0C, 16, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16*/ 2>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_FLOAT*/ 0x0D, 16, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16*/ 2>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_UNORM*/ 0x0E, 8, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8*/ 3>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_SNORM*/ 0x0F, 8, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8*/ 3>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_USCALED*/ 0x10, 8, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8*/ 3>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_SSCALED*/ 0x11, 8, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8*/ 3>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_UINT*/ 0x12, 8, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8*/ 3>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_SINT*/ 0x13, 8, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8*/ 3>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_UINT*/ 0x14, 32, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32*/ 4>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_SINT*/ 0x15, 32, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32*/ 4>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_FLOAT*/ 0x16, 32, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32*/ 4>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_UNORM*/ 0x17, 16, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16*/ 5>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_SNORM*/ 0x18, 16, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16*/ 5>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_USCALED*/ 0x19, 16, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16*/ 5>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_SSCALED*/ 0x1A, 16, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16*/ 5>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_UINT*/ 0x1B, 16, 2, 
/*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16*/ 5>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_SINT*/ 0x1C, 16, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16*/ 5>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_FLOAT*/ 0x1D, 16, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16*/ 5>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_UNORM*/ 0x38, 8, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8_8_8*/ 10>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_SNORM*/ 0x39, 8, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8_8_8*/ 10>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_USCALED*/ 0x3A, 8, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8_8_8*/ 10>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_SSCALED*/ 0x3B, 8, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8_8_8*/ 10>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_UINT*/ 0x3C, 8, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8_8_8*/ 10>; -def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_SINT*/ 0x3D, 8, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8_8_8*/ 10>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_UINT*/ 0x3E, 32, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32*/ 11>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_SINT*/ 0x3F, 32, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32*/ 11>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_FLOAT*/ 0x40, 32, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32*/ 11>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_UNORM*/ 0x41, 16, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16_16_16*/ 12>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_SNORM*/ 0x42, 16, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16_16_16*/ 12>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_USCALED*/ 0x43, 16, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16_16_16*/ 12>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_SSCALED*/ 0x44, 16, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16_16_16*/ 12>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_UINT*/ 0x45, 16, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16_16_16*/ 12>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_SINT*/ 0x46, 16, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16_16_16*/ 12>; -def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_FLOAT*/ 0x47, 16, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16_16_16*/ 12>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_UINT*/ 0x48, 32, 3, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32*/ 13>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_SINT*/ 0x49, 32, 3, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32*/ 13>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_FLOAT*/ 0x4A, 32, 3, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32*/ 13>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_32_UINT*/ 0x4B, 32, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32_32*/ 14>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_32_SINT*/ 0x4C, 32, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32_32*/ 14>; -def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_32_FLOAT*/ 0x4D, 32, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32_32*/ 14>; +multiclass Gfx10PlusBufferFormat<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> { + def : Gfx10BufferFormat<f, bpc, numc, nfmt, dfmt>; + def : Gfx11PlusBufferFormat<f, bpc, numc, nfmt, dfmt>; +} +defm : Gfx10PlusBufferFormat< /*FORMAT_8_UNORM*/ 0x01, 8, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8*/ 1>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_SNORM*/ 0x02, 8, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8*/ 1>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_USCALED*/ 0x03, 8, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8*/ 1>; +defm 
: Gfx10PlusBufferFormat< /*FORMAT_8_SSCALED*/ 0x04, 8, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8*/ 1>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_UINT*/ 0x05, 8, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8*/ 1>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_SINT*/ 0x06, 8, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8*/ 1>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_UNORM*/ 0x07, 16, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16*/ 2>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_SNORM*/ 0x08, 16, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16*/ 2>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_USCALED*/ 0x09, 16, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16*/ 2>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_SSCALED*/ 0x0A, 16, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16*/ 2>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_UINT*/ 0x0B, 16, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16*/ 2>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_SINT*/ 0x0C, 16, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16*/ 2>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_FLOAT*/ 0x0D, 16, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16*/ 2>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_UNORM*/ 0x0E, 8, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8*/ 3>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_SNORM*/ 0x0F, 8, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8*/ 3>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_USCALED*/ 0x10, 8, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8*/ 3>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_SSCALED*/ 0x11, 8, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8*/ 3>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_UINT*/ 0x12, 8, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8*/ 3>; +defm : Gfx10PlusBufferFormat< /*FORMAT_8_8_SINT*/ 0x13, 8, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8*/ 3>; +defm : Gfx10PlusBufferFormat< /*FORMAT_32_UINT*/ 0x14, 32, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32*/ 4>; +defm : Gfx10PlusBufferFormat< /*FORMAT_32_SINT*/ 0x15, 32, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32*/ 4>; +defm : Gfx10PlusBufferFormat< /*FORMAT_32_FLOAT*/ 0x16, 32, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32*/ 4>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_UNORM*/ 0x17, 16, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16*/ 5>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_SNORM*/ 0x18, 16, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16*/ 5>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_USCALED*/ 0x19, 16, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16*/ 5>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_SSCALED*/ 0x1A, 16, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16*/ 5>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_UINT*/ 0x1B, 16, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16*/ 5>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_SINT*/ 0x1C, 16, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16*/ 5>; +defm : Gfx10PlusBufferFormat< /*FORMAT_16_16_FLOAT*/ 0x1D, 16, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16*/ 5>; + +// Buffer formats with equal component sizes (GFX10 only) +def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_UNORM*/ 0x38, 8, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_SNORM*/ 0x39, 8, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_USCALED*/ 0x3A, 8, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_SSCALED*/ 0x3B, 8, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_UINT*/ 0x3C, 8, 4, /*NUM_FORMAT_UINT*/ 4, 
/*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx10BufferFormat< /*FORMAT_8_8_8_8_SINT*/ 0x3D, 8, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx10BufferFormat< /*FORMAT_32_32_UINT*/ 0x3E, 32, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32*/ 11>; +def : Gfx10BufferFormat< /*FORMAT_32_32_SINT*/ 0x3F, 32, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32*/ 11>; +def : Gfx10BufferFormat< /*FORMAT_32_32_FLOAT*/ 0x40, 32, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32*/ 11>; +def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_UNORM*/ 0x41, 16, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_SNORM*/ 0x42, 16, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_USCALED*/ 0x43, 16, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_SSCALED*/ 0x44, 16, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_UINT*/ 0x45, 16, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_SINT*/ 0x46, 16, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx10BufferFormat< /*FORMAT_16_16_16_16_FLOAT*/ 0x47, 16, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx10BufferFormat< /*FORMAT_32_32_32_UINT*/ 0x48, 32, 3, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32*/ 13>; +def : Gfx10BufferFormat< /*FORMAT_32_32_32_SINT*/ 0x49, 32, 3, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32*/ 13>; +def : Gfx10BufferFormat< /*FORMAT_32_32_32_FLOAT*/ 0x4A, 32, 3, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32*/ 13>; +def : Gfx10BufferFormat< /*FORMAT_32_32_32_32_UINT*/ 0x4B, 32, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32_32*/ 14>; +def : Gfx10BufferFormat< /*FORMAT_32_32_32_32_SINT*/ 0x4C, 32, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32_32*/ 14>; +def : Gfx10BufferFormat< /*FORMAT_32_32_32_32_FLOAT*/ 0x4D, 32, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32_32*/ 14>; + +// Buffer formats with equal component sizes (GFX11 and later) +def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_UNORM*/ 0x2A, 8, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_SNORM*/ 0x2B, 8, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_USCALED*/ 0x2C, 8, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_SSCALED*/ 0x2D, 8, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_UINT*/ 0x2E, 8, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx11PlusBufferFormat< /*FORMAT_8_8_8_8_SINT*/ 0x2F, 8, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8_8_8*/ 10>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_UINT*/ 0x30, 32, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32*/ 11>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_SINT*/ 0x31, 32, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32*/ 11>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_FLOAT*/ 0x32, 32, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32*/ 11>; +def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_UNORM*/ 0x33, 16, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_SNORM*/ 0x34, 16, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_USCALED*/ 0x35, 16, 4, 
/*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_SSCALED*/ 0x36, 16, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_UINT*/ 0x37, 16, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_SINT*/ 0x38, 16, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx11PlusBufferFormat< /*FORMAT_16_16_16_16_FLOAT*/ 0x39, 16, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16_16_16*/ 12>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_UINT*/ 0x3A, 32, 3, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32*/ 13>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_SINT*/ 0x3B, 32, 3, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32*/ 13>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_FLOAT*/ 0x3C, 32, 3, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32*/ 13>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_32_UINT*/ 0x3D, 32, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32_32*/ 14>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_32_SINT*/ 0x3E, 32, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32_32*/ 14>; +def : Gfx11PlusBufferFormat< /*FORMAT_32_32_32_32_FLOAT*/ 0x3F, 32, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32_32*/ 14>; class SourceOfDivergence<Intrinsic intr> { Intrinsic Intr = intr; @@ -191,6 +230,8 @@ def : SourceOfDivergence<int_amdgcn_interp_p1>; def : SourceOfDivergence<int_amdgcn_interp_p2>; def : SourceOfDivergence<int_amdgcn_interp_p1_f16>; def : SourceOfDivergence<int_amdgcn_interp_p2_f16>; +def : SourceOfDivergence<int_amdgcn_lds_direct_load>; +def : SourceOfDivergence<int_amdgcn_lds_param_load>; def : SourceOfDivergence<int_amdgcn_mbcnt_hi>; def : SourceOfDivergence<int_amdgcn_mbcnt_lo>; def : SourceOfDivergence<int_r600_read_tidig_x>; @@ -205,9 +246,12 @@ def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>; +def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>; +def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_ds_fadd>; def : SourceOfDivergence<int_amdgcn_ds_fmin>; def : SourceOfDivergence<int_amdgcn_ds_fmax>; +def : SourceOfDivergence<int_amdgcn_ds_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>; def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>; def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>; @@ -292,6 +336,16 @@ def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8bf16_1k>; def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16bf16_1k>; def : SourceOfDivergence<int_amdgcn_mfma_f64_16x16x4f64>; def : SourceOfDivergence<int_amdgcn_mfma_f64_4x4x4f64>; +def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x32_i8>; +def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x16_i8>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8_xf32>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4_xf32>; +def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_f16>; +def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_f16>; +def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_bf16>; +def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_bf16>; +def : SourceOfDivergence<int_amdgcn_smfmac_i32_16x16x64_i8>; +def : SourceOfDivergence<int_amdgcn_smfmac_i32_32x32x32_i8>; // The dummy boolean output is divergent from the IR's perspective, // but the 
mask results are uniform. These produce a divergent and diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp new file mode 100644 index 000000000000..34702ee6623b --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp @@ -0,0 +1,166 @@ +//===- AMDGPUSetWavePriority.cpp - Set wave priority ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Pass to temporarily raise the wave priority from the start of +/// the shader function until its last VMEM instructions, to allow younger +/// waves to issue their VMEM instructions as well. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Allocator.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-set-wave-priority" + +namespace { + +struct MBBInfo { + MBBInfo() = default; + bool MayReachVMEMLoad = false; +}; + +using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>; + +class AMDGPUSetWavePriority : public MachineFunctionPass { +public: + static char ID; + + AMDGPUSetWavePriority() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { return "Set wave priority"; } + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned priority) const; + + const SIInstrInfo *TII; +}; + +} // End anonymous namespace. + +INITIALIZE_PASS(AMDGPUSetWavePriority, DEBUG_TYPE, "Set wave priority", false, + false) + +char AMDGPUSetWavePriority::ID = 0; + +FunctionPass *llvm::createAMDGPUSetWavePriorityPass() { + return new AMDGPUSetWavePriority(); +} + +MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF, + unsigned priority) const { + return BuildMI(MF, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)).addImm(priority); +} + +// Checks that for every predecessor Pred that can reach a VMEM load, +// none of Pred's successors can reach a VMEM load. +static bool CanLowerPriorityDirectlyInPredecessors(const MachineBasicBlock &MBB, + MBBInfoSet &MBBInfos) { + for (const MachineBasicBlock *Pred : MBB.predecessors()) { + if (!MBBInfos[Pred].MayReachVMEMLoad) + continue; + for (const MachineBasicBlock *Succ : Pred->successors()) { + if (MBBInfos[Succ].MayReachVMEMLoad) + return false; + } + } + return true; +} + +static bool isVMEMLoad(const MachineInstr &MI) { + return SIInstrInfo::isVMEM(MI) && MI.mayLoad(); +} + +bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) { + const unsigned HighPriority = 3; + const unsigned LowPriority = 0; + + Function &F = MF.getFunction(); + if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv())) + return false; + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + + MBBInfoSet MBBInfos; + SmallVector<const MachineBasicBlock *, 16> Worklist; + for (MachineBasicBlock &MBB : MF) { + if (any_of(MBB, isVMEMLoad)) + Worklist.push_back(&MBB); + } + + // Mark blocks from which control may reach VMEM loads.
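The worklist loop that follows is a backward reachability computation: every block that contains a VMEM load seeds the worklist, and the MayReachVMEMLoad flag is then propagated to predecessors until a fixed point is reached. A minimal standalone sketch of the same marking on a toy CFG of predecessor lists (the names Preds, HasLoad, and MayReach are illustrative, not part of the patch):

#include <cstddef>
#include <vector>

// Preds[B] lists the predecessors of block B; HasLoad[B] says whether block B
// itself contains a VMEM load. MayReach[B] ends up true exactly when control
// starting at B can reach a VMEM load.
std::vector<bool> markMayReachVMEMLoad(
    const std::vector<std::vector<std::size_t>> &Preds,
    const std::vector<bool> &HasLoad) {
  std::vector<bool> MayReach(Preds.size(), false);
  std::vector<std::size_t> Worklist;
  for (std::size_t B = 0; B != Preds.size(); ++B)
    if (HasLoad[B])
      Worklist.push_back(B); // seed with the blocks that load directly
  while (!Worklist.empty()) {
    std::size_t B = Worklist.back();
    Worklist.pop_back();
    if (MayReach[B])
      continue; // already marked; this is what guarantees termination
    MayReach[B] = true;
    // Anything that can branch to B can also reach the load.
    Worklist.insert(Worklist.end(), Preds[B].begin(), Preds[B].end());
  }
  return MayReach;
}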
+ while (!Worklist.empty()) { + const MachineBasicBlock *MBB = Worklist.pop_back_val(); + MBBInfo &Info = MBBInfos[MBB]; + if (!Info.MayReachVMEMLoad) { + Info.MayReachVMEMLoad = true; + Worklist.append(MBB->pred_begin(), MBB->pred_end()); + } + } + + MachineBasicBlock &Entry = MF.front(); + if (!MBBInfos[&Entry].MayReachVMEMLoad) + return false; + + // Raise the priority at the beginning of the shader. + MachineBasicBlock::iterator I = Entry.begin(), E = Entry.end(); + while (I != E && !SIInstrInfo::isVALU(*I) && !I->isTerminator()) + ++I; + Entry.insert(I, BuildSetprioMI(MF, HighPriority)); + + // Lower the priority on edges where control leaves blocks from which + // VMEM loads are reachable. + SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks; + for (MachineBasicBlock &MBB : MF) { + if (MBBInfos[&MBB].MayReachVMEMLoad) { + if (MBB.succ_empty()) + PriorityLoweringBlocks.insert(&MBB); + continue; + } + + if (CanLowerPriorityDirectlyInPredecessors(MBB, MBBInfos)) { + for (MachineBasicBlock *Pred : MBB.predecessors()) { + if (MBBInfos[Pred].MayReachVMEMLoad) + PriorityLoweringBlocks.insert(Pred); + } + continue; + } + + // Where lowering the priority in predecessors is not possible, the + // block receiving control either was not part of a loop in the first + // place or the loop simplification/canonicalization pass should have + // already tried to split the edge and insert a preheader, and if for + // whatever reason it failed to do so, then this leaves us with the + // only option of lowering the priority within the loop. + PriorityLoweringBlocks.insert(&MBB); + } + + for (MachineBasicBlock *MBB : PriorityLoweringBlocks) { + MachineBasicBlock::iterator I = MBB->end(), B = MBB->begin(); + while (I != B) { + if (isVMEMLoad(*--I)) { + ++I; + break; + } + } + MBB->insert(I, BuildSetprioMI(MF, LowPriority)); + } + + return true; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index e82f9232b114..77816a783630 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -50,11 +50,6 @@ static cl::opt<bool> EnableVGPRIndexMode( cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false)); -static cl::opt<bool> EnableFlatScratch( - "amdgpu-enable-flat-scratch", - cl::desc("Use flat scratch instructions"), - cl::init(false)); - static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen", cl::desc("Enable the use of AA during codegen."), cl::init(true)); @@ -159,26 +154,7 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, return *this; } -AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : - TargetTriple(TT), - GCN3Encoding(false), - Has16BitInsts(false), - HasMadMixInsts(false), - HasMadMacF32Insts(false), - HasDsSrc2Insts(false), - HasSDWA(false), - HasVOP3PInsts(false), - HasMulI24(true), - HasMulU24(true), - HasSMulHi(false), - HasInv2PiInlineImm(false), - HasFminFmaxLegacy(true), - EnablePromoteAlloca(false), - HasTrigReducedRange(false), - MaxWavesPerEU(10), - LocalMemorySize(0), - WavefrontSizeLog2(0) - { } +AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {} GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM) @@ -187,120 +163,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, AMDGPUSubtarget(TT), TargetTriple(TT), TargetID(*this), - Gen(INVALID), InstrItins(getInstrItineraryForCPU(GPU)), - LDSBankCount(0), - MaxPrivateElementSize(0), - - 
FastFMAF32(false), - FastDenormalF32(false), - HalfRate64Ops(false), - FullRate64Ops(false), - - FlatForGlobal(false), - AutoWaitcntBeforeBarrier(false), - UnalignedScratchAccess(false), - UnalignedAccessMode(false), - - HasApertureRegs(false), - SupportsXNACK(false), - EnableXNACK(false), - EnableTgSplit(false), - EnableCuMode(false), - TrapHandler(false), - - EnableLoadStoreOpt(false), - EnableUnsafeDSOffsetFolding(false), - EnableSIScheduler(false), - EnableDS128(false), - EnablePRTStrictNull(false), - DumpCode(false), - - FP64(false), - CIInsts(false), - GFX8Insts(false), - GFX9Insts(false), - GFX90AInsts(false), - GFX10Insts(false), - GFX10_3Insts(false), - GFX7GFX8GFX9Insts(false), - SGPRInitBug(false), - NegativeScratchOffsetBug(false), - NegativeUnalignedScratchOffsetBug(false), - HasSMemRealTime(false), - HasIntClamp(false), - HasFmaMixInsts(false), - HasMovrel(false), - HasVGPRIndexMode(false), - HasScalarStores(false), - HasScalarAtomics(false), - HasSDWAOmod(false), - HasSDWAScalar(false), - HasSDWASdst(false), - HasSDWAMac(false), - HasSDWAOutModsVOPC(false), - HasDPP(false), - HasDPP8(false), - Has64BitDPP(false), - HasPackedFP32Ops(false), - HasExtendedImageInsts(false), - HasR128A16(false), - HasGFX10A16(false), - HasG16(false), - HasNSAEncoding(false), - NSAMaxSize(0), - GFX10_AEncoding(false), - GFX10_BEncoding(false), - HasDLInsts(false), - HasDot1Insts(false), - HasDot2Insts(false), - HasDot3Insts(false), - HasDot4Insts(false), - HasDot5Insts(false), - HasDot6Insts(false), - HasDot7Insts(false), - HasMAIInsts(false), - HasPkFmacF16Inst(false), - HasAtomicFaddInsts(false), - SupportsSRAMECC(false), - EnableSRAMECC(false), - HasNoSdstCMPX(false), - HasVscnt(false), - HasGetWaveIdInst(false), - HasSMemTimeInst(false), - HasShaderCyclesRegister(false), - HasVOP3Literal(false), - HasNoDataDepHazard(false), - FlatAddressSpace(false), - FlatInstOffsets(false), - FlatGlobalInsts(false), - FlatScratchInsts(false), - ScalarFlatScratchInsts(false), - HasArchitectedFlatScratch(false), - AddNoCarryInsts(false), - HasUnpackedD16VMem(false), - LDSMisalignedBug(false), - HasMFMAInlineLiteralBug(false), - UnalignedBufferAccess(false), - UnalignedDSAccess(false), - HasPackedTID(false), - - ScalarizeGlobal(false), - - HasVcmpxPermlaneHazard(false), - HasVMEMtoScalarWriteHazard(false), - HasSMEMtoVectorWriteHazard(false), - HasInstFwdPrefetchBug(false), - HasVcmpxExecWARHazard(false), - HasLdsBranchVmemWARHazard(false), - HasNSAtoVMEMBug(false), - HasNSAClauseBug(false), - HasOffset3fBug(false), - HasFlatSegmentOffsetBug(false), - HasImageStoreD16Bug(false), - HasImageGather4D16Bug(false), - - FeatureDisable(false), InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), TLInfo(TM, *this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { @@ -314,11 +177,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM)); } -bool GCNSubtarget::enableFlatScratch() const { - return flatScratchIsArchitected() || - (EnableFlatScratch && hasFlatScratchInsts()); -} - unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { if (getGeneration() < GFX10) return 1; @@ -326,12 +184,15 @@ unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { switch (Opcode) { case AMDGPU::V_LSHLREV_B64_e64: case AMDGPU::V_LSHLREV_B64_gfx10: + case AMDGPU::V_LSHLREV_B64_e64_gfx11: case AMDGPU::V_LSHL_B64_e64: case AMDGPU::V_LSHRREV_B64_e64: case AMDGPU::V_LSHRREV_B64_gfx10: + case 
AMDGPU::V_LSHRREV_B64_e64_gfx11: case AMDGPU::V_LSHR_B64_e64: case AMDGPU::V_ASHRREV_I64_e64: case AMDGPU::V_ASHRREV_I64_gfx10: + case AMDGPU::V_ASHRREV_I64_e64_gfx11: case AMDGPU::V_ASHR_I64_e64: return 1; } @@ -658,7 +519,8 @@ unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { return 16; // Assume all implicit inputs are used by default - return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56); + unsigned NBytes = (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) ? 256 : 56; + return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", NBytes); } uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, @@ -673,13 +535,11 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, for (const Argument &Arg : F.args()) { const bool IsByRef = Arg.hasByRefAttr(); Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); - MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None; - if (!Alignment) - Alignment = DL.getABITypeAlign(ArgTy); - + Align Alignment = DL.getValueOrABITypeAlignment( + IsByRef ? Arg.getParamAlign() : None, ArgTy); uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize; - MaxAlign = max(MaxAlign, Alignment); + MaxAlign = std::max(MaxAlign, Alignment); } return ExplicitArgBytes; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 7f1b94be4ffe..7400c81effd0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -38,30 +38,32 @@ public: SEA_ISLANDS = 6, VOLCANIC_ISLANDS = 7, GFX9 = 8, - GFX10 = 9 + GFX10 = 9, + GFX11 = 10 }; private: Triple TargetTriple; protected: - bool GCN3Encoding; - bool Has16BitInsts; - bool HasMadMixInsts; - bool HasMadMacF32Insts; - bool HasDsSrc2Insts; - bool HasSDWA; - bool HasVOP3PInsts; - bool HasMulI24; - bool HasMulU24; - bool HasSMulHi; - bool HasInv2PiInlineImm; - bool HasFminFmaxLegacy; - bool EnablePromoteAlloca; - bool HasTrigReducedRange; - unsigned MaxWavesPerEU; - unsigned LocalMemorySize; - char WavefrontSizeLog2; + bool GCN3Encoding = false; + bool Has16BitInsts = false; + bool HasTrue16BitInsts = false; + bool HasMadMixInsts = false; + bool HasMadMacF32Insts = false; + bool HasDsSrc2Insts = false; + bool HasSDWA = false; + bool HasVOP3PInsts = false; + bool HasMulI24 = true; + bool HasMulU24 = true; + bool HasSMulHi = false; + bool HasInv2PiInlineImm = false; + bool HasFminFmaxLegacy = true; + bool EnablePromoteAlloca = false; + bool HasTrigReducedRange = false; + unsigned MaxWavesPerEU = 10; + unsigned LocalMemorySize = 0; + char WavefrontSizeLog2 = 0; public: AMDGPUSubtarget(const Triple &TT); @@ -145,6 +147,8 @@ public: return Has16BitInsts; } + bool hasTrue16BitInsts() const { return HasTrue16BitInsts; } + bool hasMadMixInsts() const { return HasMadMixInsts; } @@ -267,7 +271,7 @@ public: /// \p WavefrontSize. 
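The getExplicitKernArgSize loop above is a plain align-then-accumulate scan over the kernel arguments. A self-contained sketch with concrete numbers, assuming two arguments, an i32 (alloc size 4, ABI alignment 4) followed by a byref struct (size 16, explicit alignment 16); the argument list is illustrative only:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Round Offset up to the next multiple of Align (a power of two), as
// llvm::alignTo does.
static uint64_t alignUp(uint64_t Offset, uint64_t Align) {
  return (Offset + Align - 1) & ~(Align - 1);
}

int main() {
  uint64_t Bytes = 0, MaxAlign = 1;
  // Arg 0: i32, AllocSize = 4, alignment 4.
  Bytes = alignUp(Bytes, 4) + 4;   // 0 -> 4
  MaxAlign = std::max<uint64_t>(MaxAlign, 4);
  // Arg 1: byref 16-byte struct, alignment 16.
  Bytes = alignUp(Bytes, 16) + 16; // 4 rounds up to 16, then +16 -> 32
  MaxAlign = std::max<uint64_t>(MaxAlign, 16);
  assert(Bytes == 32 && MaxAlign == 16);
  return 0;
}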
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const; - virtual ~AMDGPUSubtarget() {} + virtual ~AMDGPUSubtarget() = default; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index a2c61f9da8da..1c6b9d35695a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -16,6 +16,7 @@ #include "AMDGPU.h" #include "AMDGPUAliasAnalysis.h" #include "AMDGPUExportClustering.h" +#include "AMDGPUIGroupLP.h" #include "AMDGPUMacroFusion.h" #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" @@ -27,6 +28,7 @@ #include "SIMachineScheduler.h" #include "TargetInfo/AMDGPUTargetInfo.h" #include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" @@ -56,6 +58,7 @@ #include "llvm/Transforms/Vectorize.h" using namespace llvm; +using namespace llvm::PatternMatch; namespace { class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> { @@ -269,12 +272,22 @@ static cl::opt<bool> EnableSIModeRegisterPass( cl::init(true), cl::Hidden); +// Enable GFX11+ s_delay_alu insertion +static cl::opt<bool> + EnableInsertDelayAlu("amdgpu-enable-delay-alu", + cl::desc("Enable s_delay_alu insertion"), + cl::init(true), cl::Hidden); + // Option is used in lit tests to prevent deadcoding of patterns inspected. static cl::opt<bool> EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden, cl::desc("Enable machine DCE inside regalloc")); +static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority", + cl::desc("Adjust wave priority"), + cl::init(false), cl::Hidden); + static cl::opt<bool> EnableScalarIRPasses( "amdgpu-scalar-ir-passes", cl::desc("Enable scalar IR passes"), @@ -330,7 +343,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIOptimizeExecMaskingPreRAPass(*PR); initializeSIOptimizeVGPRLiveRangePass(*PR); initializeSILoadStoreOptimizerPass(*PR); - initializeAMDGPUFixFunctionBitcastsPass(*PR); initializeAMDGPUCtorDtorLoweringPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); initializeAMDGPUAttributorPass(*PR); @@ -357,6 +369,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); + initializeAMDGPUReleaseVGPRsPass(*PR); + initializeAMDGPUInsertDelayAluPass(*PR); initializeSIInsertHardClausesPass(*PR); initializeSIInsertWaitcntsPass(*PR); initializeSIModeRegisterPass(*PR); @@ -390,9 +404,14 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { + const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + if (ST.shouldClusterStores()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createIGroupLPDAGMutation()); + DAG->addMutation(createSchedBarrierDAGMutation()); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; @@ -400,9 +419,12 @@ 
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { static ScheduleDAGInstrs * createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { + const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); auto DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + if (ST.shouldClusterStores()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); return DAG; } @@ -413,9 +435,12 @@ static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { static ScheduleDAGInstrs * createIterativeILPMachineScheduler(MachineSchedContext *C) { + const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); auto DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + if (ST.shouldClusterStores()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); return DAG; } @@ -801,6 +826,23 @@ AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const { return std::make_pair(nullptr, -1); } +unsigned +AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const { + switch (Kind) { + case PseudoSourceValue::Stack: + case PseudoSourceValue::FixedStack: + return AMDGPUAS::PRIVATE_ADDRESS; + case PseudoSourceValue::ConstantPool: + case PseudoSourceValue::GOT: + case PseudoSourceValue::JumpTable: + case PseudoSourceValue::GlobalValueCallEntry: + case PseudoSourceValue::ExternalSymbolCallEntry: + case PseudoSourceValue::TargetCustom: + return AMDGPUAS::CONSTANT_ADDRESS; + } + return AMDGPUAS::FLAT_ADDRESS; +} + //===----------------------------------------------------------------------===// // GCN Target Machine (SI+) //===----------------------------------------------------------------------===// @@ -836,7 +878,7 @@ GCNTargetMachine::getSubtargetImpl(const Function &F) const { } TargetTransformInfo -GCNTargetMachine::getTargetTransformInfo(const Function &F) { +GCNTargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(GCNTTIImpl(this, F)); } @@ -873,7 +915,11 @@ public: ScheduleDAGMI *DAG = createGenericSchedPostRA(C); const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + if (ST.shouldClusterStores()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); + DAG->addMutation(createIGroupLPDAGMutation()); + DAG->addMutation(createSchedBarrierDAGMutation()); return DAG; } @@ -953,10 +999,6 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createAMDGPUPrintfRuntimeBinding()); addPass(createAMDGPUCtorDtorLoweringPass()); - // This must occur before inlining, as the inliner will not look through - // bitcast calls. - addPass(createAMDGPUFixFunctionBitcastsPass()); - // A call to propagate attributes pass in the backend in case opt was not run. 
addPass(createAMDGPUPropagateAttributesEarlyPass(&TM)); @@ -967,7 +1009,7 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createAlwaysInlinerLegacyPass()); // We need to add the barrier noop pass, otherwise adding the function // inlining pass will cause all of the PassConfigs passes to be run - // one function at a time, which means if we have a nodule with two + // one function at a time, which means if we have a module with two // functions, then we will generate code for the first function // without ever running any passes on the second. addPass(createBarrierNoopPass()); @@ -1079,8 +1121,11 @@ bool AMDGPUPassConfig::addGCPasses() { llvm::ScheduleDAGInstrs * AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const { + const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); ScheduleDAGMILive *DAG = createGenericSchedLive(C); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + if (ST.shouldClusterStores()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); return DAG; } @@ -1363,6 +1408,8 @@ void GCNPassConfig::addPreEmitPass() { addPass(&SIInsertHardClausesID); addPass(&SILateBranchLoweringPassID); + if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less)) + addPass(createAMDGPUSetWavePriorityPass()); if (getOptLevel() > CodeGenOpt::None) addPass(&SIPreEmitPeepholeID); // The hazard recognizer that runs as part of the post-ra scheduler does not @@ -1374,6 +1421,13 @@ // Here we add a stand-alone hazard recognizer pass which can handle all // cases. addPass(&PostRAHazardRecognizerID); + + if (getOptLevel() > CodeGenOpt::Less) + addPass(&AMDGPUReleaseVGPRsID); + + if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less)) + addPass(&AMDGPUInsertDelayAluID); + addPass(&BranchRelaxationPassID); } @@ -1396,7 +1450,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo( const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) const { const yaml::SIMachineFunctionInfo &YamlMFI = - reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_); + static_cast<const yaml::SIMachineFunctionInfo &>(MFI_); MachineFunction &MF = PFS.MF; SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -1420,6 +1474,14 @@ bool GCNTargetMachine::parseMachineFunctionInfo( return false; }; + auto parseOptionalRegister = [&](const yaml::StringValue &RegName, + Register &RegVal) { + return !RegName.Value.empty() && parseRegister(RegName, RegVal); + }; + + if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy)) + return true; + auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) { // Create a diagnostic for the register string literal. const MemoryBuffer &Buffer = @@ -1452,6 +1514,14 @@ return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg); } + for (const auto &YamlReg : YamlMFI.WWMReservedRegs) { + Register ParsedReg; + if (parseRegister(YamlReg, ParsedReg)) + return true; + + MFI->reserveWWMRegister(ParsedReg); + } + auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A, const TargetRegisterClass &RC, ArgDescriptor &Arg, unsigned UserSGPRs, @@ -1473,7 +1543,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo( Arg = ArgDescriptor::createStack(A->StackOffset); // Check and apply the optional mask.
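Each scheduler factory touched in this patch, including createMachineScheduler above, now follows the same pattern: build the DAG, unconditionally add the load-cluster mutation, and add the store-cluster mutation only when the subtarget reports shouldClusterStores(). A condensed sketch of that shared pattern; the helper name addClusterMutations is hypothetical, while the mutation creators and shouldClusterStores() are the APIs used in the hunks:

#include "GCNSubtarget.h"
#include "llvm/CodeGen/MachineScheduler.h"

using namespace llvm;

// Hypothetical helper mirroring the pattern repeated in the factories above.
static void addClusterMutations(ScheduleDAGMI *DAG, const GCNSubtarget &ST) {
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores()) // store clustering is gated per subtarget
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
}

Keeping the store mutation behind shouldClusterStores() lets subtargets where clustered stores are not profitable opt out without duplicating factory code.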
if (A->Mask) - Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue()); + Arg = ArgDescriptor::createArg(Arg, *A->Mask); MFI->NumUserSGPRs += UserSGPRs; MFI->NumSystemSGPRs += SystemSGPRs; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index dd3676f3b707..567cc9d610d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// The AMDGPU TargetMachine interface definition for hw codgen targets. +/// The AMDGPU TargetMachine interface definition for hw codegen targets. // //===----------------------------------------------------------------------===// @@ -64,6 +64,8 @@ public: std::pair<const Value *, unsigned> getPredicatedAddrSpace(const Value *V) const override; + + unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override; }; //===----------------------------------------------------------------------===// @@ -84,7 +86,7 @@ public: const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override; - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; bool useIPRA() const override { return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index a8df7789c8a1..a79cd2e9499e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -288,33 +288,21 @@ GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))), TLI(ST->getTargetLowering()), CommonTTI(TM, F), - IsGraphics(AMDGPU::isGraphics(F.getCallingConv())), - MaxVGPRs(ST->getMaxNumVGPRs( - std::max(ST->getWavesPerEU(F).first, - ST->getWavesPerEUForWorkGroup( - ST->getFlatWorkGroupSizes(F).second)))) { + IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) { AMDGPU::SIModeRegisterDefaults Mode(F); HasFP32Denormals = Mode.allFP32Denormals(); HasFP64FP16Denormals = Mode.allFP64FP16Denormals(); } -unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { - // The concept of vector registers doesn't really exist. Some packed vector - // operations operate on the normal 32-bit registers. - return MaxVGPRs; -} +unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const { + // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector + // registers. See getRegisterClassForType for the implementation. + // In this case vector registers are not vector in terms of + // VGPRs, but those which can hold multiple values. -unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const { // This is really the number of registers to fill when vectorizing / // interleaving loops, so we lie to avoid trying to use all registers. - return getHardwareNumberOfRegisters(Vec) >> 3; -} - -unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const { - const SIRegisterInfo *TRI = ST->getRegisterInfo(); - const TargetRegisterClass *RC = TRI->getRegClass(RCID); - unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32; - return getHardwareNumberOfRegisters(false) / NumVGPRs; + return 4; } TypeSize @@ -410,11 +398,14 @@ bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, // unaligned access is legal? 
// // FIXME: This could use fine tuning and microbenchmarks. -Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, - unsigned SrcAddrSpace, - unsigned DestAddrSpace, - unsigned SrcAlign, - unsigned DestAlign) const { +Type *GCNTTIImpl::getMemcpyLoopLoweringType( + LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, + unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, + Optional<uint32_t> AtomicElementSize) const { + + if (AtomicElementSize) + return Type::getIntNTy(Context, *AtomicElementSize * 8); + unsigned MinAlign = std::min(SrcAlign, DestAlign); // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the @@ -439,11 +430,17 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, } void GCNTTIImpl::getMemcpyLoopResidualLoweringType( - SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context, - unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, - unsigned SrcAlign, unsigned DestAlign) const { + SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context, + unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, + unsigned SrcAlign, unsigned DestAlign, + Optional<uint32_t> AtomicCpySize) const { assert(RemainingBytes < 16); + if (AtomicCpySize) + BaseT::getMemcpyLoopResidualLoweringType( + OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign, + DestAlign, AtomicCpySize); + unsigned MinAlign = std::min(SrcAlign, DestAlign); if (MinAlign != 2) { @@ -1042,7 +1039,8 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT, ArrayRef<int> Mask, - int Index, VectorType *SubTp) { + int Index, VectorType *SubTp, + ArrayRef<const Value *> Args) { Kind = improveShuffleKindFromMask(Kind, Mask); if (ST->hasVOP3PInsts()) { if (cast<FixedVectorType>(VT)->getNumElements() == 2 && diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index e901b5c5747d..f2260c31e678 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -68,7 +68,6 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { bool IsGraphics; bool HasFP32Denormals; bool HasFP64FP16Denormals; - unsigned MaxVGPRs; static const FeatureBitset InlineFeatureIgnoreList; @@ -113,8 +112,6 @@ public: return TTI::PSK_FastHardware; } - unsigned getHardwareNumberOfRegisters(bool Vector) const; - unsigned getNumberOfRegisters(bool Vector) const; unsigned getNumberOfRegisters(unsigned RCID) const; TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const; unsigned getMinVectorRegisterBitWidth() const; @@ -135,15 +132,14 @@ public: unsigned AddrSpace) const; Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, - unsigned SrcAlign, unsigned DestAlign) const; + unsigned SrcAlign, unsigned DestAlign, + Optional<uint32_t> AtomicElementSize) const; - void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut, - LLVMContext &Context, - unsigned RemainingBytes, - unsigned SrcAddrSpace, - unsigned DestAddrSpace, - unsigned SrcAlign, - unsigned DestAlign) const; + void getMemcpyLoopResidualLoweringType( + SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context, + unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, + unsigned SrcAlign, unsigned DestAlign, + Optional<uint32_t> 
AtomicCpySize) const; unsigned getMaxInterleaveFactor(unsigned VF); bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const; @@ -201,7 +197,8 @@ public: InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, int Index, - VectorType *SubTp); + VectorType *SubTp, + ArrayRef<const Value *> Args = None); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; diff --git a/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h b/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h index 654153ea5151..8e5f966b7c6c 100644 --- a/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h +++ b/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h @@ -142,7 +142,7 @@ enum amd_code_property_mask_t { /// is provided to the finalizer when it is invoked and is recorded /// here. The hardware will interleave the memory requests of each /// lane of a wavefront by this element size to ensure each - /// work-item gets a distinct memory memory location. Therefore, the + /// work-item gets a distinct memory location. Therefore, the /// finalizer ensures that all load and store operations done to /// private memory do not exceed this size. For example, if the /// element size is 4 (32-bits or dword) and a 64-bit value must be diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ffe626513d47..e12d0ffef35c 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -20,10 +20,13 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" @@ -33,6 +36,7 @@ #include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Casting.h" #include "llvm/Support/MachineValueType.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetParser.h" using namespace llvm; @@ -120,12 +124,6 @@ public: ImmTyD16, ImmTyClampSI, ImmTyOModSI, - ImmTyDPP8, - ImmTyDppCtrl, - ImmTyDppRowMask, - ImmTyDppBankMask, - ImmTyDppBoundCtrl, - ImmTyDppFi, ImmTySdwaDstSel, ImmTySdwaSrc0Sel, ImmTySdwaSrc1Sel, @@ -151,6 +149,12 @@ public: ImmTyOpSelHi, ImmTyNegLo, ImmTyNegHi, + ImmTyDPP8, + ImmTyDppCtrl, + ImmTyDppRowMask, + ImmTyDppBankMask, + ImmTyDppBoundCtrl, + ImmTyDppFi, ImmTySwizzle, ImmTyGprIdxMode, ImmTyHigh, @@ -158,6 +162,8 @@ public: ImmTyCBSZ, ImmTyABID, ImmTyEndpgm, + ImmTyWaitVDST, + ImmTyWaitEXP, }; enum ImmKindTy { @@ -262,6 +268,14 @@ public: return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i32); } + bool isRegOrInlineImmWithInt16InputMods() const { + return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::i16); + } + + bool isRegOrInlineImmWithInt32InputMods() const { + return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::i32); + } + bool isRegOrImmWithInt64InputMods() const { return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::i64); } @@ -278,6 +292,15 @@ public: return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::f64); } + bool isRegOrInlineImmWithFP16InputMods() const { + return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f16); + } + + bool isRegOrInlineImmWithFP32InputMods() const { + return 
isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f32); + } + + bool isVReg() const { return isRegClass(AMDGPU::VGPR_32RegClassID) || isRegClass(AMDGPU::VReg_64RegClassID) || @@ -815,6 +838,8 @@ public: } bool isSWaitCnt() const; + bool isDepCtr() const; + bool isSDelayAlu() const; bool isHwreg() const; bool isSendMsg() const; bool isSwizzle() const; @@ -830,6 +855,8 @@ public: bool isS16Imm() const; bool isU16Imm() const; bool isEndpgm() const; + bool isWaitVDST() const; + bool isWaitEXP() const; StringRef getExpressionAsToken() const { assert(isExpr()); @@ -1037,6 +1064,8 @@ public: case ImmTyCBSZ: OS << "CBSZ"; break; case ImmTyABID: OS << "ABID"; break; case ImmTyEndpgm: OS << "Endpgm"; break; + case ImmTyWaitVDST: OS << "WaitVDST"; break; + case ImmTyWaitEXP: OS << "WaitEXP"; break; } } @@ -1123,7 +1152,9 @@ raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) { class KernelScopeInfo { int SgprIndexUnusedMin = -1; int VgprIndexUnusedMin = -1; + int AgprIndexUnusedMin = -1; MCContext *Ctx = nullptr; + MCSubtargetInfo const *MSTI = nullptr; void usesSgprAt(int i) { if (i >= SgprIndexUnusedMin) { @@ -1142,7 +1173,31 @@ class KernelScopeInfo { if (Ctx) { MCSymbol* const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count")); - Sym->setVariableValue(MCConstantExpr::create(VgprIndexUnusedMin, *Ctx)); + int totalVGPR = getTotalNumVGPRs(isGFX90A(*MSTI), AgprIndexUnusedMin, + VgprIndexUnusedMin); + Sym->setVariableValue(MCConstantExpr::create(totalVGPR, *Ctx)); + } + } + } + + void usesAgprAt(int i) { + // Instruction will error in AMDGPUAsmParser::MatchAndEmitInstruction + if (!hasMAIInsts(*MSTI)) + return; + + if (i >= AgprIndexUnusedMin) { + AgprIndexUnusedMin = ++i; + if (Ctx) { + MCSymbol* const Sym = + Ctx->getOrCreateSymbol(Twine(".kernel.agpr_count")); + Sym->setVariableValue(MCConstantExpr::create(AgprIndexUnusedMin, *Ctx)); + + // Also update vgpr_count (dependent on agpr_count for gfx908/gfx90a) + MCSymbol* const vSym = + Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count")); + int totalVGPR = getTotalNumVGPRs(isGFX90A(*MSTI), AgprIndexUnusedMin, + VgprIndexUnusedMin); + vSym->setVariableValue(MCConstantExpr::create(totalVGPR, *Ctx)); } } } @@ -1152,16 +1207,29 @@ public: void initialize(MCContext &Context) { Ctx = &Context; + MSTI = Ctx->getSubtargetInfo(); + usesSgprAt(SgprIndexUnusedMin = -1); usesVgprAt(VgprIndexUnusedMin = -1); + if (hasMAIInsts(*MSTI)) { + usesAgprAt(AgprIndexUnusedMin = -1); + } } - void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) { + void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, + unsigned RegWidth) { switch (RegKind) { - case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break; - case IS_AGPR: // fall through - case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break; - default: break; + case IS_SGPR: + usesSgprAt(DwordRegIndex + divideCeil(RegWidth, 32) - 1); + break; + case IS_AGPR: + usesAgprAt(DwordRegIndex + divideCeil(RegWidth, 32) - 1); + break; + case IS_VGPR: + usesVgprAt(DwordRegIndex + divideCeil(RegWidth, 32) - 1); + break; + default: + break; } } }; @@ -1353,10 +1421,15 @@ public: return AMDGPU::isGFX9(getSTI()); } + // TODO: isGFX90A is also true for GFX940. We need to clean it. 
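The new AGPR tracking matters because, starting with gfx90a, AGPRs and VGPRs are allocated out of a unified register file, whereas gfx908 keeps separate files. Assuming getTotalNumVGPRs combines the two counts as "VGPRs rounded up to a multiple of 4, plus AGPRs" in the unified case and as the maximum of the two otherwise (a hedged reading; see AMDGPUBaseInfo for the authoritative definition), a worked sketch:

#include <algorithm>
#include <cassert>
#include <cstdint>

static uint32_t alignUp(uint32_t V, uint32_t A) { return (V + A - 1) / A * A; }

// Assumed combination rule behind the .kernel.vgpr_count updates above.
static uint32_t totalNumVGPRs(bool IsGFX90A, uint32_t NumAGPR, uint32_t NumVGPR) {
  if (IsGFX90A && NumAGPR)
    return alignUp(NumVGPR, 4) + NumAGPR; // unified file: VGPR block padded to 4
  return std::max(NumVGPR, NumAGPR);      // separate files: the larger governs
}

int main() {
  assert(totalNumVGPRs(true, 8, 6) == 16); // 6 VGPRs pad to 8, plus 8 AGPRs
  assert(totalNumVGPRs(false, 8, 6) == 8); // gfx908: files are independent
  return 0;
}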
bool isGFX90A() const { return AMDGPU::isGFX90A(getSTI()); } + bool isGFX940() const { + return AMDGPU::isGFX940(getSTI()); + } + bool isGFX9Plus() const { return AMDGPU::isGFX9Plus(getSTI()); } @@ -1367,6 +1440,14 @@ public: bool isGFX10Plus() const { return AMDGPU::isGFX10Plus(getSTI()); } + bool isGFX11() const { + return AMDGPU::isGFX11(getSTI()); + } + + bool isGFX11Plus() const { + return AMDGPU::isGFX11Plus(getSTI()); + } + bool isGFX10_BEncoding() const { return AMDGPU::isGFX10_BEncoding(getSTI()); } @@ -1496,6 +1577,14 @@ public: bool parseCnt(int64_t &IntVal); OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); + + bool parseDepCtr(int64_t &IntVal, unsigned &Mask); + void depCtrError(SMLoc Loc, int ErrorId, StringRef DepCtrName); + OperandMatchResultTy parseDepCtrOps(OperandVector &Operands); + + bool parseDelay(int64_t &Delay); + OperandMatchResultTy parseSDelayAluOps(OperandVector &Operands); + OperandMatchResultTy parseHwreg(OperandVector &Operands); private: @@ -1522,6 +1611,7 @@ private: SMLoc getFlatOffsetLoc(const OperandVector &Operands) const; SMLoc getSMEMOffsetLoc(const OperandVector &Operands) const; + SMLoc getBLGPLoc(const OperandVector &Operands) const; SMLoc getOperandLoc(std::function<bool(const AMDGPUOperand&)> Test, const OperandVector &Operands) const; @@ -1540,7 +1630,7 @@ private: bool validateMIMGAtomicDMask(const MCInst &Inst); bool validateMIMGGatherDMask(const MCInst &Inst); bool validateMovrels(const MCInst &Inst, const OperandVector &Operands); - bool validateMIMGDataSize(const MCInst &Inst); + Optional<StringRef> validateMIMGDataSize(const MCInst &Inst); bool validateMIMGAddrSize(const MCInst &Inst); bool validateMIMGD16(const MCInst &Inst); bool validateMIMGDim(const MCInst &Inst); @@ -1553,10 +1643,14 @@ private: bool validateMFMA(const MCInst &Inst, const OperandVector &Operands); bool validateAGPRLdSt(const MCInst &Inst) const; bool validateVGPRAlign(const MCInst &Inst) const; + bool validateBLGP(const MCInst &Inst, const OperandVector &Operands); bool validateGWS(const MCInst &Inst, const OperandVector &Operands); bool validateDivScale(const MCInst &Inst); bool validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands, const SMLoc &IDLoc); + bool validateFlatLdsDMA(const MCInst &Inst, const OperandVector &Operands, + const SMLoc &IDLoc); + bool validateExeczVcczOperands(const OperandVector &Operands); Optional<StringRef> validateLdsDirect(const MCInst &Inst); unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); @@ -1586,7 +1680,7 @@ private: bool parseExpr(int64_t &Imm, StringRef Expected = ""); bool parseExpr(OperandVector &Operands); StringRef getTokenStr() const; - AsmToken peekToken(); + AsmToken peekToken(bool ShouldSkipSpace = true); AsmToken getToken() const; SMLoc getLoc() const; void lex(); @@ -1644,10 +1738,12 @@ public: void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); + void cvtVOPD(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx); void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands); + void cvtVINTERP(MCInst &Inst, const OperandVector &Operands); void cvtMIMG(MCInst &Inst, const OperandVector &Operands, bool IsAtomic = false); @@ -1668,7 +1764,24 @@ public: AMDGPUOperand::Ptr defaultBoundCtrl() const; AMDGPUOperand::Ptr 
defaultFI() const; void cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8 = false); - void cvtDPP8(MCInst &Inst, const OperandVector &Operands) { cvtDPP(Inst, Operands, true); } + void cvtDPP8(MCInst &Inst, const OperandVector &Operands) { + cvtDPP(Inst, Operands, true); + } + void cvtVOPCNoDstDPP(MCInst &Inst, const OperandVector &Operands, + bool IsDPP8 = false); + void cvtVOPCNoDstDPP8(MCInst &Inst, const OperandVector &Operands) { + cvtVOPCNoDstDPP(Inst, Operands, true); + } + void cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, + bool IsDPP8 = false); + void cvtVOP3DPP8(MCInst &Inst, const OperandVector &Operands) { + cvtVOP3DPP(Inst, Operands, true); + } + void cvtVOPC64NoDstDPP(MCInst &Inst, const OperandVector &Operands, + bool IsDPP8 = false); + void cvtVOPC64NoDstDPP8(MCInst &Inst, const OperandVector &Operands) { + cvtVOPC64NoDstDPP(Inst, Operands, true); + } OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix, AMDGPUOperand::ImmTy Type); @@ -1689,6 +1802,10 @@ public: OperandMatchResultTy parseEndpgmOp(OperandVector &Operands); AMDGPUOperand::Ptr defaultEndpgmImmOperands() const; + + AMDGPUOperand::Ptr defaultWaitVDST() const; + AMDGPUOperand::Ptr defaultWaitEXP() const; + OperandMatchResultTy parseVOPD(OperandVector &Operands); }; struct OptionalOperand { @@ -1897,7 +2014,7 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { // We allow fp literals with f16x2 operands assuming that the specified // literal goes into the lower half and the upper half is zero. We also - // require that the literal may be losslesly converted to f16. + // require that the literal may be losslessly converted to f16. MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 : (type == MVT::v2i16)? MVT::i16 : (type == MVT::v2f32)? 
MVT::f32 : type; @@ -2211,52 +2328,86 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { if (Is == IS_VGPR) { switch (RegWidth) { default: return -1; - case 1: return AMDGPU::VGPR_32RegClassID; - case 2: return AMDGPU::VReg_64RegClassID; - case 3: return AMDGPU::VReg_96RegClassID; - case 4: return AMDGPU::VReg_128RegClassID; - case 5: return AMDGPU::VReg_160RegClassID; - case 6: return AMDGPU::VReg_192RegClassID; - case 7: return AMDGPU::VReg_224RegClassID; - case 8: return AMDGPU::VReg_256RegClassID; - case 16: return AMDGPU::VReg_512RegClassID; - case 32: return AMDGPU::VReg_1024RegClassID; + case 32: + return AMDGPU::VGPR_32RegClassID; + case 64: + return AMDGPU::VReg_64RegClassID; + case 96: + return AMDGPU::VReg_96RegClassID; + case 128: + return AMDGPU::VReg_128RegClassID; + case 160: + return AMDGPU::VReg_160RegClassID; + case 192: + return AMDGPU::VReg_192RegClassID; + case 224: + return AMDGPU::VReg_224RegClassID; + case 256: + return AMDGPU::VReg_256RegClassID; + case 512: + return AMDGPU::VReg_512RegClassID; + case 1024: + return AMDGPU::VReg_1024RegClassID; } } else if (Is == IS_TTMP) { switch (RegWidth) { default: return -1; - case 1: return AMDGPU::TTMP_32RegClassID; - case 2: return AMDGPU::TTMP_64RegClassID; - case 4: return AMDGPU::TTMP_128RegClassID; - case 8: return AMDGPU::TTMP_256RegClassID; - case 16: return AMDGPU::TTMP_512RegClassID; + case 32: + return AMDGPU::TTMP_32RegClassID; + case 64: + return AMDGPU::TTMP_64RegClassID; + case 128: + return AMDGPU::TTMP_128RegClassID; + case 256: + return AMDGPU::TTMP_256RegClassID; + case 512: + return AMDGPU::TTMP_512RegClassID; } } else if (Is == IS_SGPR) { switch (RegWidth) { default: return -1; - case 1: return AMDGPU::SGPR_32RegClassID; - case 2: return AMDGPU::SGPR_64RegClassID; - case 3: return AMDGPU::SGPR_96RegClassID; - case 4: return AMDGPU::SGPR_128RegClassID; - case 5: return AMDGPU::SGPR_160RegClassID; - case 6: return AMDGPU::SGPR_192RegClassID; - case 7: return AMDGPU::SGPR_224RegClassID; - case 8: return AMDGPU::SGPR_256RegClassID; - case 16: return AMDGPU::SGPR_512RegClassID; + case 32: + return AMDGPU::SGPR_32RegClassID; + case 64: + return AMDGPU::SGPR_64RegClassID; + case 96: + return AMDGPU::SGPR_96RegClassID; + case 128: + return AMDGPU::SGPR_128RegClassID; + case 160: + return AMDGPU::SGPR_160RegClassID; + case 192: + return AMDGPU::SGPR_192RegClassID; + case 224: + return AMDGPU::SGPR_224RegClassID; + case 256: + return AMDGPU::SGPR_256RegClassID; + case 512: + return AMDGPU::SGPR_512RegClassID; } } else if (Is == IS_AGPR) { switch (RegWidth) { default: return -1; - case 1: return AMDGPU::AGPR_32RegClassID; - case 2: return AMDGPU::AReg_64RegClassID; - case 3: return AMDGPU::AReg_96RegClassID; - case 4: return AMDGPU::AReg_128RegClassID; - case 5: return AMDGPU::AReg_160RegClassID; - case 6: return AMDGPU::AReg_192RegClassID; - case 7: return AMDGPU::AReg_224RegClassID; - case 8: return AMDGPU::AReg_256RegClassID; - case 16: return AMDGPU::AReg_512RegClassID; - case 32: return AMDGPU::AReg_1024RegClassID; + case 32: + return AMDGPU::AGPR_32RegClassID; + case 64: + return AMDGPU::AReg_64RegClassID; + case 96: + return AMDGPU::AReg_96RegClassID; + case 128: + return AMDGPU::AReg_128RegClassID; + case 160: + return AMDGPU::AReg_160RegClassID; + case 192: + return AMDGPU::AReg_192RegClassID; + case 224: + return AMDGPU::AReg_224RegClassID; + case 256: + return AMDGPU::AReg_256RegClassID; + case 512: + return AMDGPU::AReg_512RegClassID; + case 1024: + return AMDGPU::AReg_1024RegClassID; } } 
return -1; @@ -2343,32 +2494,32 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, case IS_SPECIAL: if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { Reg = AMDGPU::EXEC; - RegWidth = 2; + RegWidth = 64; return true; } if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) { Reg = AMDGPU::FLAT_SCR; - RegWidth = 2; + RegWidth = 64; return true; } if (Reg == AMDGPU::XNACK_MASK_LO && Reg1 == AMDGPU::XNACK_MASK_HI) { Reg = AMDGPU::XNACK_MASK; - RegWidth = 2; + RegWidth = 64; return true; } if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { Reg = AMDGPU::VCC; - RegWidth = 2; + RegWidth = 64; return true; } if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) { Reg = AMDGPU::TBA; - RegWidth = 2; + RegWidth = 64; return true; } if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) { Reg = AMDGPU::TMA; - RegWidth = 2; + RegWidth = 64; return true; } Error(Loc, "register does not fit in the list"); @@ -2377,11 +2528,11 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, case IS_SGPR: case IS_AGPR: case IS_TTMP: - if (Reg1 != Reg + RegWidth) { + if (Reg1 != Reg + RegWidth / 32) { Error(Loc, "registers in a list must have consecutive indices"); return false; } - RegWidth++; + RegWidth += 32; return true; default: llvm_unreachable("unexpected register kind"); @@ -2470,7 +2621,7 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, if (RegKind == IS_SGPR || RegKind == IS_TTMP) { // SGPR and TTMP registers must be aligned. // Max required alignment is 4 dwords. - AlignSize = std::min(RegWidth, 4u); + AlignSize = std::min(RegWidth / 32, 4u); } if (RegNum % AlignSize != 0) { @@ -2495,8 +2646,7 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, return RC.getRegister(RegIdx); } -bool -AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) { +bool AMDGPUAsmParser::ParseRegRange(unsigned &Num, unsigned &RegWidth) { int64_t RegLo, RegHi; if (!skipToken(AsmToken::LBrac, "missing register index")) return false; @@ -2534,7 +2684,7 @@ AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) { } Num = static_cast<unsigned>(RegLo); - Width = (RegHi - RegLo) + 1; + RegWidth = 32 * ((RegHi - RegLo) + 1); return true; } @@ -2545,7 +2695,7 @@ unsigned AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind, unsigned Reg = getSpecialRegForName(getTokenStr()); if (Reg) { RegNum = 0; - RegWidth = 1; + RegWidth = 32; RegKind = IS_SPECIAL; Tokens.push_back(getToken()); lex(); // skip register name @@ -2577,7 +2727,7 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, Error(Loc, "invalid register index"); return AMDGPU::NoRegister; } - RegWidth = 1; + RegWidth = 32; } else { // Range of registers: v[XX:YY]. ":YY" is optional. 
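+ // For example (illustrative): v[4:7] parses to RegNum = 4 and
+ // RegWidth = 128, i.e. four consecutive 32-bit registers.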
if (!ParseRegRange(RegNum, RegWidth)) @@ -2603,7 +2753,7 @@ unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum, auto Loc = getLoc(); if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) return AMDGPU::NoRegister; - if (RegWidth != 1) { + if (RegWidth != 32) { Error(Loc, "expected a single 32-bit register"); return AMDGPU::NoRegister; } @@ -2618,7 +2768,7 @@ unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum, Tokens)) { return AMDGPU::NoRegister; } - if (NextRegWidth != 1) { + if (NextRegWidth != 32) { Error(Loc, "expected a single 32-bit register"); return AMDGPU::NoRegister; } @@ -2721,7 +2871,7 @@ bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind, return true; MCSymbol *Sym = getContext().getOrCreateSymbol(*SymbolName); - int64_t NewMax = DwordRegIndex + RegWidth - 1; + int64_t NewMax = DwordRegIndex + divideCeil(RegWidth, 32) - 1; int64_t OldCount; if (!Sym->isVariable()) @@ -2761,7 +2911,8 @@ OperandMatchResultTy AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) { // TODO: add syntactic sugar for 1/(2*PI) - assert(!isRegister()); + if (isRegister()) + return MatchOperand_NoMatch; assert(!isModifier()); const auto& Tok = getToken(); @@ -2927,7 +3078,7 @@ AMDGPUAsmParser::isModifier() { // v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF // v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001 // Negative fp literals with preceding "-" are -// handled likewise for unifomtity +// handled likewise for uniformity // bool AMDGPUAsmParser::parseSP3NegModifier() { @@ -3110,7 +3261,8 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { static ArrayRef<unsigned> getAllVariants() { static const unsigned Variants[] = { AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3, - AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, AMDGPUAsmVariants::DPP + AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, + AMDGPUAsmVariants::DPP, AMDGPUAsmVariants::VOP3_DPP }; return makeArrayRef(Variants); @@ -3118,6 +3270,10 @@ static ArrayRef<unsigned> getAllVariants() { // What asm variants we should check ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const { + if (isForcedDPP() && isForcedVOP3()) { + static const unsigned Variants[] = {AMDGPUAsmVariants::VOP3_DPP}; + return makeArrayRef(Variants); + } if (getForcedEncodingSize() == 32) { static const unsigned Variants[] = {AMDGPUAsmVariants::DEFAULT}; return makeArrayRef(Variants); @@ -3143,6 +3299,9 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const { } StringRef AMDGPUAsmParser::getMatchedVariantName() const { + if (isForcedDPP() && isForcedVOP3()) + return "e64_dpp"; + if (getForcedEncodingSize() == 32) return "e32"; @@ -3231,10 +3390,13 @@ unsigned AMDGPUAsmParser::getConstantBusLimit(unsigned Opcode) const { // 64-bit shift instructions can use only one scalar value input case AMDGPU::V_LSHLREV_B64_e64: case AMDGPU::V_LSHLREV_B64_gfx10: + case AMDGPU::V_LSHLREV_B64_e64_gfx11: case AMDGPU::V_LSHRREV_B64_e64: case AMDGPU::V_LSHRREV_B64_gfx10: + case AMDGPU::V_LSHRREV_B64_e64_gfx11: case AMDGPU::V_ASHRREV_I64_e64: case AMDGPU::V_ASHRREV_I64_gfx10: + case AMDGPU::V_ASHRREV_I64_e64_gfx11: case AMDGPU::V_LSHL_B64_e64: case AMDGPU::V_LSHR_B64_e64: case AMDGPU::V_ASHR_I64_e64: @@ -3305,8 +3467,7 @@ AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst, // flat_scratch_lo, flat_scratch_hi // are theoretically valid but they are disabled anyway. 
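+ // For example (illustrative): with a constant bus limit of 1,
+ // 'v_add_f32_e64 v0, s0, s1' is rejected, while 'v_add_f32_e64 v0, s0, s0'
+ // is accepted because a repeated SGPR is counted only once.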
// Note that this code mimics SIInstrInfo::verifyInstruction - if (!SGPRsUsed.count(LastSGPR)) { - SGPRsUsed.insert(LastSGPR); + if (SGPRsUsed.insert(LastSGPR).second) { ++ConstantBusUseCount; } } else { // Expression or a literal @@ -3369,7 +3530,6 @@ AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst, assert(DstIdx != -1); const MCOperand &Dst = Inst.getOperand(DstIdx); assert(Dst.isReg()); - const unsigned DstReg = mc2PseudoReg(Dst.getReg()); const int SrcIndices[] = { Src0Idx, Src1Idx, Src2Idx }; @@ -3377,8 +3537,8 @@ AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst, if (SrcIdx == -1) break; const MCOperand &Src = Inst.getOperand(SrcIdx); if (Src.isReg()) { - const unsigned SrcReg = mc2PseudoReg(Src.getReg()); - if (isRegIntersect(DstReg, SrcReg, TRI)) { + if (TRI->regsOverlap(Dst.getReg(), Src.getReg())) { + const unsigned SrcReg = mc2PseudoReg(Src.getReg()); Error(getRegLoc(SrcReg, Operands), "destination must be different than all sources"); return false; @@ -3403,13 +3563,13 @@ bool AMDGPUAsmParser::validateIntClampSupported(const MCInst &Inst) { return true; } -bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) { +Optional<StringRef> AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) { const unsigned Opc = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opc); if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) - return true; + return None; int VDataIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask); @@ -3418,7 +3578,7 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) { assert(VDataIdx != -1); if (DMaskIdx == -1 || TFEIdx == -1) // intersect_ray - return true; + return None; unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx); unsigned TFESize = (TFEIdx != -1 && Inst.getOperand(TFEIdx).getImm()) ? 1 : 0; @@ -3426,15 +3586,22 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) { if (DMask == 0) DMask = 1; + bool isPackedD16 = false; unsigned DataSize = (Desc.TSFlags & SIInstrFlags::Gather4) ? 4 : countPopulation(DMask); if (hasPackedD16()) { int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16); - if (D16Idx >= 0 && Inst.getOperand(D16Idx).getImm()) + isPackedD16 = D16Idx >= 0; + if (isPackedD16 && Inst.getOperand(D16Idx).getImm()) DataSize = (DataSize + 1) / 2; } - return (VDataSize / 4) == DataSize + TFESize; + if ((VDataSize / 4) == DataSize + TFESize) + return None; + + return StringRef(isPackedD16 + ? 
"image data size does not match dmask, d16 and tfe" + : "image data size does not match dmask and tfe"); } bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) { @@ -3607,7 +3774,7 @@ bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst, auto Reg = mc2PseudoReg(Src0.getReg()); const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - if (isSGPR(Reg, TRI)) { + if (!isGFX90A() && isSGPR(Reg, TRI)) { Error(getRegLoc(Reg, Operands), "source operand must be either a VGPR or an inline constant"); return false; @@ -3641,7 +3808,7 @@ bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst, if (TRI->getRegClass(Desc.OpInfo[0].RegClass).getSizeInBits() <= 128) return true; - if (isRegIntersect(Src2Reg, DstReg, TRI)) { + if (TRI->regsOverlap(Src2Reg, DstReg)) { Error(getRegLoc(mc2PseudoReg(Src2Reg), Operands), "source 2 operand must not partially overlap with dst"); return false; @@ -3861,7 +4028,7 @@ Optional<StringRef> AMDGPUAsmParser::validateLdsDirect(const MCInst &Inst) { const auto &Src = Inst.getOperand(SrcIdx); if (Src.isReg() && Src.getReg() == LDS_DIRECT) { - if (isGFX90A()) + if (isGFX90A() || isGFX11Plus()) return StringRef("lds_direct is not supported on this GPU"); if (IsRevOpcode(Opcode) || (Desc.TSFlags & SIInstrFlags::SDWA)) @@ -4009,6 +4176,20 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { if (OpSel & ~3) return false; } + + if (isGFX940() && (MII.get(Opc).TSFlags & SIInstrFlags::IsDOT)) { + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + if (OpSelIdx != -1) { + if (Inst.getOperand(OpSelIdx).getImm() != 0) + return false; + } + int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi); + if (OpSelHiIdx != -1) { + if (Inst.getOperand(OpSelHiIdx).getImm() != -1) + return false; + } + } + return true; } @@ -4179,6 +4360,47 @@ bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const { return true; } +SMLoc AMDGPUAsmParser::getBLGPLoc(const OperandVector &Operands) const { + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + if (Op.isBLGP()) + return Op.getStartLoc(); + } + return SMLoc(); +} + +bool AMDGPUAsmParser::validateBLGP(const MCInst &Inst, + const OperandVector &Operands) { + unsigned Opc = Inst.getOpcode(); + int BlgpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp); + if (BlgpIdx == -1) + return true; + SMLoc BLGPLoc = getBLGPLoc(Operands); + if (!BLGPLoc.isValid()) + return true; + bool IsNeg = StringRef(BLGPLoc.getPointer()).startswith("neg:"); + auto FB = getFeatureBits(); + bool UsesNeg = false; + if (FB[AMDGPU::FeatureGFX940Insts]) { + switch (Opc) { + case AMDGPU::V_MFMA_F64_16X16X4F64_gfx940_acd: + case AMDGPU::V_MFMA_F64_16X16X4F64_gfx940_vcd: + case AMDGPU::V_MFMA_F64_4X4X4F64_gfx940_acd: + case AMDGPU::V_MFMA_F64_4X4X4F64_gfx940_vcd: + UsesNeg = true; + } + } + + if (IsNeg == UsesNeg) + return true; + + Error(BLGPLoc, + UsesNeg ? "invalid modifier: blgp is not supported" + : "invalid modifier: neg is not supported"); + + return false; +} + // gfx90a has an undocumented limitation: // DS_GWS opcodes must use even aligned registers. 
bool AMDGPUAsmParser::validateGWS(const MCInst &Inst, @@ -4218,13 +4440,19 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, unsigned CPol = Inst.getOperand(CPolPos).getImm(); uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; - if ((TSFlags & (SIInstrFlags::SMRD)) && - (CPol & ~(AMDGPU::CPol::GLC | AMDGPU::CPol::DLC))) { - Error(IDLoc, "invalid cache policy for SMRD instruction"); - return false; + if (TSFlags & SIInstrFlags::SMRD) { + if (CPol && (isSI() || isCI())) { + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); + Error(S, "cache policy is not supported for SMRD instructions"); + return false; + } + if (CPol & ~(AMDGPU::CPol::GLC | AMDGPU::CPol::DLC)) { + Error(IDLoc, "invalid cache policy for SMEM instruction"); + return false; + } } - if (isGFX90A() && (CPol & CPol::SCC)) { + if (isGFX90A() && !isGFX940() && (CPol & CPol::SCC)) { SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); StringRef CStr(S.getPointer()); S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scc")]); @@ -4237,15 +4465,18 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, if (TSFlags & SIInstrFlags::IsAtomicRet) { if (!(TSFlags & SIInstrFlags::MIMG) && !(CPol & CPol::GLC)) { - Error(IDLoc, "instruction must use glc"); + Error(IDLoc, isGFX940() ? "instruction must use sc0" + : "instruction must use glc"); return false; } } else { if (CPol & CPol::GLC) { SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); StringRef CStr(S.getPointer()); - S = SMLoc::getFromPointer(&CStr.data()[CStr.find("glc")]); - Error(S, "instruction must not use glc"); + S = SMLoc::getFromPointer( + &CStr.data()[CStr.find(isGFX940() ? "sc0" : "glc")]); + Error(S, isGFX940() ? "instruction must not use sc0" + : "instruction must not use glc"); return false; } } @@ -4253,6 +4484,47 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, return true; } +bool AMDGPUAsmParser::validateFlatLdsDMA(const MCInst &Inst, + const OperandVector &Operands, + const SMLoc &IDLoc) { + if (isGFX940()) + return true; + + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + if ((TSFlags & (SIInstrFlags::VALU | SIInstrFlags::FLAT)) != + (SIInstrFlags::VALU | SIInstrFlags::FLAT)) + return true; + // This is FLAT LDS DMA. + + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyLDS, Operands); + StringRef CStr(S.getPointer()); + if (!CStr.startswith("lds")) { + // This is an incorrectly selected LDS DMA version of a FLAT load opcode. + // The LDS version should have the 'lds' modifier, but it follows optional + // operands, so its absence is ignored by the matcher.
+ Error(IDLoc, "invalid operands for instruction"); + return false; + } + + return true; +} + +bool AMDGPUAsmParser::validateExeczVcczOperands(const OperandVector &Operands) { + if (!isGFX11Plus()) + return true; + for (auto &Operand : Operands) { + if (!Operand->isReg()) + continue; + unsigned Reg = Operand->getReg(); + if (Reg == SRC_EXECZ || Reg == SRC_VCCZ) { + Error(getRegLoc(Reg, Operands), + "execz and vccz are not supported on this GPU"); + return false; + } + } + return true; +} + bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands) { @@ -4302,9 +4574,8 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, "invalid dim; must be MSAA type"); return false; } - if (!validateMIMGDataSize(Inst)) { - Error(IDLoc, - "image data size does not match dmask and tfe"); + if (auto ErrMsg = validateMIMGDataSize(Inst)) { + Error(IDLoc, *ErrMsg); return false; } if (!validateMIMGAddrSize(Inst)) { @@ -4357,6 +4628,10 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, return false; } + if (!validateBLGP(Inst, Operands)) { + return false; + } + if (!validateDivScale(Inst)) { Error(IDLoc, "ABS not allowed in VOP3B instructions"); return false; @@ -4364,6 +4639,13 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, if (!validateCoherencyBits(Inst, Operands, IDLoc)) { return false; } + if (!validateExeczVcczOperands(Operands)) { + return false; + } + + if (!validateFlatLdsDMA(Inst, Operands, IDLoc)) { + return false; + } return true; } @@ -4606,6 +4888,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { SMRange VGPRRange; uint64_t NextFreeVGPR = 0; uint64_t AccumOffset = 0; + uint64_t SharedVGPRCount = 0; SMRange SGPRRange; uint64_t NextFreeSGPR = 0; @@ -4630,9 +4913,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (ID == ".end_amdhsa_kernel") break; - if (Seen.find(ID) != Seen.end()) + if (!Seen.insert(ID).second) return TokError(".amdhsa_ directives cannot be repeated"); - Seen.insert(ID); SMLoc ValStart = getLoc(); int64_t IVal; @@ -4833,6 +5115,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { return Error(IDRange.Start, "directive requires gfx10+", IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FWD_PROGRESS, Val, ValRange); + } else if (ID == ".amdhsa_shared_vgpr_count") { + if (IVersion.Major < 10) + return Error(IDRange.Start, "directive requires gfx10+", IDRange); + SharedVGPRCount = Val; + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, + COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT, Val, + ValRange); } else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") { PARSE_BITS_ENTRY( KD.compute_pgm_rsrc2, @@ -4922,6 +5211,19 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { (AccumOffset / 4 - 1)); } + if (IVersion.Major == 10) { + // SharedVGPRCount < 16 checked by PARSE_BITS_ENTRY + if (SharedVGPRCount && EnableWavefrontSize32) { + return TokError("shared_vgpr_count directive not valid on " + "wavefront size 32"); + } + if (SharedVGPRCount * 2 + VGPRBlocks > 63) { + return TokError("shared_vgpr_count*2 + " + "compute_pgm_rsrc1.GRANULATED_WORKITEM_VGPR_COUNT cannot " + "exceed 63"); + } + } + getTargetStreamer().EmitAmdhsaKernelDescriptor( getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC, ReserveFlatScr); @@ -5253,8 +5555,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() { return Error(AlignLoc, "alignment is too large"); } - if (parseToken(AsmToken::EndOfStatement, - "unexpected token in '.amdgpu_lds' directive")) + if (parseEOL())
return true; Symbol->redefineIfPossible(); @@ -5313,26 +5614,21 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) { - for (MCRegAliasIterator R(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, &MRI, true); - R.isValid(); ++R) { - if (*R == RegNo) - return isGFX9Plus(); - } + if (MRI.regsOverlap(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, RegNo)) + return isGFX9Plus(); - // GFX10 has 2 more SGPRs 104 and 105. - for (MCRegAliasIterator R(AMDGPU::SGPR104_SGPR105, &MRI, true); - R.isValid(); ++R) { - if (*R == RegNo) - return hasSGPR104_SGPR105(); - } + // GFX10+ has 2 more SGPRs 104 and 105. + if (MRI.regsOverlap(AMDGPU::SGPR104_SGPR105, RegNo)) + return hasSGPR104_SGPR105(); switch (RegNo) { case AMDGPU::SRC_SHARED_BASE: case AMDGPU::SRC_SHARED_LIMIT: case AMDGPU::SRC_PRIVATE_BASE: case AMDGPU::SRC_PRIVATE_LIMIT: - case AMDGPU::SRC_POPS_EXITING_WAVE_ID: return isGFX9Plus(); + case AMDGPU::SRC_POPS_EXITING_WAVE_ID: + return isGFX9Plus() && !isGFX11Plus(); case AMDGPU::TBA: case AMDGPU::TBA_LO: case AMDGPU::TBA_HI: @@ -5355,7 +5651,7 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, if (isSI() || isGFX10Plus()) { // No flat_scr on SI. - // On GFX10 flat scratch is not a valid register operand and can only be + // On GFX10Plus flat scratch is not a valid register operand and can only be // accessed with s_setreg/s_getreg. switch (RegNo) { case AMDGPU::FLAT_SCR: @@ -5369,11 +5665,8 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that // SI/CI have. - for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true); - R.isValid(); ++R) { - if (*R == RegNo) - return hasSGPR102_SGPR103(); - } + if (MRI.regsOverlap(AMDGPU::SGPR102_SGPR103, RegNo)) + return hasSGPR102_SGPR103(); return true; } @@ -5381,8 +5674,13 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, OperandMatchResultTy AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic, OperandMode Mode) { + OperandMatchResultTy ResTy = parseVOPD(Operands); + if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail || + isToken(AsmToken::EndOfStatement)) + return ResTy; + // Try to parse with a custom parser - OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + ResTy = MatchOperandParserImpl(Operands, Mnemonic); // If we successfully parsed the operand or if there as an error parsing, // we are done. @@ -5435,7 +5733,11 @@ StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) { setForcedDPP(false); setForcedSDWA(false); - if (Name.endswith("_e64")) { + if (Name.endswith("_e64_dpp")) { + setForcedDPP(true); + setForcedEncodingSize(64); + return Name.substr(0, Name.size() - 8); + } else if (Name.endswith("_e64")) { setForcedEncodingSize(64); return Name.substr(0, Name.size() - 4); } else if (Name.endswith("_e32")) { @@ -5451,11 +5753,20 @@ StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) { return Name; } +static void applyMnemonicAliases(StringRef &Mnemonic, + const FeatureBitset &Features, + unsigned VariantID); + bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { // Add the instruction mnemonic Name = parseMnemonicSuffix(Name); + + // If the target architecture uses MnemonicAlias, call it here to parse + // operands correctly. 
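+ // applyMnemonicAliases is generated by the AsmMatcher TableGen backend from
+ // MnemonicAlias records; it may rewrite Name in place to the canonical
+ // mnemonic before the custom operand parsers run.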
+ applyMnemonicAliases(Name, getAvailableFeatures(), 0); + Operands.push_back(AMDGPUOperand::CreateToken(this, Name, NameLoc)); bool IsMIMG = Name.startswith("image_"); @@ -5603,7 +5914,24 @@ AMDGPUAsmParser::parseCPol(OperandVector &Operands) { unsigned CPolOff = 0; SMLoc S = getLoc(); - if (trySkipId("glc")) + StringRef Mnemo = ((AMDGPUOperand &)*Operands[0]).getToken(); + if (isGFX940() && !Mnemo.startswith("s_")) { + if (trySkipId("sc0")) + CPolOn = AMDGPU::CPol::SC0; + else if (trySkipId("nosc0")) + CPolOff = AMDGPU::CPol::SC0; + else if (trySkipId("nt")) + CPolOn = AMDGPU::CPol::NT; + else if (trySkipId("nont")) + CPolOff = AMDGPU::CPol::NT; + else if (trySkipId("sc1")) + CPolOn = AMDGPU::CPol::SC1; + else if (trySkipId("nosc1")) + CPolOff = AMDGPU::CPol::SC1; + else + return MatchOperand_NoMatch; + } else if (trySkipId("glc")) CPolOn = AMDGPU::CPol::GLC; else if (trySkipId("noglc")) CPolOff = AMDGPU::CPol::GLC; @@ -5809,7 +6137,7 @@ AMDGPUAsmParser::parseSymbolicSplitFormat(StringRef FormatStr, Nfmt = (Nfmt == NFMT_UNDEF) ? NFMT_DEFAULT : Nfmt; if (isGFX10Plus()) { - auto Ufmt = convertDfmtNfmt2Ufmt(Dfmt, Nfmt); + auto Ufmt = convertDfmtNfmt2Ufmt(Dfmt, Nfmt, getSTI()); if (Ufmt == UFMT_UNDEF) { Error(FormatLoc, "unsupported format"); return MatchOperand_ParseFail; @@ -5828,7 +6156,7 @@ AMDGPUAsmParser::parseSymbolicUnifiedFormat(StringRef FormatStr, int64_t &Format) { using namespace llvm::AMDGPU::MTBUFFormat; - auto Id = getUnifiedFormat(FormatStr); + auto Id = getUnifiedFormat(FormatStr, getSTI()); if (Id == UFMT_UNDEF) return MatchOperand_NoMatch; @@ -5969,6 +6297,7 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands, bool IsGdsHardcoded) { OptionalImmIndexMap OptionalIdx; + AMDGPUOperand::ImmTy OffsetType = AMDGPUOperand::ImmTyOffset; for (unsigned i = 1, e = Operands.size(); i != e; ++i) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); @@ -5986,13 +6315,10 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands, // Handle optional arguments OptionalIdx[Op.getImmTy()] = i; - } - AMDGPUOperand::ImmTy OffsetType = - (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_gfx10 || - Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_gfx6_gfx7 || - Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ?
AMDGPUOperand::ImmTySwizzle : - AMDGPUOperand::ImmTyOffset; + if (Op.getImmTy() == AMDGPUOperand::ImmTySwizzle) + OffsetType = AMDGPUOperand::ImmTySwizzle; + } addOptionalImmOperand(Inst, Operands, OptionalIdx, OffsetType); @@ -6034,7 +6360,7 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) { continue; } - if (Op.isToken() && Op.getToken() == "done") + if (Op.isToken() && (Op.getToken() == "done" || Op.getToken() == "row_en")) continue; // Handle optional arguments @@ -6157,11 +6483,179 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { return MatchOperand_Success; } +bool AMDGPUAsmParser::parseDelay(int64_t &Delay) { + SMLoc FieldLoc = getLoc(); + StringRef FieldName = getTokenStr(); + if (!skipToken(AsmToken::Identifier, "expected a field name") || + !skipToken(AsmToken::LParen, "expected a left parenthesis")) + return false; + + SMLoc ValueLoc = getLoc(); + StringRef ValueName = getTokenStr(); + if (!skipToken(AsmToken::Identifier, "expected a value name") || + !skipToken(AsmToken::RParen, "expected a right parenthesis")) + return false; + + unsigned Shift; + if (FieldName == "instid0") { + Shift = 0; + } else if (FieldName == "instskip") { + Shift = 4; + } else if (FieldName == "instid1") { + Shift = 7; + } else { + Error(FieldLoc, "invalid field name " + FieldName); + return false; + } + + int Value; + if (Shift == 4) { + // Parse values for instskip. + Value = StringSwitch<int>(ValueName) + .Case("SAME", 0) + .Case("NEXT", 1) + .Case("SKIP_1", 2) + .Case("SKIP_2", 3) + .Case("SKIP_3", 4) + .Case("SKIP_4", 5) + .Default(-1); + } else { + // Parse values for instid0 and instid1. + Value = StringSwitch<int>(ValueName) + .Case("NO_DEP", 0) + .Case("VALU_DEP_1", 1) + .Case("VALU_DEP_2", 2) + .Case("VALU_DEP_3", 3) + .Case("VALU_DEP_4", 4) + .Case("TRANS32_DEP_1", 5) + .Case("TRANS32_DEP_2", 6) + .Case("TRANS32_DEP_3", 7) + .Case("FMA_ACCUM_CYCLE_1", 8) + .Case("SALU_CYCLE_1", 9) + .Case("SALU_CYCLE_2", 10) + .Case("SALU_CYCLE_3", 11) + .Default(-1); + } + if (Value < 0) { + Error(ValueLoc, "invalid value name " + ValueName); + return false; + } + + Delay |= Value << Shift; + return true; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseSDelayAluOps(OperandVector &Operands) { + int64_t Delay = 0; + SMLoc S = getLoc(); + + if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) { + do { + if (!parseDelay(Delay)) + return MatchOperand_ParseFail; + } while (trySkipToken(AsmToken::Pipe)); + } else { + if (!parseExpr(Delay)) + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(this, Delay, S)); + return MatchOperand_Success; +} + bool AMDGPUOperand::isSWaitCnt() const { return isImm(); } +bool AMDGPUOperand::isSDelayAlu() const { return isImm(); } + +//===----------------------------------------------------------------------===// +// DepCtr +//===----------------------------------------------------------------------===// + +void AMDGPUAsmParser::depCtrError(SMLoc Loc, int ErrorId, + StringRef DepCtrName) { + switch (ErrorId) { + case OPR_ID_UNKNOWN: + Error(Loc, Twine("invalid counter name ", DepCtrName)); + return; + case OPR_ID_UNSUPPORTED: + Error(Loc, Twine(DepCtrName, " is not supported on this GPU")); + return; + case OPR_ID_DUPLICATE: + Error(Loc, Twine("duplicate counter name ", DepCtrName)); + return; + case OPR_VAL_INVALID: + Error(Loc, Twine("invalid value for ", DepCtrName)); + return; + default: + assert(false); + } +} + +bool AMDGPUAsmParser::parseDepCtr(int64_t &DepCtr, unsigned &UsedOprMask) 
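+// Parse a single 'name(value)' dependency counter spec, e.g.
+// depctr_va_vdst(0), and merge its encoding into DepCtr. Specs may be
+// joined with '&' or ',' (illustrative operand:
+// 'depctr_va_vdst(0) & depctr_sa_sdst(0)'; the valid names come from the
+// AMDGPU::DepCtr tables).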
{ + + using namespace llvm::AMDGPU::DepCtr; + + SMLoc DepCtrLoc = getLoc(); + StringRef DepCtrName = getTokenStr(); + + if (!skipToken(AsmToken::Identifier, "expected a counter name") || + !skipToken(AsmToken::LParen, "expected a left parenthesis")) + return false; + + int64_t ExprVal; + if (!parseExpr(ExprVal)) + return false; + + unsigned PrevOprMask = UsedOprMask; + int CntVal = encodeDepCtr(DepCtrName, ExprVal, UsedOprMask, getSTI()); + + if (CntVal < 0) { + depCtrError(DepCtrLoc, CntVal, DepCtrName); + return false; + } + + if (!skipToken(AsmToken::RParen, "expected a closing parenthesis")) + return false; + + if (trySkipToken(AsmToken::Amp) || trySkipToken(AsmToken::Comma)) { + if (isToken(AsmToken::EndOfStatement)) { + Error(getLoc(), "expected a counter name"); + return false; + } + } + + unsigned CntValMask = PrevOprMask ^ UsedOprMask; + DepCtr = (DepCtr & ~CntValMask) | CntVal; + return true; +} + +OperandMatchResultTy AMDGPUAsmParser::parseDepCtrOps(OperandVector &Operands) { + using namespace llvm::AMDGPU::DepCtr; + + int64_t DepCtr = getDefaultDepCtrEncoding(getSTI()); + SMLoc Loc = getLoc(); + + if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) { + unsigned UsedOprMask = 0; + while (!isToken(AsmToken::EndOfStatement)) { + if (!parseDepCtr(DepCtr, UsedOprMask)) + return MatchOperand_ParseFail; + } + } else { + if (!parseExpr(DepCtr)) + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(this, DepCtr, Loc)); + return MatchOperand_Success; +} + +bool AMDGPUOperand::isDepCtr() const { return isS16Imm(); } + //===----------------------------------------------------------------------===// // hwreg //===----------------------------------------------------------------------===// @@ -6175,7 +6669,7 @@ AMDGPUAsmParser::parseHwregBody(OperandInfoTy &HwReg, // The register may be specified by name or using a numeric code HwReg.Loc = getLoc(); if (isToken(AsmToken::Identifier) && - (HwReg.Id = getHwregId(getTokenStr())) >= 0) { + (HwReg.Id = getHwregId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) { HwReg.IsSymbolic = true; lex(); // skip register name } else if (!parseExpr(HwReg.Id, "a register name")) { @@ -6208,15 +6702,18 @@ AMDGPUAsmParser::validateHwreg(const OperandInfoTy &HwReg, using namespace llvm::AMDGPU::Hwreg; - if (HwReg.IsSymbolic && !isValidHwreg(HwReg.Id, getSTI())) { - Error(HwReg.Loc, - "specified hardware register is not supported on this GPU"); - return false; - } - if (!isValidHwreg(HwReg.Id)) { - Error(HwReg.Loc, - "invalid code of hardware register: only 6-bit values are legal"); - return false; + if (HwReg.IsSymbolic) { + if (HwReg.Id == OPR_ID_UNSUPPORTED) { + Error(HwReg.Loc, + "specified hardware register is not supported on this GPU"); + return false; + } + } else { + if (!isValidHwreg(HwReg.Id)) { + Error(HwReg.Loc, + "invalid code of hardware register: only 6-bit values are legal"); + return false; + } } if (!isValidHwregOffset(Offset.Id)) { Error(Offset.Loc, "invalid bit offset: only 5-bit values are legal"); @@ -6238,7 +6735,7 @@ AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { SMLoc Loc = getLoc(); if (trySkipId("hwreg", AsmToken::LParen)) { - OperandInfoTy HwReg(ID_UNKNOWN_); + OperandInfoTy HwReg(OPR_ID_UNKNOWN); OperandInfoTy Offset(OFFSET_DEFAULT_); OperandInfoTy Width(WIDTH_DEFAULT_); if (parseHwregBody(HwReg, Offset, Width) && @@ -6275,7 +6772,8 @@ AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg, using namespace llvm::AMDGPU::SendMsg; Msg.Loc = getLoc(); - if 
(isToken(AsmToken::Identifier) && (Msg.Id = getMsgId(getTokenStr())) >= 0) { + if (isToken(AsmToken::Identifier) && + (Msg.Id = getMsgId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) { Msg.IsSymbolic = true; lex(); // skip message name } else if (!parseExpr(Msg.Id, "a message name")) { @@ -6310,15 +6808,22 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg, using namespace llvm::AMDGPU::SendMsg; // Validation strictness depends on whether message is specified - // in a symbolc or in a numeric form. In the latter case + // in a symbolic or in a numeric form. In the latter case // only encoding possibility is checked. bool Strict = Msg.IsSymbolic; - if (!isValidMsgId(Msg.Id, getSTI(), Strict)) { - Error(Msg.Loc, "invalid message id"); - return false; + if (Strict) { + if (Msg.Id == OPR_ID_UNSUPPORTED) { + Error(Msg.Loc, "specified message id is not supported on this GPU"); + return false; + } + } else { + if (!isValidMsgId(Msg.Id, getSTI())) { + Error(Msg.Loc, "invalid message id"); + return false; + } } - if (Strict && (msgRequiresOp(Msg.Id) != Op.IsDefined)) { + if (Strict && (msgRequiresOp(Msg.Id, getSTI()) != Op.IsDefined)) { if (Op.IsDefined) { Error(Op.Loc, "message does not support operations"); } else { @@ -6330,7 +6835,8 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg, Error(Op.Loc, "invalid operation id"); return false; } - if (Strict && !msgSupportsStream(Msg.Id, Op.Id) && Stream.IsDefined) { + if (Strict && !msgSupportsStream(Msg.Id, Op.Id, getSTI()) && + Stream.IsDefined) { Error(Stream.Loc, "message operation does not support streams"); return false; } @@ -6349,7 +6855,7 @@ AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) { SMLoc Loc = getLoc(); if (trySkipId("sendmsg", AsmToken::LParen)) { - OperandInfoTy Msg(ID_UNKNOWN_); + OperandInfoTy Msg(OPR_ID_UNKNOWN); OperandInfoTy Op(OP_NONE_); OperandInfoTy Stream(STREAM_ID_NONE_); if (parseSendMsgBody(Msg, Op, Stream) && @@ -6610,9 +7116,10 @@ AMDGPUAsmParser::getToken() const { return Parser.getTok(); } -AsmToken -AMDGPUAsmParser::peekToken() { - return isToken(AsmToken::EndOfStatement) ? getToken() : getLexer().peekTok(); +AsmToken AMDGPUAsmParser::peekToken(bool ShouldSkipSpace) { + return isToken(AsmToken::EndOfStatement) + ? getToken() + : getLexer().peekTok(ShouldSkipSpace); } void @@ -7078,8 +7585,6 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsLds) { - bool IsLdsOpcode = IsLds; - bool HasLdsModifier = false; OptionalImmIndexMap OptionalIdx; unsigned FirstOperandIdx = 1; bool IsAtomicReturn = false; @@ -7123,8 +7628,6 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, continue; } - HasLdsModifier |= Op.isLDS(); - // Handle tokens like 'offen' which are sometimes hard-coded into the // asm string. There are no MCInst operands for these. if (Op.isToken()) { @@ -7136,25 +7639,10 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, OptionalIdx[Op.getImmTy()] = i; } - // This is a workaround for an llvm quirk which may result in an - // incorrect instruction selection. Lds and non-lds versions of - // MUBUF instructions are identical except that lds versions - // have mandatory 'lds' modifier. However this modifier follows - // optional modifiers and llvm asm matcher regards this 'lds' - // modifier as an optional one. As a result, an lds version - // of opcode may be selected even if it has no 'lds' modifier. 
- if (IsLdsOpcode && !HasLdsModifier) { - int NoLdsOpcode = AMDGPU::getMUBUFNoLdsInst(Inst.getOpcode()); - if (NoLdsOpcode != -1) { // Got lds version - correct it. - Inst.setOpcode(NoLdsOpcode); - IsLdsOpcode = false; - } - } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0); - if (!IsLdsOpcode) { // tfe is not legal with lds opcodes + if (!IsLds) { // tfe is not legal with lds opcodes addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ); @@ -7327,7 +7815,8 @@ bool AMDGPUOperand::isSMRDOffset8() const { } bool AMDGPUOperand::isSMEMOffset() const { - return isImm(); // Offset range is checked later by validator. + return isImmTy(ImmTyNone) || + isImmTy(ImmTyOffset); // Offset range is checked later by validator. } bool AMDGPUOperand::isSMRDLiteralOffset() const { @@ -7415,10 +7904,6 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr}, {"dim", AMDGPUOperand::ImmTyDim, false, nullptr}, - {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr}, - {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr}, - {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl}, - {"fi", AMDGPUOperand::ImmTyDppFi, false, nullptr}, {"dst_sel", AMDGPUOperand::ImmTySdwaDstSel, false, nullptr}, {"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr}, {"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr}, @@ -7429,9 +7914,17 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr}, {"neg_lo", AMDGPUOperand::ImmTyNegLo, false, nullptr}, {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr}, + {"dpp8", AMDGPUOperand::ImmTyDPP8, false, nullptr}, + {"dpp_ctrl", AMDGPUOperand::ImmTyDppCtrl, false, nullptr}, + {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr}, + {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr}, + {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl}, + {"fi", AMDGPUOperand::ImmTyDppFi, false, nullptr}, {"blgp", AMDGPUOperand::ImmTyBLGP, false, nullptr}, {"cbsz", AMDGPUOperand::ImmTyCBSZ, false, nullptr}, - {"abid", AMDGPUOperand::ImmTyABID, false, nullptr} + {"abid", AMDGPUOperand::ImmTyABID, false, nullptr}, + {"wait_vdst", AMDGPUOperand::ImmTyWaitVDST, false, nullptr}, + {"wait_exp", AMDGPUOperand::ImmTyWaitEXP, false, nullptr} }; void AMDGPUAsmParser::onBeginOfFile() { @@ -7497,8 +7990,17 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands) res = parseDim(Operands); } else if (Op.Type == AMDGPUOperand::ImmTyCPol) { res = parseCPol(Operands); + } else if (Op.Type == AMDGPUOperand::ImmTyDPP8) { + res = parseDPP8(Operands); + } else if (Op.Type == AMDGPUOperand::ImmTyDppCtrl) { + res = parseDPPCtrl(Operands); } else { res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); + if (Op.Type == AMDGPUOperand::ImmTyBLGP && res == MatchOperand_NoMatch) { + res = parseOperandArrayWithPrefix("neg", Operands, + AMDGPUOperand::ImmTyBLGP, + nullptr); + } } if (res != MatchOperand_NoMatch) { return res; @@ -7596,6 +8098,66 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands) } } +void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands) +{ + 
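+ // Fold op_sel bits into the per-source modifier operands below. An
+ // illustrative gfx11 VINTERP instruction handled here:
+ //   v_interp_p10_f32 v0, v1, v2, v3 wait_exp:7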
OptionalImmIndexMap OptionalIdx; + unsigned Opc = Inst.getOpcode(); + + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } + + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + } else if (Op.isImmModifier()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + llvm_unreachable("unhandled operand type"); + } + } + + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); + + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + if (OpSelIdx != -1) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOpSel); + + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyWaitEXP); + + if (OpSelIdx == -1) + return; + + const int Ops[] = { AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + AMDGPU::OpName::src2 }; + const int ModOps[] = { AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers }; + + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + + for (int J = 0; J < 3; ++J) { + int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); + if (OpIdx == -1) + break; + + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + uint32_t ModVal = Inst.getOperand(ModIdx).getImm(); + + if ((OpSel & (1 << J)) != 0) + ModVal |= SISrcMods::OP_SEL_0; + if (ModOps[J] == AMDGPU::OpName::src0_modifiers && + (OpSel & (1 << 3)) != 0) + ModVal |= SISrcMods::DST_OP_SEL; + + Inst.getOperand(ModIdx).setImm(ModVal); + } +} + void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx) { unsigned Opc = Inst.getOpcode(); @@ -7652,9 +8214,12 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, Opc == AMDGPU::V_MAC_F16_e64_vi || Opc == AMDGPU::V_FMAC_F64_e64_gfx90a || Opc == AMDGPU::V_FMAC_F32_e64_gfx10 || + Opc == AMDGPU::V_FMAC_F32_e64_gfx11 || Opc == AMDGPU::V_FMAC_F32_e64_vi || Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 || - Opc == AMDGPU::V_FMAC_F16_e64_gfx10) { + Opc == AMDGPU::V_FMAC_DX9_ZERO_F32_e64_gfx11 || + Opc == AMDGPU::V_FMAC_F16_e64_gfx10 || + Opc == AMDGPU::V_FMAC_F16_e64_gfx11) { auto it = Inst.begin(); std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers)); it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2 @@ -7731,6 +8296,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, if (OpIdx == -1) break; + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + + if (ModIdx == -1) + continue; + uint32_t ModVal = 0; if ((OpSel & (1 << J)) != 0) @@ -7745,8 +8315,6 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, if ((NegHi & (1 << J)) != 0) ModVal |= SISrcMods::NEG_HI; - int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); - Inst.getOperand(ModIdx).setImm(Inst.getOperand(ModIdx).getImm() | ModVal); } } @@ -7758,6 +8326,118 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { } //===----------------------------------------------------------------------===// +// VOPD +//===----------------------------------------------------------------------===// + +OperandMatchResultTy AMDGPUAsmParser::parseVOPD(OperandVector &Operands) { + if (!hasVOPD(getSTI())) + return MatchOperand_NoMatch; + + 
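+ // A VOPD instruction packs two VALU halves separated by '::' in the
+ // assembly, e.g. (illustrative):
+ //   v_dual_fmac_f32 v0, v1, v2 :: v_dual_mov_b32 v3, v4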
if (isToken(AsmToken::Colon) && peekToken(false).is(AsmToken::Colon)) { + SMLoc S = getLoc(); + lex(); + lex(); + Operands.push_back(AMDGPUOperand::CreateToken(this, "::", S)); + const MCExpr *Expr; + if (isToken(AsmToken::Identifier) && !Parser.parseExpression(Expr)) { + Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); + return MatchOperand_Success; + } + Error(S, "invalid VOPD :: usage"); + return MatchOperand_ParseFail; + } + return MatchOperand_NoMatch; +} + +// Create VOPD MCInst operands using parsed assembler operands. +// Parsed VOPD operands are ordered as follows: +// OpXMnemo dstX src0X [vsrc1X|imm vsrc1X|vsrc1X imm] '::' +// OpYMnemo dstY src0Y [vsrc1Y|imm vsrc1Y|vsrc1Y imm] +// If both OpX and OpY have an imm, the first imm has a different name: +// OpXMnemo dstX src0X [vsrc1X|immDeferred vsrc1X|vsrc1X immDeferred] '::' +// OpYMnemo dstY src0Y [vsrc1Y|imm vsrc1Y|vsrc1Y imm] +// MCInst operands have the following order: +// dstX, dstY, src0X [, other OpX operands], src0Y [, other OpY operands] +void AMDGPUAsmParser::cvtVOPD(MCInst &Inst, const OperandVector &Operands) { + auto addOp = [&](uint16_t i) { // NOLINT:function pointer + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + return; + } + if (Op.isImm()) { + Op.addImmOperands(Inst, 1); + return; + } + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. + if (Op.isToken()) { + return; + } + llvm_unreachable("Unhandled operand type in cvtVOPD"); + }; + + // Indices into MCInst.Operands + const auto FmamkOpXImmMCIndex = 3; // dstX, dstY, src0X, imm, ... + const auto FmaakOpXImmMCIndex = 4; // dstX, dstY, src0X, src1X, imm, ... + const auto MinOpYImmMCIndex = 4; // dstX, dstY, src0X, src0Y, imm, ... 
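+ // E.g. (illustrative) for 'v_dual_fmamk_f32 v0, v1, 0x3f800000, v2 ::
+ // v_dual_mov_b32 v3, v4' the OpX literal lands at MC index 3
+ // (FmamkOpXImmMCIndex).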
+ + unsigned Opc = Inst.getOpcode(); + bool HasVsrc1X = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vsrc1X) != -1; + bool HasImmX = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::immDeferred) != -1 || + (HasVsrc1X && (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::imm) == + FmamkOpXImmMCIndex || + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::imm) == + FmaakOpXImmMCIndex)); + + bool HasVsrc1Y = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vsrc1Y) != -1; + bool HasImmY = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::immDeferred) != -1 || + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::imm) >= + MinOpYImmMCIndex + HasVsrc1X; + + // Indices of parsed operands relative to dst + const auto DstIdx = 0; + const auto Src0Idx = 1; + const auto Vsrc1OrImmIdx = 2; + + const auto OpXOperandsSize = 2 + HasImmX + HasVsrc1X; + const auto BridgeTokensSize = 2; // Special VOPD tokens ('::' and OpYMnemo) + + // Offsets into parsed operands + const auto OpXFirstOperandOffset = 1; + const auto OpYFirstOperandOffset = + OpXFirstOperandOffset + OpXOperandsSize + BridgeTokensSize; + + // Order of addOp calls determines MC operand order + addOp(OpXFirstOperandOffset + DstIdx); // vdstX + addOp(OpYFirstOperandOffset + DstIdx); // vdstY + + addOp(OpXFirstOperandOffset + Src0Idx); // src0X + if (HasImmX) { + // immX then vsrc1X for fmamk, vsrc1X then immX for fmaak + addOp(OpXFirstOperandOffset + Vsrc1OrImmIdx); + addOp(OpXFirstOperandOffset + Vsrc1OrImmIdx + 1); + } else { + if (HasVsrc1X) // all except v_mov + addOp(OpXFirstOperandOffset + Vsrc1OrImmIdx); // vsrc1X + } + + addOp(OpYFirstOperandOffset + Src0Idx); // src0Y + if (HasImmY) { + // immY then vsrc1Y for fmamk, vsrc1Y then immY for fmaak + addOp(OpYFirstOperandOffset + Vsrc1OrImmIdx); + addOp(OpYFirstOperandOffset + Vsrc1OrImmIdx + 1); + } else { + if (HasVsrc1Y) // all except v_mov + addOp(OpYFirstOperandOffset + Vsrc1OrImmIdx); // vsrc1Y + } +} + +//===----------------------------------------------------------------------===// // dpp //===----------------------------------------------------------------------===// @@ -8067,6 +8747,88 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppFi); } +// Add dummy $old operand +void AMDGPUAsmParser::cvtVOPC64NoDstDPP(MCInst &Inst, + const OperandVector &Operands, + bool IsDPP8) { + Inst.addOperand(MCOperand::createReg(0)); + cvtVOP3DPP(Inst, Operands, IsDPP8); +} + +void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) { + OptionalImmIndexMap OptionalIdx; + unsigned Opc = Inst.getOpcode(); + bool HasModifiers = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1; + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } + + int Fi = 0; + for (unsigned E = Operands.size(); I != E; ++I) { + auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(), + MCOI::TIED_TO); + if (TiedTo != -1) { + assert((unsigned)TiedTo < Inst.getNumOperands()); + // handle tied old or src2 for MAC instructions + Inst.addOperand(Inst.getOperand(TiedTo)); + } + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + // Add the register arguments + if (IsDPP8 && Op.isFI()) { + Fi = Op.getImm(); + } else if (HasModifiers && + isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + } else if 
(Op.isReg()) { + Op.addRegOperands(Inst, 1); + } else if (Op.isImm() && + Desc.OpInfo[Inst.getNumOperands()].RegClass != -1) { + assert(!HasModifiers && "Case should be unreachable with modifiers"); + assert(!Op.IsImmKindLiteral() && "Cannot use literal with DPP"); + Op.addImmOperands(Inst, 1); + } else if (Op.isImm()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + llvm_unreachable("unhandled operand type"); + } + } + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); + } + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); + } + if (Desc.TSFlags & SIInstrFlags::VOP3P) + cvtVOP3P(Inst, Operands, OptionalIdx); + else if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOpSel); + } + + if (IsDPP8) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDPP8); + using namespace llvm::AMDGPU::DPP; + Inst.addOperand(MCOperand::createImm(Fi? DPP8_FI_1 : DPP8_FI_0)); + } else { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppCtrl, 0xe4); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::fi) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppFi); + } + } +} + +// Add dummy $old operand +void AMDGPUAsmParser::cvtVOPCNoDstDPP(MCInst &Inst, + const OperandVector &Operands, + bool IsDPP8) { + Inst.addOperand(MCOperand::createReg(0)); + cvtDPP(Inst, Operands, IsDPP8); +} + void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) { OptionalImmIndexMap OptionalIdx; @@ -8352,7 +9114,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmParser() { #define GET_MNEMONIC_CHECKER #include "AMDGPUGenAsmMatcher.inc" -// This fuction should be defined after auto-generated include so that we have +// This function should be defined after auto-generated include so that we have // MatchClassKind enum defined unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) { @@ -8431,3 +9193,27 @@ OperandMatchResultTy AMDGPUAsmParser::parseEndpgmOp(OperandVector &Operands) { } bool AMDGPUOperand::isEndpgm() const { return isImmTy(ImmTyEndpgm); } + +//===----------------------------------------------------------------------===// +// LDSDIR +//===----------------------------------------------------------------------===// + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultWaitVDST() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyWaitVDST); +} + +bool AMDGPUOperand::isWaitVDST() const { + return isImmTy(ImmTyWaitVDST) && isUInt<4>(getImm()); +} + +//===----------------------------------------------------------------------===// +// VINTERP +//===----------------------------------------------------------------------===// + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultWaitEXP() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyWaitEXP); +} + +bool AMDGPUOperand::isWaitEXP() const { + return isImmTy(ImmTyWaitEXP) && isUInt<3>(getImm()); +} diff --git 
a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index a535c8cc0918..a087323e5de7 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -35,11 +35,6 @@ class MUBUFAddr64Table <bit is_addr64, string Name> { string OpName = Name; } -class MUBUFLdsTable <bit is_lds, string Name> { - bit IsLds = is_lds; - string OpName = Name; -} - class MTBUFAddr64Table <bit is_addr64, string Name> { bit IsAddr64 = is_addr64; string OpName = Name; @@ -100,8 +95,8 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, bits<1> sccb_value = 0; } -class MTBUF_Real <MTBUF_Pseudo ps> : - InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> { +class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> : + InstSI <ps.OutOperandList, ps.InOperandList, real_name # ps.AsmOperands, []> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -136,7 +131,7 @@ class MTBUF_Real <MTBUF_Pseudo ps> : bits<3> nfmt = format{6-4}; // GFX90A+ only: instruction uses AccVGPR for data - // Bit superceedes tfe. + // Bit supersedes tfe. bits<1> acc = !if(ps.has_vdata, vdata{9}, 0); } @@ -320,7 +315,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins, bits<1> idxen = 0; bits<1> addr64 = 0; bits<1> lds = 0; - bits<1> has_vdata = 1; + bits<1> has_vdata = !not(lds); bits<1> has_vaddr = 1; bits<1> has_glc = 1; bits<1> has_dlc = 1; @@ -337,8 +332,8 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins, bits<1> IsBufferInv = 0; } -class MUBUF_Real <MUBUF_Pseudo ps> : - InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> { +class MUBUF_Real <MUBUF_Pseudo ps, string real_name = ps.Mnemonic> : + InstSI <ps.OutOperandList, ps.InOperandList, real_name # ps.AsmOperands, []> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -360,6 +355,8 @@ class MUBUF_Real <MUBUF_Pseudo ps> : let mayStore = ps.mayStore; let IsAtomicRet = ps.IsAtomicRet; let IsAtomicNoRet = ps.IsAtomicNoRet; + let VALU = ps.VALU; + let LGKM_CNT = ps.LGKM_CNT; bits<12> offset; bits<5> cpol; @@ -370,8 +367,8 @@ class MUBUF_Real <MUBUF_Pseudo ps> : bits<8> soffset; // GFX90A+ only: instruction uses AccVGPR for data - // Bit superceedes tfe. - bits<1> acc = !if(ps.has_vdata, vdata{9}, 0); + // Bit supersedes tfe. 
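+  // For lds opcodes there is no vdata operand to take this bit from, so the
+  // field is left undefined rather than forced to 0.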
+ bits<1> acc = !if(ps.has_vdata, vdata{9}, !if(ps.lds, ?, 0)); } @@ -486,16 +483,17 @@ class MUBUF_Load_Pseudo <string opName, ValueType vdata_vt, bit HasTiedDest = 0, bit isLds = 0, + bit isLdsOpc = 0, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, RegisterClass vdata_rc = getVregSrcForVT<vdata_vt>.ret, RegisterOperand vdata_op = getLdStRegisterOperand<vdata_rc>.ret> : MUBUF_Pseudo<opName, - (outs vdata_op:$vdata), + !if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)), !con(getMUBUFIns<addrKindCopy, [], isLds>.ret, !if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))), - " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol" # + !if(!or(isLds, isLdsOpc), " ", " $vdata, ") # getMUBUFAsmOps<addrKindCopy>.ret # "$cpol" # !if(isLds, " lds", "$tfe") # "$swz", pattern>, MUBUF_SetupAddr<addrKindCopy> { @@ -504,13 +502,16 @@ class MUBUF_Load_Pseudo <string opName, let AsmMatchConverter = !if(isLds, "cvtMubufLds", "cvtMubuf"); let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", ""); + let LGKM_CNT = isLds; + let has_vdata = !not(isLdsOpc); let mayLoad = 1; - let mayStore = 0; + let mayStore = isLds; let maybeAtomic = 1; - let Uses = !if(isLds, [EXEC, M0], [EXEC]); + let Uses = !if(!or(isLds, isLdsOpc) , [EXEC, M0], [EXEC]); let has_tfe = !not(isLds); let lds = isLds; let elements = getMUBUFElements<vdata_vt>.ret; + let VALU = isLds; } class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat < @@ -563,6 +564,20 @@ multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32> { defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>; } +multiclass MUBUF_Pseudo_Loads_LDSOpc<string opName, + ValueType load_vt = i32, + bit TiedDest = 0, + bit isLds = 0, + bit isLdsOpc = 1> { + + defvar legal_load_vt = !if(!eq(!cast<string>(load_vt), !cast<string>(v3f16)), v4f16, load_vt); + + def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds, isLdsOpc>; + def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds, isLdsOpc>; + def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds, isLdsOpc>; + def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds, isLdsOpc>; +} + class MUBUF_Store_Pseudo <string opName, int addrKind, ValueType store_vt, @@ -615,7 +630,8 @@ class MUBUF_Pseudo_Store_Lds<string opName> (outs), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol:$cpol, SWZ:$swz), " $srsrc, $soffset$offset lds$cpol$swz"> { - let mayLoad = 0; + let LGKM_CNT = 1; + let mayLoad = 1; let mayStore = 1; let maybeAtomic = 1; @@ -623,6 +639,7 @@ class MUBUF_Pseudo_Store_Lds<string opName> let has_vaddr = 0; let has_tfe = 0; let lds = 1; + let VALU = 1; let Uses = [EXEC, M0]; let AsmMatchConverter = "cvtMubufLds"; @@ -785,7 +802,7 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName, multiclass MUBUF_Pseudo_Atomics <string opName, RegisterClass vdataClass, ValueType vdataType, - SDPatternOperator atomic> : + SDPatternOperator atomic = null_frag> : MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType>, MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>; @@ -898,6 +915,29 @@ defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads < "buffer_load_dwordx4", v4i32 >; +defm BUFFER_LOAD_LDS_B32 : MUBUF_Pseudo_Loads_LDSOpc < + "buffer_load_lds_b32", i32 +>; +defm BUFFER_LOAD_LDS_FORMAT_X : MUBUF_Pseudo_Loads_LDSOpc < + "buffer_load_lds_format_x", f32 +>; +defm 
BUFFER_LOAD_LDS_I8 : MUBUF_Pseudo_Loads_LDSOpc < + "buffer_load_lds_i8", i32 +>; +defm BUFFER_LOAD_LDS_I16 : MUBUF_Pseudo_Loads_LDSOpc < + "buffer_load_lds_i16", i32 +>; +defm BUFFER_LOAD_LDS_U8 : MUBUF_Pseudo_Loads_LDSOpc < + "buffer_load_lds_u8", i32 +>; +defm BUFFER_LOAD_LDS_U16 : MUBUF_Pseudo_Loads_LDSOpc < + "buffer_load_lds_u16", i32 +>; + +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_16_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_16_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, zextloadi8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>; @@ -909,21 +949,6 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", v2i32, load_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", v3i32, load_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>; -// This is not described in AMD documentation, -// but 'lds' versions of these opcodes are available -// in at least GFX8+ chips. See Bug 37653. -let SubtargetPredicate = isGFX8GFX9 in { -defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads < - "buffer_load_dwordx2", v2i32, 0, 1 ->; -defm BUFFER_LOAD_DWORDX3_LDS : MUBUF_Pseudo_Loads < - "buffer_load_dwordx3", v3i32, 0, 1 ->; -defm BUFFER_LOAD_DWORDX4_LDS : MUBUF_Pseudo_Loads < - "buffer_load_dwordx4", v4i32, 0, 1 ->; -} - defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores < "buffer_store_byte", i32, truncstorei8_global >; @@ -943,82 +968,82 @@ defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores < "buffer_store_dwordx4", v4i32, store_global >; defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics < - "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global_32 + "buffer_atomic_swap", VGPR_32, i32 >; defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics < - "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag + "buffer_atomic_cmpswap", VReg_64, v2i32 >; defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics < - "buffer_atomic_add", VGPR_32, i32, atomic_load_add_global_32 + "buffer_atomic_add", VGPR_32, i32 >; defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics < - "buffer_atomic_sub", VGPR_32, i32, atomic_load_sub_global_32 + "buffer_atomic_sub", VGPR_32, i32 >; defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_smin", VGPR_32, i32, atomic_load_min_global_32 + "buffer_atomic_smin", VGPR_32, i32 >; defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_umin", VGPR_32, i32, atomic_load_umin_global_32 + "buffer_atomic_umin", VGPR_32, i32 >; defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_smax", VGPR_32, i32, atomic_load_max_global_32 + "buffer_atomic_smax", VGPR_32, i32 >; defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_umax", VGPR_32, i32, atomic_load_umax_global_32 + "buffer_atomic_umax", VGPR_32, i32 >; defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics < - "buffer_atomic_and", VGPR_32, i32, atomic_load_and_global_32 + "buffer_atomic_and", VGPR_32, i32 >; defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics < - "buffer_atomic_or", VGPR_32, i32, atomic_load_or_global_32 + "buffer_atomic_or", VGPR_32, i32 >; defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics < - "buffer_atomic_xor", VGPR_32, i32, atomic_load_xor_global_32 + "buffer_atomic_xor", VGPR_32, i32 >; defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics < - 
"buffer_atomic_inc", VGPR_32, i32, atomic_inc_global_32 + "buffer_atomic_inc", VGPR_32, i32 >; defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics < - "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global_32 + "buffer_atomic_dec", VGPR_32, i32 >; defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global_64 + "buffer_atomic_swap_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag + "buffer_atomic_cmpswap_x2", VReg_128, v2i64 >; defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_add_x2", VReg_64, i64, atomic_load_add_global_64 + "buffer_atomic_add_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_sub_x2", VReg_64, i64, atomic_load_sub_global_64 + "buffer_atomic_sub_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_smin_x2", VReg_64, i64, atomic_load_min_global_64 + "buffer_atomic_smin_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_umin_x2", VReg_64, i64, atomic_load_umin_global_64 + "buffer_atomic_umin_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_smax_x2", VReg_64, i64, atomic_load_max_global_64 + "buffer_atomic_smax_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_umax_x2", VReg_64, i64, atomic_load_umax_global_64 + "buffer_atomic_umax_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_and_x2", VReg_64, i64, atomic_load_and_global_64 + "buffer_atomic_and_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_or_x2", VReg_64, i64, atomic_load_or_global_64 + "buffer_atomic_or_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_xor_x2", VReg_64, i64, atomic_load_xor_global_64 + "buffer_atomic_xor_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global_64 + "buffer_atomic_inc_x2", VReg_64, i64 >; defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64 + "buffer_atomic_dec_x2", VReg_64, i64 >; let SubtargetPredicate = HasGFX10_BEncoding in @@ -1040,7 +1065,7 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; } -let SubtargetPredicate = isGFX6GFX7GFX10 in { +let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics < "buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag @@ -1051,6 +1076,11 @@ defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics < defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics < "buffer_atomic_fmax", VGPR_32, f32, null_frag >; + +} + +let SubtargetPredicate = isGFX6GFX7GFX10 in { + defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics < "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag >; @@ -1109,23 +1139,25 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; -let SubtargetPredicate = HasAtomicFaddInsts in { -defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN < +let SubtargetPredicate = HasAtomicFaddNoRtnInsts in +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN< "buffer_atomic_add_f32", VGPR_32, f32 >; + +let SubtargetPredicate = HasAtomicPkFaddNoRtnInsts in defm BUFFER_ATOMIC_PK_ADD_F16 : 
MUBUF_Pseudo_Atomics_NO_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16 >; -let OtherPredicates = [isGFX90APlus] in { -defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN < +let OtherPredicates = [HasAtomicFaddRtnInsts] in +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN< "buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_32 >; + +let OtherPredicates = [isGFX90APlus] in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_32 >; -} -} // End SubtargetPredicate = HasAtomicFaddInsts //===----------------------------------------------------------------------===// // MTBUF Instructions @@ -1175,15 +1207,28 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol", let SubtargetPredicate = isGFX90APlus in { def BUFFER_WBL2 : MUBUF_Invalidate<"buffer_wbl2"> { + let has_glc = 1; + let has_sccb = 1; + let InOperandList = (ins CPol_0:$cpol); + let AsmOperands = "$cpol"; } def BUFFER_INVL2 : MUBUF_Invalidate<"buffer_invl2"> { + let SubtargetPredicate = isGFX90AOnly; } - defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>; - defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>; - defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>; + defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>; + defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>; + defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>; } // End SubtargetPredicate = isGFX90APlus +def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> { + let SubtargetPredicate = isGFX940Plus; + let has_glc = 1; + let has_sccb = 1; + let InOperandList = (ins CPol_0:$cpol); + let AsmOperands = "$cpol"; +} + let SubtargetPredicate = isGFX10Plus in { def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">; def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">; @@ -1364,75 +1409,169 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">; // buffer_atomic patterns //===----------------------------------------------------------------------===// -multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt, - string opcode> { +multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> { + foreach RtnMode = ["ret", "noret"] in { + + defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode + # !if(isIntr, "", "_" # vt.Size)); + defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + + def : GCNPat< + (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), vt:$vdata_in)), + (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset) + >; + + def : GCNPat< + (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset), + vt:$vdata_in)), + (!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, + VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset) + >; + + } // end foreach RtnMode +} + +multiclass BufferAtomicIntrPat<string OpPrefix, ValueType vt, string Inst> { + defm : BufferAtomicPat<OpPrefix, vt, Inst, /* isIntr */ 1>; +} + +multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> { + foreach RtnMode = ["ret", 
"noret"] in { + + defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global_" # RtnMode + # "_" # vt.Size); + defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + + defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) + getVregSrcForVT<data_vt>.ret:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset); + def : GCNPat< + (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), data_vt:$vdata_in)), + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS OffsetResDag, getVregSrcForVT<data_vt>.ret)), + !if(!eq(vt, i32), sub0, sub0_sub1)), + OffsetResDag) + >; + + defvar Addr64ResDag = (!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix) + getVregSrcForVT<data_vt>.ret:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset); + def : GCNPat< + (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset), + data_vt:$vdata_in)), + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS Addr64ResDag, getVregSrcForVT<data_vt>.ret)), + !if(!eq(vt, i32), sub0, sub0_sub1)), + Addr64ResDag) + >; + + } // end foreach RtnMode +} + +foreach Ty = [i32, i64] in { + +defvar Suffix = !if(!eq(Ty, i64), "_X2", ""); + +defm : BufferAtomicPat<"atomic_swap_global", Ty, "BUFFER_ATOMIC_SWAP" # Suffix>; +defm : BufferAtomicPat<"atomic_load_add_global", Ty, "BUFFER_ATOMIC_ADD" # Suffix>; +defm : BufferAtomicPat<"atomic_load_sub_global", Ty, "BUFFER_ATOMIC_SUB" # Suffix>; +defm : BufferAtomicPat<"atomic_load_min_global", Ty, "BUFFER_ATOMIC_SMIN" # Suffix>; +defm : BufferAtomicPat<"atomic_load_umin_global", Ty, "BUFFER_ATOMIC_UMIN" # Suffix>; +defm : BufferAtomicPat<"atomic_load_max_global", Ty, "BUFFER_ATOMIC_SMAX" # Suffix>; +defm : BufferAtomicPat<"atomic_load_umax_global", Ty, "BUFFER_ATOMIC_UMAX" # Suffix>; +defm : BufferAtomicPat<"atomic_load_and_global", Ty, "BUFFER_ATOMIC_AND" # Suffix>; +defm : BufferAtomicPat<"atomic_load_or_global", Ty, "BUFFER_ATOMIC_OR" # Suffix>; +defm : BufferAtomicPat<"atomic_load_xor_global", Ty, "BUFFER_ATOMIC_XOR" # Suffix>; +defm : BufferAtomicPat<"atomic_inc_global", Ty, "BUFFER_ATOMIC_INC" # Suffix>; +defm : BufferAtomicPat<"atomic_dec_global", Ty, "BUFFER_ATOMIC_DEC" # Suffix>; + +} // end foreach Ty + +defm : BufferAtomicCmpSwapPat<i32, v2i32, "BUFFER_ATOMIC_CMPSWAP">; +defm : BufferAtomicCmpSwapPat<i64, v2i64, "BUFFER_ATOMIC_CMPSWAP_X2">; + +multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, + list<string> RtnModes = ["ret", "noret"]> { + foreach RtnMode = RtnModes in { + + defvar Op = !cast<SDPatternOperator>(!if(!eq(RtnMode, "none"), + OpPrefix, OpPrefix # "_" # RtnMode)); + defvar InstSuffix = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, "ret")), + "_RTN", ""); + defvar CachePolicy = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, "ret")), + (set_glc $cachepolicy), (timm:$cachepolicy)); + def : GCNPat< - (vt (name vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset, + (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, 0)), - (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) + (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (set_glc $cachepolicy)) + (as_i16timm $offset), CachePolicy) >; def : GCNPat< - (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, + (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, timm)), - (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) 
getVregSrcForVT<vt>.ret:$vdata_in, - VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (set_glc $cachepolicy)) + (!cast<MUBUF_Pseudo>(Inst # "_IDXEN" # InstSuffix) + getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, + SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy) >; def : GCNPat< - (vt (name vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset, + (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, 0)), - (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in, - VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (set_glc $cachepolicy)) + (!cast<MUBUF_Pseudo>(Inst # "_OFFEN" # InstSuffix) + getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, + SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy) >; def : GCNPat< - (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset, + (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, timm)), - (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN) + (!cast<MUBUF_Pseudo>(Inst # "_BOTHEN" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (set_glc $cachepolicy)) + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy) >; + + } // end foreach RtnMode } -defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i32, "BUFFER_ATOMIC_SWAP">; -defm : BufferAtomicPatterns<SIbuffer_atomic_swap, f32, "BUFFER_ATOMIC_SWAP">; -defm : BufferAtomicPatterns<SIbuffer_atomic_add, i32, "BUFFER_ATOMIC_ADD">; -defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i32, "BUFFER_ATOMIC_SUB">; -defm : BufferAtomicPatterns<SIbuffer_atomic_smin, i32, "BUFFER_ATOMIC_SMIN">; -defm : BufferAtomicPatterns<SIbuffer_atomic_umin, i32, "BUFFER_ATOMIC_UMIN">; -defm : BufferAtomicPatterns<SIbuffer_atomic_smax, i32, "BUFFER_ATOMIC_SMAX">; -defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i32, "BUFFER_ATOMIC_UMAX">; -defm : BufferAtomicPatterns<SIbuffer_atomic_and, i32, "BUFFER_ATOMIC_AND">; -defm : BufferAtomicPatterns<SIbuffer_atomic_or, i32, "BUFFER_ATOMIC_OR">; -defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i32, "BUFFER_ATOMIC_XOR">; -defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i32, "BUFFER_ATOMIC_INC">; -defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i32, "BUFFER_ATOMIC_DEC">; -defm : BufferAtomicPatterns<SIbuffer_atomic_csub, i32, "BUFFER_ATOMIC_CSUB">; -defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i64, "BUFFER_ATOMIC_SWAP_X2">; -defm : BufferAtomicPatterns<SIbuffer_atomic_add, i64, "BUFFER_ATOMIC_ADD_X2">; -defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i64, "BUFFER_ATOMIC_SUB_X2">; -defm : BufferAtomicPatterns<SIbuffer_atomic_smin, i64, "BUFFER_ATOMIC_SMIN_X2">; -defm : BufferAtomicPatterns<SIbuffer_atomic_umin, i64, "BUFFER_ATOMIC_UMIN_X2">; -defm : BufferAtomicPatterns<SIbuffer_atomic_smax, i64, "BUFFER_ATOMIC_SMAX_X2">; -defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i64, "BUFFER_ATOMIC_UMAX_X2">; -defm : BufferAtomicPatterns<SIbuffer_atomic_and, i64, "BUFFER_ATOMIC_AND_X2">; -defm : BufferAtomicPatterns<SIbuffer_atomic_or, i64, "BUFFER_ATOMIC_OR_X2">; -defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">; -defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i64, "BUFFER_ATOMIC_INC_X2">; -defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i64, "BUFFER_ATOMIC_DEC_X2">; +defm : 
SIBufferAtomicPat<"SIbuffer_atomic_swap", i32, "BUFFER_ATOMIC_SWAP">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", f32, "BUFFER_ATOMIC_SWAP">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_add", i32, "BUFFER_ATOMIC_ADD">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_sub", i32, "BUFFER_ATOMIC_SUB">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_smin", i32, "BUFFER_ATOMIC_SMIN">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_umin", i32, "BUFFER_ATOMIC_UMIN">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_smax", i32, "BUFFER_ATOMIC_SMAX">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_umax", i32, "BUFFER_ATOMIC_UMAX">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_and", i32, "BUFFER_ATOMIC_AND">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_or", i32, "BUFFER_ATOMIC_OR">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i32, "BUFFER_ATOMIC_XOR">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i32, "BUFFER_ATOMIC_INC">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i32, "BUFFER_ATOMIC_DEC">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["none"]>; +defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", i64, "BUFFER_ATOMIC_SWAP_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_add", i64, "BUFFER_ATOMIC_ADD_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_sub", i64, "BUFFER_ATOMIC_SUB_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_smin", i64, "BUFFER_ATOMIC_SMIN_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_umin", i64, "BUFFER_ATOMIC_UMIN_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_smax", i64, "BUFFER_ATOMIC_SMAX_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_umax", i64, "BUFFER_ATOMIC_UMAX_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_and", i64, "BUFFER_ATOMIC_AND_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_or", i64, "BUFFER_ATOMIC_OR_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i64, "BUFFER_ATOMIC_XOR_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i64, "BUFFER_ATOMIC_INC_X2">; +defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">; +let SubtargetPredicate = isGFX6GFX7GFX10Plus in { + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">; +} let SubtargetPredicate = isGFX6GFX7GFX10 in { - defm : BufferAtomicPatterns<SIbuffer_atomic_fmin, f32, "BUFFER_ATOMIC_FMIN">; - defm : BufferAtomicPatterns<SIbuffer_atomic_fmax, f32, "BUFFER_ATOMIC_FMAX">; - defm : BufferAtomicPatterns<SIbuffer_atomic_fmin, f64, "BUFFER_ATOMIC_FMIN_X2">; - defm : BufferAtomicPatterns<SIbuffer_atomic_fmax, f64, "BUFFER_ATOMIC_FMAX_X2">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_FMIN_X2">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_FMAX_X2">; } class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag < @@ -1482,71 +1621,89 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, >; } -let SubtargetPredicate = HasAtomicFaddInsts in { +let SubtargetPredicate = HasAtomicFaddNoRtnInsts in defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD_F32">; + +let SubtargetPredicate = HasAtomicPkFaddNoRtnInsts in defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">; -} + +let SubtargetPredicate = HasAtomicFaddRtnInsts in + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32">; let SubtargetPredicate = isGFX90APlus in { - defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, f32, 
"BUFFER_ATOMIC_ADD_F32">; - defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">; + defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; + defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; + defm : BufferAtomicIntrPat<"int_amdgcn_global_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16">; - defm : BufferAtomicPatterns<SIbuffer_atomic_fadd, f64, "BUFFER_ATOMIC_ADD_F64">; - defm : BufferAtomicPatterns<SIbuffer_atomic_fmin, f64, "BUFFER_ATOMIC_MIN_F64">; - defm : BufferAtomicPatterns<SIbuffer_atomic_fmax, f64, "BUFFER_ATOMIC_MAX_F64">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; + defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">; } // End SubtargetPredicate = isGFX90APlus +foreach RtnMode = ["ret", "noret"] in { + +defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap # "_" # RtnMode); +defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); +defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy), + (timm:$cachepolicy)); + +defvar OffsetResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_OFFSET" # InstSuffix) + (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy); def : GCNPat< - (SIbuffer_atomic_cmpswap + (Op i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), - (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS - (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN - (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (set_glc $cachepolicy)), VReg_64)), sub0) + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS OffsetResDag, VReg_64)), sub0), + OffsetResDag) >; +defvar IdxenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_IDXEN" # InstSuffix) + (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), + VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + CachePolicy); def : GCNPat< - (SIbuffer_atomic_cmpswap + (Op i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, timm), - (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS - (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN - (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (set_glc $cachepolicy)), VReg_64)), - sub0) + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS IdxenResDag, VReg_64)), sub0), + IdxenResDag) >; +defvar OffenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_OFFEN" # InstSuffix) + (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), + VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + CachePolicy); def : GCNPat< - (SIbuffer_atomic_cmpswap + (Op i32:$data, i32:$cmp, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), - (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS - (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN - (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (set_glc $cachepolicy)), VReg_64)), - sub0) + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS 
OffenResDag, VReg_64)), sub0), + OffenResDag) >; +defvar BothenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_BOTHEN" # InstSuffix) + (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), + (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy); def : GCNPat< - (SIbuffer_atomic_cmpswap + (Op i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, timm), - (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS - (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN - (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (set_glc $cachepolicy)), VReg_64)), - sub0) + !if(!eq(RtnMode, "ret"), + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS BothenResDag, VReg_64)), sub0), + BothenResDag) >; +} // end foreach RtnMode + class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt, PatFrag constant_ld> : GCNPat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, @@ -1682,8 +1839,12 @@ multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo In >; } let SubtargetPredicate = isGFX6GFX7 in { -defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, atomic_store_global_32>; -defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, atomic_store_global_64>; +defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_BYTE_ADDR64, BUFFER_STORE_BYTE_OFFSET, i32, atomic_store_8_global>; +defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_BYTE_ADDR64, BUFFER_STORE_BYTE_OFFSET, i16, atomic_store_8_global>; +defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_SHORT_ADDR64, BUFFER_STORE_SHORT_OFFSET, i32, atomic_store_16_global>; +defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_SHORT_ADDR64, BUFFER_STORE_SHORT_OFFSET, i16, atomic_store_16_global>; +defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, atomic_store_32_global>; +defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, atomic_store_64_global>; } // End Predicates = isGFX6GFX7 @@ -1731,7 +1892,7 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX3_OFFEN, BUFFER_STORE_DWORDX3_OF defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private, VReg_128>; -let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in { +let OtherPredicates = [HasD16LoadStore, DisableFlatScratch] in { // Hiding the extract high pattern in the PatFrag seems to not // automatically increase the complexity. let AddedComplexity = 1 in { @@ -1882,24 +2043,41 @@ let SubtargetPredicate = HasPackedD16VMem in { //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Base ENC_MUBUF for GFX6, GFX7, GFX10. +// Base ENC_MUBUF for GFX6, GFX7, GFX10, GFX11. 
//===----------------------------------------------------------------------===// -class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> : - MUBUF_Real<ps>, Enc64, SIMCInstr<ps.PseudoInstr, ef> { +class Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11 <MUBUF_Pseudo ps, int ef, + string real_name = ps.Mnemonic> : + MUBUF_Real<ps, real_name>, Enc64, SIMCInstr<ps.PseudoInstr, ef> { let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{31-26} = 0x38; + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); +} + +class MUBUF_Real_gfx11<bits<8> op, MUBUF_Pseudo ps, + string real_name = ps.Mnemonic> : + Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, SIEncodingFamily.GFX11, real_name> { + let Inst{12} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); + let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value); + let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); + let Inst{25-18} = op; + let Inst{53} = !if(ps.has_tfe, tfe, ?); + let Inst{54} = ps.offen; + let Inst{55} = ps.idxen; +} + +class Base_MUBUF_Real_gfx6_gfx7_gfx10<bits<7> op, MUBUF_Pseudo ps, int ef> : + Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, ef> { let Inst{12} = ps.offen; let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); let Inst{16} = ps.lds; let Inst{24-18} = op; - let Inst{31-26} = 0x38; - let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); - let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); let Inst{55} = !if(ps.has_tfe, tfe, ?); - let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } class MUBUF_Real_gfx10<bits<8> op, MUBUF_Pseudo ps> : @@ -1914,10 +2092,155 @@ class MUBUF_Real_gfx6_gfx7<bits<8> op, MUBUF_Pseudo ps> : } //===----------------------------------------------------------------------===// +// MUBUF - GFX11. 
+//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in +multiclass MUBUF_Real_AllAddr_gfx11_Renamed_Impl<bits<8> op, string real_name> { + def _BOTHEN_gfx11 : + MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN"), real_name>, + AtomicNoRet<NAME # "_BOTHEN_gfx11", 0>; + def _IDXEN_gfx11 : + MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN"), real_name>, + AtomicNoRet<NAME # "_IDXEN_gfx11", 0>; + def _OFFEN_gfx11 : + MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN"), real_name>, + AtomicNoRet<NAME # "_OFFEN_gfx11", 0>; + def _OFFSET_gfx11 : + MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET"), real_name>, + AtomicNoRet<NAME # "_OFFSET_gfx11", 0>; +} + +multiclass MUBUF_Real_AllAddr_gfx11_Impl<bits<8> op, MUBUF_Pseudo ps> : + MUBUF_Real_AllAddr_gfx11_Renamed_Impl<op, ps.Mnemonic>; +multiclass MUBUF_Real_AllAddr_gfx11<bits<8> op> : + MUBUF_Real_AllAddr_gfx11_Impl<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; + +class Pre_gfx11_MUBUF_Name <MUBUF_Pseudo ps, string real_name> : + MnemonicAlias<ps.Mnemonic, real_name>, Requires<[isGFX11Plus]>; +multiclass MUBUF_Real_AllAddr_gfx11_Renamed<bits<8> op, string real_name> : + MUBUF_Real_AllAddr_gfx11_Renamed_Impl<op, real_name> { + def : Pre_gfx11_MUBUF_Name<!cast<MUBUF_Pseudo>(NAME#"_BOTHEN"), real_name>; +} + +let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in +multiclass MUBUF_Real_Atomics_RTN_gfx11_Renamed<bits<8> op, string real_name> { + def _BOTHEN_RTN_gfx11 : + MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN"), real_name>, + AtomicNoRet<NAME # "_BOTHEN_gfx11", 1>; + def _IDXEN_RTN_gfx11 : + MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN"), real_name>, + AtomicNoRet<NAME # "_IDXEN_gfx11", 1>; + def _OFFEN_RTN_gfx11 : + MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN"), real_name>, + AtomicNoRet<NAME # "_OFFEN_gfx11", 1>; + def _OFFSET_RTN_gfx11 : + MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN"), real_name>, + AtomicNoRet<NAME # "_OFFSET_gfx11", 1>; +} + +multiclass MUBUF_Real_Atomics_RTN_gfx11_impl<bits<8> op, MUBUF_Pseudo ps> : + MUBUF_Real_Atomics_RTN_gfx11_Renamed<op, ps.Mnemonic>; +multiclass MUBUF_Real_Atomics_RTN_gfx11<bits<8> op> : + MUBUF_Real_Atomics_RTN_gfx11_impl<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; + +multiclass MUBUF_Real_Atomics_gfx11<bits<8> op> : + MUBUF_Real_AllAddr_gfx11<op>, + MUBUF_Real_Atomics_RTN_gfx11<op>; + +multiclass MUBUF_Real_Atomics_gfx11_Renamed<bits<8> op, string real_name> : + MUBUF_Real_AllAddr_gfx11_Renamed<op, real_name>, + MUBUF_Real_Atomics_RTN_gfx11_Renamed<op, real_name>; + +let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { +def BUFFER_GL0_INV_gfx11 : MUBUF_Real_gfx11<0x02B, BUFFER_GL0_INV>; +def BUFFER_GL1_INV_gfx11 : MUBUF_Real_gfx11<0x02C, BUFFER_GL1_INV>; +} + +defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_gfx11_Renamed<0x014, "buffer_load_b32">; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_gfx11_Renamed<0x015, "buffer_load_b64">; +defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_gfx11_Renamed<0x016, "buffer_load_b96">; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_gfx11_Renamed<0x017, "buffer_load_b128">; +defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x020, "buffer_load_d16_b16">; +defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x008, "buffer_load_d16_format_x">; +defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx11_Renamed<0x009, "buffer_load_d16_format_xy">; 
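A brief aside on the renaming machinery above (the GFX11 rename list resumes right below): Pre_gfx11_MUBUF_Name emits a MnemonicAlias gated on isGFX11Plus, so GFX11 assembly still accepts the pre-GFX11 spellings. A minimal Python sketch of that behavior, using a small subset of the rename table from the surrounding defm list rather than the assembler's real matching machinery:

```python
# Rough model of the MnemonicAlias set up by Pre_gfx11_MUBUF_Name: on
# GFX11+ targets the old mnemonic is rewritten to the renamed one before
# instruction matching; on older targets it matches directly.
# Only a handful of the renames from the surrounding defm list are shown.
PRE_GFX11_ALIASES = {
    "buffer_load_dword":   "buffer_load_b32",
    "buffer_load_dwordx2": "buffer_load_b64",
    "buffer_store_byte":   "buffer_store_b8",
    "buffer_atomic_add":   "buffer_atomic_add_u32",
}

def resolve_mnemonic(mnemonic: str, is_gfx11_plus: bool) -> str:
    # MnemonicAlias<...>, Requires<[isGFX11Plus]>: the alias only fires
    # on GFX11 and newer.
    if is_gfx11_plus:
        return PRE_GFX11_ALIASES.get(mnemonic, mnemonic)
    return mnemonic

assert resolve_mnemonic("buffer_load_dword", True) == "buffer_load_b32"
assert resolve_mnemonic("buffer_load_dword", False) == "buffer_load_dword"
```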
+defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx11_Renamed<0x00a, "buffer_load_d16_format_xyz">; +defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx11_Renamed<0x00b, "buffer_load_d16_format_xyzw">; +defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x023, "buffer_load_d16_hi_b16">; +defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x026, "buffer_load_d16_hi_format_x">; +defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x022, "buffer_load_d16_hi_i8">; +defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x021, "buffer_load_d16_hi_u8">; +defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01f, "buffer_load_d16_i8">; +defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01e, "buffer_load_d16_u8">; +defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x000>; +defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_gfx11<0x001>; +defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx11<0x002>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx11<0x003>; +defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x011, "buffer_load_i8">; +defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x013, "buffer_load_i16">; +defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x010, "buffer_load_u8">; +defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x012, "buffer_load_u16">; +defm BUFFER_LOAD_LDS_B32 : MUBUF_Real_AllAddr_gfx11<0x031>; +defm BUFFER_LOAD_LDS_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x032>; +defm BUFFER_LOAD_LDS_I8 : MUBUF_Real_AllAddr_gfx11<0x02e>; +defm BUFFER_LOAD_LDS_I16 : MUBUF_Real_AllAddr_gfx11<0x030>; +defm BUFFER_LOAD_LDS_U8 : MUBUF_Real_AllAddr_gfx11<0x02d>; +defm BUFFER_LOAD_LDS_U16 : MUBUF_Real_AllAddr_gfx11<0x02f>; +defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x018, "buffer_store_b8">; +defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x019, "buffer_store_b16">; +defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_gfx11_Renamed<0x01A, "buffer_store_b32">; +defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01B, "buffer_store_b64">; +defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01C, "buffer_store_b96">; +defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01D, "buffer_store_b128">; +defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x00C, "buffer_store_d16_format_x">; +defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx11_Renamed<0x00D, "buffer_store_d16_format_xy">; +defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx11_Renamed<0x00E, "buffer_store_d16_format_xyz">; +defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx11_Renamed<0x00F, "buffer_store_d16_format_xyzw">; +defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x024, "buffer_store_d16_hi_b8">; +defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x025, "buffer_store_d16_hi_b16">; +defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x027, "buffer_store_d16_hi_format_x">; +defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x004>; +defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_gfx11<0x005>; +defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx11<0x006>; +defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx11<0x007>; +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomics_gfx11<0x056>; +defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomics_gfx11_Renamed<0x035, "buffer_atomic_add_u32">; +defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x043, "buffer_atomic_add_u64">; 
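To make the GFX11 bit layout declared in Base_MUBUF_Real_gfx6_gfx7_gfx10_gfx11 and MUBUF_Real_gfx11 above concrete, here is a small illustrative packer (the atomic rename list continues after it). It only restates the Inst{...} field positions from the TableGen; it is not the MC code emitter, and the sample operand values are invented:

```python
# Field placement per the TableGen above: offset[11:0], slc@12, dlc@13,
# glc@14, op[25:18], ENC_MUBUF (0x38) at [31:26], vaddr[39:32],
# vdata[47:40], srsrc[52:48] (SGPR-quad index, srsrc{6:2}), tfe@53,
# offen@54, idxen@55, soffset[63:56].
def encode_mubuf_gfx11(op, offset=0, slc=0, dlc=0, glc=0, vaddr=0,
                       vdata=0, srsrc=0, tfe=0, offen=0, idxen=0,
                       soffset=0):
    word = offset & 0xFFF
    word |= (slc & 1) << 12
    word |= (dlc & 1) << 13
    word |= (glc & 1) << 14
    word |= (op & 0xFF) << 18
    word |= 0x38 << 26
    word |= (vaddr & 0xFF) << 32
    word |= (vdata & 0xFF) << 40
    word |= (srsrc & 0x1F) << 48
    word |= (tfe & 1) << 53
    word |= (offen & 1) << 54
    word |= (idxen & 1) << 55
    word |= (soffset & 0xFF) << 56
    return word

# e.g. an OFFEN form of buffer_load_b32 (op 0x014 in the list above),
# with made-up register fields:
print(hex(encode_mubuf_gfx11(op=0x14, offen=1, vdata=5, soffset=0x80)))
```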
+defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomics_gfx11_Renamed<0x03C, "buffer_atomic_and_b32">; +defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x049, "buffer_atomic_and_b64">; +defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomics_gfx11_Renamed<0x034, "buffer_atomic_cmpswap_b32">; +defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x042, "buffer_atomic_cmpswap_b64">; +defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx11_Renamed<0x050, "buffer_atomic_cmpswap_f32">; +defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_RTN_gfx11_Renamed<0x037, "buffer_atomic_csub_u32">; +def : MnemonicAlias<"buffer_atomic_csub", "buffer_atomic_csub_u32">, Requires<[isGFX11Plus]>; +defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomics_gfx11_Renamed<0x040, "buffer_atomic_dec_u32">; +defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x04D, "buffer_atomic_dec_u64">; +defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomics_gfx11_Renamed<0x03F, "buffer_atomic_inc_u32">; +defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x04C, "buffer_atomic_inc_u64">; +defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx11_Renamed<0x052, "buffer_atomic_max_f32">; +defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomics_gfx11_Renamed<0x03A, "buffer_atomic_max_i32">; +defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x047, "buffer_atomic_max_i64">; +defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomics_gfx11_Renamed<0x03B, "buffer_atomic_max_u32">; +defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x048, "buffer_atomic_max_u64">; +defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx11_Renamed<0x051, "buffer_atomic_min_f32">; +defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomics_gfx11_Renamed<0x038, "buffer_atomic_min_i32">; +defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x045, "buffer_atomic_min_i64">; +defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomics_gfx11_Renamed<0x039, "buffer_atomic_min_u32">; +defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x046, "buffer_atomic_min_u64">; +defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomics_gfx11_Renamed<0x03D, "buffer_atomic_or_b32">; +defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x04A, "buffer_atomic_or_b64">; +defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomics_gfx11_Renamed<0x036, "buffer_atomic_sub_u32">; +defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x044, "buffer_atomic_sub_u64">; +defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomics_gfx11_Renamed<0x033, "buffer_atomic_swap_b32">; +defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x041, "buffer_atomic_swap_b64">; +defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomics_gfx11_Renamed<0x03E, "buffer_atomic_xor_b32">; +defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomics_gfx11_Renamed<0x04B, "buffer_atomic_xor_b64">; + +//===----------------------------------------------------------------------===// // MUBUF - GFX10. 
//===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass MUBUF_Real_AllAddr_gfx10<bits<8> op> { def _BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; @@ -1929,23 +2252,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; } multiclass MUBUF_Real_AllAddr_Lds_gfx10<bits<8> op> { - def _OFFSET_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, - MUBUFLdsTable<0, NAME # "_OFFSET_gfx10">; - def _OFFEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>, - MUBUFLdsTable<0, NAME # "_OFFEN_gfx10">; - def _IDXEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>, - MUBUFLdsTable<0, NAME # "_IDXEN_gfx10">; - def _BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>, - MUBUFLdsTable<0, NAME # "_BOTHEN_gfx10">; + def _OFFSET_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; + def _OFFEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; - def _LDS_OFFSET_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>, - MUBUFLdsTable<1, NAME # "_OFFSET_gfx10">; - def _LDS_OFFEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>, - MUBUFLdsTable<1, NAME # "_OFFEN_gfx10">; - def _LDS_IDXEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>, - MUBUFLdsTable<1, NAME # "_IDXEN_gfx10">; - def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, - MUBUFLdsTable<1, NAME # "_BOTHEN_gfx10">; + def _LDS_OFFSET_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>; + def _LDS_OFFEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>; + def _LDS_IDXEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>; + def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>; } multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op> { def _BOTHEN_RTN_gfx10 : @@ -1976,7 +2291,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, AtomicNoRet<NAME # "_OFFSET_gfx10", 0>; } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>; defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx10<0x01b>; @@ -2033,27 +2348,17 @@ let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; } multiclass MUBUF_Real_AllAddr_Lds_gfx6_gfx7<bits<8> op> { - def _OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, - MUBUFLdsTable<0, NAME # "_OFFSET_gfx6_gfx7">; - def _ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>, - MUBUFLdsTable<0, NAME # "_ADDR64_gfx6_gfx7">; - def _OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>, - MUBUFLdsTable<0, NAME # "_OFFEN_gfx6_gfx7">; - def _IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>, - MUBUFLdsTable<0, NAME # 
"_IDXEN_gfx6_gfx7">; - def _BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>, - MUBUFLdsTable<0, NAME # "_BOTHEN_gfx6_gfx7">; + def _OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; + def _ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>; + def _OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; - def _LDS_OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>, - MUBUFLdsTable<1, NAME # "_OFFSET_gfx6_gfx7">; - def _LDS_ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>, - MUBUFLdsTable<1, NAME # "_ADDR64_gfx6_gfx7">; - def _LDS_OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>, - MUBUFLdsTable<1, NAME # "_OFFEN_gfx6_gfx7">; - def _LDS_IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>, - MUBUFLdsTable<1, NAME # "_IDXEN_gfx6_gfx7">; - def _LDS_BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, - MUBUFLdsTable<1, NAME # "_BOTHEN_gfx6_gfx7">; + def _LDS_OFFSET_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>; + def _LDS_ADDR64_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>; + def _LDS_OFFEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>; + def _LDS_IDXEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>; + def _LDS_BOTHEN_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>; } multiclass MUBUF_Real_Atomics_gfx6_gfx7<bits<8> op> { def _ADDR64_gfx6_gfx7 : @@ -2167,26 +2472,89 @@ defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>; def BUFFER_WBINVL1_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<0x071, BUFFER_WBINVL1>; //===----------------------------------------------------------------------===// -// Base ENC_MTBUF for GFX6, GFX7, GFX10. +// Base ENC_MTBUF for GFX6, GFX7, GFX10, GFX11. 
//===----------------------------------------------------------------------===// -class Base_MTBUF_Real_gfx6_gfx7_gfx10<bits<3> op, MTBUF_Pseudo ps, int ef> : - MTBUF_Real<ps>, Enc64, SIMCInstr<ps.PseudoInstr, ef> { +class Base_MTBUF_Real_gfx6_gfx7_gfx10_gfx11<MTBUF_Pseudo ps, int ef, + string real_name = ps.Mnemonic> : + MTBUF_Real<ps, real_name>, Enc64, SIMCInstr<ps.PseudoInstr, ef> { let Inst{11-0} = !if(ps.has_offset, offset, ?); - let Inst{12} = ps.offen; - let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); - let Inst{18-16} = op; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); +} + +class Base_MTBUF_Real_gfx11<bits<4> op, MTBUF_Pseudo ps, + string real_name = ps.Mnemonic> : + Base_MTBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, SIEncodingFamily.GFX11, real_name> { + let Inst{12} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); + let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value); + let Inst{18-15} = op; + let Inst{25-19} = format; + let Inst{53} = !if(ps.has_tfe, tfe, ?); + let Inst{54} = ps.offen; + let Inst{55} = ps.idxen; +} + +class Base_MTBUF_Real_gfx6_gfx7_gfx10<bits<3> op, MTBUF_Pseudo ps, int ef> : + Base_MTBUF_Real_gfx6_gfx7_gfx10_gfx11<ps, ef> { + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{18-16} = op; let Inst{54} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); let Inst{55} = !if(ps.has_tfe, tfe, ?); - let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } //===----------------------------------------------------------------------===// +// MTBUF - GFX11. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in +multiclass MTBUF_Real_AllAddr_gfx11_Renamed_Impl<bits<4> op, string real_name> { + def _BOTHEN_gfx11 : + Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN"), real_name>; + def _IDXEN_gfx11 : + Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN"), real_name>; + def _OFFEN_gfx11 : + Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN"), real_name>; + def _OFFSET_gfx11 : + Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET"), real_name>; +} + +multiclass MTBUF_Real_AllAddr_gfx11_Impl<bits<4> op, MTBUF_Pseudo ps> + : MTBUF_Real_AllAddr_gfx11_Renamed_Impl<op, ps.Mnemonic>; +multiclass MTBUF_Real_AllAddr_gfx11<bits<4> op> + : MTBUF_Real_AllAddr_gfx11_Impl<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; + + +class Pre_gfx11_MTBUF_Name <MTBUF_Pseudo ps, string real_name> + : MnemonicAlias<ps.Mnemonic, real_name>, Requires<[isGFX11Plus]>; +multiclass MTBUF_Real_AllAddr_gfx11_Renamed<bits<4> op, string real_name> + : MTBUF_Real_AllAddr_gfx11_Renamed_Impl<op, real_name> { + def : Pre_gfx11_MTBUF_Name<!cast<MTBUF_Pseudo>(NAME#"_BOTHEN"), real_name>; +} + +defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx11_Renamed<0x008, "tbuffer_load_d16_format_x">; +defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx11_Renamed<0x009, "tbuffer_load_d16_format_xy">; +defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx11_Renamed<0x00a, "tbuffer_load_d16_format_xyz">; +defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx11_Renamed<0x00b, "tbuffer_load_d16_format_xyzw">; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_gfx11<0x000>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_gfx11<0x001>; +defm 
TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx11<0x002>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx11<0x003>; +defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx11_Renamed<0x00c, "tbuffer_store_d16_format_x">; +defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx11_Renamed<0x00d, "tbuffer_store_d16_format_xy">; +defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx11_Renamed<0x00e, "tbuffer_store_d16_format_xyz">; +defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx11_Renamed<0x00f, "tbuffer_store_d16_format_xyzw">; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_gfx11<0x004>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_gfx11<0x005>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx11<0x006>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx11<0x007>; + +//===----------------------------------------------------------------------===// // MTBUF - GFX10. //===----------------------------------------------------------------------===// @@ -2197,7 +2565,7 @@ class MTBUF_Real_gfx10<bits<4> op, MTBUF_Pseudo ps> : let Inst{53} = op{3}; } -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass MTBUF_Real_AllAddr_gfx10<bits<4> op> { def _BOTHEN_gfx10 : MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; @@ -2208,7 +2576,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { def _OFFSET_gfx10 : MTBUF_Real_gfx10<op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx10<0x008>; defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx10<0x009>; @@ -2303,9 +2671,28 @@ class MUBUF_Real_gfx90a <bits<7> op, MUBUF_Pseudo ps, let Inst{55} = acc; } +class MUBUF_Real_gfx940 <bits<7> op, MUBUF_Pseudo ps> : + MUBUF_Real_Base_vi<op, ps, SIEncodingFamily.GFX940> { + let AssemblerPredicate = isGFX940Plus; + let DecoderNamespace = "GFX9"; + let AsmString = ps.Mnemonic # !subst("$tfe", "", ps.AsmOperands); + + let Inst{55} = acc; +} + multiclass MUBUF_Real_vi_gfx90a<bits<7> op, MUBUF_Pseudo ps> { def _vi : MUBUF_Real_vi<op, ps>; - def _gfx90a : MUBUF_Real_gfx90a<op, ps, !and(ps.has_sccb,!not(ps.FPAtomic))>; + + foreach _ = BoolToList<!not(ps.FPAtomic)>.ret in + def _gfx90a : MUBUF_Real_gfx90a<op, ps>; + + foreach _ = BoolToList<ps.FPAtomic>.ret in { + def _gfx90a : MUBUF_Real_gfx90a<op, ps, 0> { + let SubtargetPredicate = isGFX90AOnly; + let AssemblerPredicate = isGFX90AOnly; + } + def _gfx940 : MUBUF_Real_gfx940<op, ps>; + } } multiclass MUBUF_Real_AllAddr_vi<bits<7> op> { @@ -2317,41 +2704,25 @@ multiclass MUBUF_Real_AllAddr_vi<bits<7> op> { multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> { - def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, - MUBUFLdsTable<0, NAME # "_OFFSET_vi">; - def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>, - MUBUFLdsTable<0, NAME # "_OFFEN_vi">; - def _IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>, - MUBUFLdsTable<0, NAME # "_IDXEN_vi">; - def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>, - MUBUFLdsTable<0, NAME # "_BOTHEN_vi">; + def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; + def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_vi : MUBUF_Real_vi <op, 
!cast<MUBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; - def _LDS_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>, - MUBUFLdsTable<1, NAME # "_OFFSET_vi">; - def _LDS_OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>, - MUBUFLdsTable<1, NAME # "_OFFEN_vi">; - def _LDS_IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>, - MUBUFLdsTable<1, NAME # "_IDXEN_vi">; - def _LDS_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, - MUBUFLdsTable<1, NAME # "_BOTHEN_vi">; + def _LDS_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>; + def _LDS_OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>; + def _LDS_IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>; + def _LDS_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>; - def _OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, - MUBUFLdsTable<0, NAME # "_OFFSET_gfx90a">; - def _OFFEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>, - MUBUFLdsTable<0, NAME # "_OFFEN_gfx90a">; - def _IDXEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>, - MUBUFLdsTable<0, NAME # "_IDXEN_gfx90a">; - def _BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>, - MUBUFLdsTable<0, NAME # "_BOTHEN_gfx90a">; + def _OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; + def _OFFEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; - def _LDS_OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>, - MUBUFLdsTable<1, NAME # "_OFFSET_gfx90a">; - def _LDS_OFFEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>, - MUBUFLdsTable<1, NAME # "_OFFEN_gfx90a">; - def _LDS_IDXEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>, - MUBUFLdsTable<1, NAME # "_IDXEN_gfx90a">; - def _LDS_BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, - MUBUFLdsTable<1, NAME # "_BOTHEN_gfx90a">; + def _LDS_OFFSET_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>; + def _LDS_OFFEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>; + def _LDS_IDXEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>; + def _LDS_BOTHEN_gfx90a : MUBUF_Real_gfx90a <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>; } class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> : @@ -2424,9 +2795,9 @@ defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_vi <0x11>; defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_vi <0x12>; defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_vi <0x13>; defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_vi <0x14>; -defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_Lds_vi <0x15>; -defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_Lds_vi <0x16>; -defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_Lds_vi <0x17>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>; +defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>; defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>; defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x19>; defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>; @@ -2481,12 
+2852,12 @@ def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>; def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; } // End AssemblerPredicate = isGFX8GFX9 -let SubtargetPredicate = HasAtomicFaddInsts in { +let SubtargetPredicate = HasAtomicFaddNoRtnInsts in { defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>; defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>; -} // End SubtargetPredicate = HasAtomicFaddInsts +} // End SubtargetPredicate = HasAtomicFaddNoRtnInsts let SubtargetPredicate = isGFX90APlus in { defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_vi<0x4f>; @@ -2495,9 +2866,17 @@ let SubtargetPredicate = isGFX90APlus in { } // End SubtargetPredicate = isGFX90APlus, AssemblerPredicate = isGFX90APlus def BUFFER_WBL2_gfx90a : MUBUF_Real_gfx90a<0x28, BUFFER_WBL2> { + let AsmString = BUFFER_WBL2.Mnemonic; // drop flags + let AssemblerPredicate = isGFX90AOnly; + let SubtargetPredicate = isGFX90AOnly; } def BUFFER_INVL2_gfx90a : MUBUF_Real_gfx90a<0x29, BUFFER_INVL2>; +let SubtargetPredicate = isGFX940Plus in { +def BUFFER_WBL2_gfx940 : MUBUF_Real_gfx940<0x28, BUFFER_WBL2>; +def BUFFER_INV_gfx940 : MUBUF_Real_gfx940<0x29, BUFFER_INV>; +} + class MTBUF_Real_Base_vi <bits<4> op, MTBUF_Pseudo ps, int Enc> : MTBUF_Real<ps>, Enc64, diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index c4043177b618..27b723875aa4 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -52,8 +52,8 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt let Uses = !if(has_m0_read, [M0, EXEC], [EXEC]); } -class DS_Real <DS_Pseudo ps> : - InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, +class DS_Real <DS_Pseudo ps, string opName = ps.Mnemonic> : + InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands>, Enc64 { let isPseudo = 0; @@ -72,6 +72,9 @@ class DS_Real <DS_Pseudo ps> : let IsAtomicRet = ps.IsAtomicRet; let IsAtomicNoRet = ps.IsAtomicNoRet; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + // encoding fields bits<10> vdst; bits<1> gds; @@ -172,6 +175,22 @@ multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterClass rc = VGPR_32> { } } +class DS_0A1D_RET_GDS<string opName, RegisterClass rc = VGPR_32, RegisterClass src = rc, + RegisterOperand dst_op = getLdStRegisterOperand<rc>.ret, + RegisterOperand src_op = getLdStRegisterOperand<src>.ret> +: DS_Pseudo<opName, + (outs dst_op:$vdst), + (ins src_op:$data0, offset:$offset), + " $vdst, $data0$offset gds"> { + + let has_addr = 0; + let has_data1 = 0; + let has_gds = 0; + let gdsValue = 1; + let AsmMatchConverter = "cvtDSGds"; + let hasSideEffects = 1; +} + class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32, RegisterOperand data_op = getLdStRegisterOperand<rc>.ret> : DS_Pseudo<opName, @@ -462,6 +481,22 @@ let SubtargetPredicate = isGFX90APlus in { defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64, "ds_add_f64">; } // End SubtargetPredicate = isGFX90APlus +let SubtargetPredicate = isGFX940Plus in { + defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">; + defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">; + defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">; + defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">; +} // End SubtargetPredicate = isGFX940Plus + +defm DS_CMPSTORE_B32 : 
DS_1A2D_NORET_mc<"ds_cmpstore_b32">; +defm DS_CMPSTORE_F32 : DS_1A2D_NORET_mc<"ds_cmpstore_f32">; +defm DS_CMPSTORE_B64 : DS_1A2D_NORET_mc<"ds_cmpstore_b64", VReg_64>; +defm DS_CMPSTORE_F64 : DS_1A2D_NORET_mc<"ds_cmpstore_f64", VReg_64>; +defm DS_CMPSTORE_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b32", VGPR_32, "ds_cmpstore_b32">; +defm DS_CMPSTORE_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f32", VGPR_32, "ds_cmpstore_f32">; +defm DS_CMPSTORE_RTN_B64 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_b64", VReg_64, "ds_cmpstore_b64">; +defm DS_CMPSTORE_RTN_F64 : DS_1A2D_RET_mc<"ds_cmpstore_rtn_f64", VReg_64, "ds_cmpstore_f64">; + defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">; defm DS_CMPST_B32 : DS_1A2D_NORET_mc<"ds_cmpst_b32">; defm DS_CMPST_F32 : DS_1A2D_NORET_mc<"ds_cmpst_f32">; @@ -619,6 +654,8 @@ def DS_READ_ADDTID_B32 : DS_0A_RET<"ds_read_addtid_b32">; def DS_CONSUME : DS_0A_RET<"ds_consume">; def DS_APPEND : DS_0A_RET<"ds_append">; + +let SubtargetPredicate = isNotGFX90APlus in def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">; //===----------------------------------------------------------------------===// @@ -667,6 +704,18 @@ let SubtargetPredicate = HasLDSFPAtomicAdd, OtherPredicates = [HasDsSrc2Insts] i def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; } + +//===----------------------------------------------------------------------===// +// Instruction definitions for GFX11 and newer. +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isGFX11Plus in { + +def DS_ADD_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_add_gs_reg_rtn", VReg_64, VGPR_32>; +def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>; + +} // let SubtargetPredicate = isGFX11Plus + //===----------------------------------------------------------------------===// // DS Patterns //===----------------------------------------------------------------------===// @@ -777,14 +826,14 @@ foreach vt = Reg32Types.types in { defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">; } -defm : DSAtomicWritePat_mc <DS_WRITE_B8, i16, "atomic_store_local_8">; -defm : DSAtomicWritePat_mc <DS_WRITE_B8, i32, "atomic_store_local_8">; -defm : DSAtomicWritePat_mc <DS_WRITE_B16, i16, "atomic_store_local_16">; -defm : DSAtomicWritePat_mc <DS_WRITE_B16, i32, "atomic_store_local_16">; -defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local_32">; -defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local_64">; +defm : DSAtomicWritePat_mc <DS_WRITE_B8, i16, "atomic_store_8_local">; +defm : DSAtomicWritePat_mc <DS_WRITE_B8, i32, "atomic_store_8_local">; +defm : DSAtomicWritePat_mc <DS_WRITE_B16, i16, "atomic_store_16_local">; +defm : DSAtomicWritePat_mc <DS_WRITE_B16, i32, "atomic_store_16_local">; +defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_32_local">; +defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_64_local">; -let OtherPredicates = [D16PreservesUnusedBits] in { +let OtherPredicates = [HasD16LoadStore] in { def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_hi16_local>; def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_hi16_local>; } @@ -870,15 +919,30 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">; let SubtargetPredicate = HasUnalignedAccessMode in { -// FIXME: From performance point of view, is ds_read_b96/ds_write_b96 better choice -// for unaligned accesses? 
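The comments added in the hunk just below describe the new policy for LDS loads and stores under HasUnalignedAccessMode. A condensed Python sketch of the 64-bit case follows; the tiers for the well-aligned paths are my assumption for context, and only the align < 4 branch restates what the patch actually adds:

```python
# Sketch of ds_read selection for a 64-bit LDS load, per the comments
# below. The align >= 8 and align >= 4 tiers are assumed; the align < 4
# branch is the new behavior: one misaligned b64 access is faster than
# the two misaligned b32 accesses ds_read2_b32 would otherwise make.
def select_ds_read64(align_bytes: int, unaligned_access_mode: bool) -> str:
    if align_bytes >= 8:
        return "ds_read_b64"      # naturally aligned single access
    if align_bytes >= 4:
        return "ds_read2_b32"     # two aligned 32-bit slots
    return "ds_read_b64" if unaligned_access_mode else "expand/split"

assert select_ds_read64(2, True) == "ds_read_b64"
assert select_ds_read64(4, True) == "ds_read2_b32"
```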
+// Select 64 bit loads and stores aligned less than 4 as a single ds_read_b64/ +// ds_write_b64 instruction as this is faster than ds_read2_b32/ds_write2_b32 +// which would be used otherwise. In this case a b32 access would still be +// misaligned, but we will have 2 of them. +foreach vt = VReg_64.RegTypes in { +defm : DSReadPat_mc <DS_READ_B64, vt, "load_align_less_than_4_local">; +defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align_less_than_4_local">; +} + +// Selection will split most of the unaligned 3 dword accesses due to performance +// reasons when beneficial. Keep these two patterns for the rest of the cases. foreach vt = VReg_96.RegTypes in { defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">; defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">; } -// For performance reasons, *do not* select ds_read_b128/ds_write_b128 for unaligned -// accesses. +// Select 128 bit loads and stores aligned less than 4 as a single ds_read_b128/ +// ds_write_b128 instruction as this is faster than ds_read2_b64/ds_write2_b64 +// which would be used otherwise. In this case a b64 access would still be +// misaligned, but we will have 2 of them. +foreach vt = VReg_128.RegTypes in { +defm : DSReadPat_mc <DS_READ_B128, vt, "load_align_less_than_4_local">; +defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">; +} } // End SubtargetPredicate = HasUnalignedAccessMode @@ -904,69 +968,143 @@ multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>; } +multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst, + ValueType vt, string frag> { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSAtomicRetPat<inst, vt, + !cast<PatFrag>(frag#"_local_m0_ret_"#vt.Size)>; + def : DSAtomicRetPat<noRetInst, vt, + !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size)>; + } + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, + !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>; + def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, + !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>; + } -class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < + def : DSAtomicRetPat<inst, vt, + !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>; + def : DSAtomicRetPat<noRetInst, vt, + !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>; +} + + + +let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { +// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode. 
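A minimal sketch of the two operand orders (illustrative assembly; register names assumed):

  // Pre-GFX11: the comparand travels in data0 and the new value in data1.
  ds_cmpst_rtn_b32     v0, v1, v_cmp, v_new
  // GFX11: the new value comes first, matching buffer_atomic_cmpswap.
  ds_cmpstore_rtn_b32  v0, v1, v_new, v_cmp

Accordingly, the "Swapped" pattern class below passes $cmp ahead of $swap, while the GFX11 class further down passes $swap ahead of $cmp.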
+class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap), (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds)) >; -multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> { +multiclass DSAtomicCmpXChgSwapped_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, + string frag> { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; + def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_ret_"#vt.Size)>; + def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size)>; } let OtherPredicates = [NotLDSRequiresM0Init] in { - def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_"#vt.Size)>; + def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, + !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>; + def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, + !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>; } - def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>; + def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>; + def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>; } +} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 +let SubtargetPredicate = isGFX11Plus in { +// The order of src and cmp agrees with the BUFFER_ATOMIC_CMPSWAP opcode. +class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < + (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap), + (inst $ptr, getVregSrcForVT<vt>.ret:$swap, getVregSrcForVT<vt>.ret:$cmp, offset:$offset, (i1 gds)) +>; +multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { + + def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, + !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>; + def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, + !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>; + + def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>; + def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>; +} +} // End SubtargetPredicate = isGFX11Plus // 32-bit atomics. 
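As a rough sketch of the expansion, the DS_ADD_RTN_U32 defm below, instantiated as DSAtomicRetNoRetPat_mc<DS_ADD_RTN_U32, DS_ADD_U32, i32, "atomic_load_add">, yields (among other subtarget variants) patterns of this form, with the PatFrag suffixes assembled by the multiclass above:

  def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local_m0_ret_32>;
  def : DSAtomicRetPat<DS_ADD_U32, i32, atomic_load_add_local_m0_noret_32>;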
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap">; -defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add">; -defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub">; -defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc">; -defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec">; -defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and">; -defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or">; -defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax">; -defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap">; +defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_U32, DS_ADD_U32, i32, "atomic_load_add">; +defm : DSAtomicRetNoRetPat_mc<DS_SUB_RTN_U32, DS_SUB_U32, i32, "atomic_load_sub">; +defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U32, DS_INC_U32, i32, "atomic_inc">; +defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U32, DS_DEC_U32, i32, "atomic_dec">; +defm : DSAtomicRetNoRetPat_mc<DS_AND_RTN_B32, DS_AND_B32, i32, "atomic_load_and">; +defm : DSAtomicRetNoRetPat_mc<DS_OR_RTN_B32, DS_OR_B32, i32, "atomic_load_or">; +defm : DSAtomicRetNoRetPat_mc<DS_XOR_RTN_B32, DS_XOR_B32, i32, "atomic_load_xor">; +defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_I32, DS_MIN_I32, i32, "atomic_load_min">; +defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_I32, DS_MAX_I32, i32, "atomic_load_max">; +defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_U32, DS_MIN_U32, i32, "atomic_load_umin">; +defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_U32, DS_MAX_U32, i32, "atomic_load_umax">; +defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_F32, DS_MIN_F32, f32, "atomic_load_fmin">; +defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_F32, DS_MAX_F32, f32, "atomic_load_fmax">; + +let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { +defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">; +} + +let SubtargetPredicate = isGFX11Plus in { +defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B32, DS_CMPSTORE_B32, i32, "atomic_cmp_swap">; +} let SubtargetPredicate = HasLDSFPAtomicAdd in { -defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd">; +defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_F32, DS_ADD_F32, f32, "atomic_load_fadd">; } // 64-bit atomics. 
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap">; -defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add">; -defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub">; -defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc">; -defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec">; -defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and">; -defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or">; -defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_F64, f64, "atomic_load_fmin">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_F64, f64, "atomic_load_fmax">; +defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_U64, DS_ADD_U64, i64, "atomic_load_add">; +defm : DSAtomicRetNoRetPat_mc<DS_SUB_RTN_U64, DS_SUB_U64, i64, "atomic_load_sub">; +defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U64, DS_INC_U64, i64, "atomic_inc">; +defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U64, DS_DEC_U64, i64, "atomic_dec">; +defm : DSAtomicRetNoRetPat_mc<DS_AND_RTN_B64, DS_AND_B64, i64, "atomic_load_and">; +defm : DSAtomicRetNoRetPat_mc<DS_OR_RTN_B64, DS_OR_B64, i64, "atomic_load_or">; +defm : DSAtomicRetNoRetPat_mc<DS_XOR_RTN_B64, DS_XOR_B64, i64, "atomic_load_xor">; +defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_I64, DS_MIN_I64, i64, "atomic_load_min">; +defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_I64, DS_MAX_I64, i64, "atomic_load_max">; +defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_U64, DS_MIN_U64, i64, "atomic_load_umin">; +defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_U64, DS_MAX_U64, i64, "atomic_load_umax">; +defm : DSAtomicRetNoRetPat_mc<DS_MIN_RTN_F64, DS_MIN_F64, f64, "atomic_load_fmin">; +defm : DSAtomicRetNoRetPat_mc<DS_MAX_RTN_F64, DS_MAX_F64, f64, "atomic_load_fmax">; + +let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { +defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B64, DS_CMPST_B64, i64, "atomic_cmp_swap">; +} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 -defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap">; +let SubtargetPredicate = isGFX11Plus in { +defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B64, DS_CMPSTORE_B64, i64, "atomic_cmp_swap">; +} // End SubtargetPredicate = isGFX11Plus let SubtargetPredicate = isGFX90APlus in { -def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>; +def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_ret_64>; +def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>; +} + +let SubtargetPredicate = isGFX940Plus in { +def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_ret_32>; +def : DSAtomicRetPat<DS_PK_ADD_F16, v2f16, atomic_load_fadd_v2f16_local_noret_32>; +def : GCNPat < + (v2i16 (int_amdgcn_ds_fadd_v2bf16_ret i32:$ptr, v2i16:$src)), + (DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) +>; +def : GCNPat < + (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)), + (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) +>; } def : Pat < @@ -974,16 +1112,44 @@ def : Pat < (DS_ORDERED_COUNT $value, (as_i16imm $offset)) >; +def : GCNPat < + (i64 (int_amdgcn_ds_add_gs_reg_rtn i32:$src, timm:$offset32)), + (DS_ADD_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)) +>; + +def : GCNPat < + (i32 (int_amdgcn_ds_add_gs_reg_rtn i32:$src, timm:$offset32)), + 
(EXTRACT_SUBREG + (i64 (COPY_TO_REGCLASS + (DS_ADD_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)), + VReg_64)), + sub0) +>; + +def : GCNPat < + (i64 (int_amdgcn_ds_sub_gs_reg_rtn i32:$src, timm:$offset32)), + (DS_SUB_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)) +>; + +def : GCNPat < + (i32 (int_amdgcn_ds_sub_gs_reg_rtn i32:$src, timm:$offset32)), + (EXTRACT_SUBREG + (i64 (COPY_TO_REGCLASS + (DS_SUB_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)), + VReg_64)), + sub0) +>; + //===----------------------------------------------------------------------===// // Target-specific instruction encodings. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Base ENC_DS for GFX6, GFX7, GFX10. +// Base ENC_DS for GFX6, GFX7, GFX10, GFX11. //===----------------------------------------------------------------------===// -class Base_DS_Real_gfx6_gfx7_gfx10<bits<8> op, DS_Pseudo ps, int ef> : - DS_Real<ps>, SIMCInstr <ps.Mnemonic, ef> { +class Base_DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op, DS_Pseudo ps, int ef, string opName = ps.Mnemonic> : + DS_Real<ps, opName>, SIMCInstr <ps.Mnemonic, ef> { let Inst{7-0} = !if(ps.has_offset0, offset0, 0); let Inst{15-8} = !if(ps.has_offset1, offset1, 0); @@ -997,19 +1163,89 @@ class Base_DS_Real_gfx6_gfx7_gfx10<bits<8> op, DS_Pseudo ps, int ef> : } //===----------------------------------------------------------------------===// +// GFX11. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" in { + multiclass DS_Real_gfx11<bits<8> op> { + def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME), + SIEncodingFamily.GFX11>; + } + + multiclass DS_Real_Renamed_gfx11<bits<8> op, DS_Pseudo backing_pseudo, string real_name> { + def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, backing_pseudo, SIEncodingFamily.GFX11, real_name>, + MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>; + } +} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" + +defm DS_STORE_B32 : DS_Real_Renamed_gfx11<0x00d, DS_WRITE_B32, "ds_store_b32">; +defm DS_STORE_2ADDR_B32 : DS_Real_Renamed_gfx11<0x00e, DS_WRITE2_B32, "ds_store_2addr_b32">; +defm DS_STORE_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11<0x00f, DS_WRITE2ST64_B32, "ds_store_2addr_stride64_b32">; +defm DS_STORE_B8 : DS_Real_Renamed_gfx11<0x01e, DS_WRITE_B8, "ds_store_b8">; +defm DS_STORE_B16 : DS_Real_Renamed_gfx11<0x01f, DS_WRITE_B16, "ds_store_b16">; +defm DS_STOREXCHG_RTN_B32 : DS_Real_Renamed_gfx11<0x02d, DS_WRXCHG_RTN_B32, "ds_storexchg_rtn_b32">; +defm DS_STOREXCHG_2ADDR_RTN_B32 : DS_Real_Renamed_gfx11<0x02e, DS_WRXCHG2_RTN_B32, "ds_storexchg_2addr_rtn_b32">; +defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32 : DS_Real_Renamed_gfx11<0x02f, DS_WRXCHG2ST64_RTN_B32, "ds_storexchg_2addr_stride64_rtn_b32">; +defm DS_LOAD_B32 : DS_Real_Renamed_gfx11<0x036, DS_READ_B32, "ds_load_b32">; +defm DS_LOAD_2ADDR_B32 : DS_Real_Renamed_gfx11<0x037, DS_READ2_B32, "ds_load_2addr_b32">; +defm DS_LOAD_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11<0x038, DS_READ2ST64_B32, "ds_load_2addr_stride64_b32">; +defm DS_LOAD_I8 : DS_Real_Renamed_gfx11<0x039, DS_READ_I8, "ds_load_i8">; +defm DS_LOAD_U8 : DS_Real_Renamed_gfx11<0x03a, DS_READ_U8, "ds_load_u8">; +defm DS_LOAD_I16 : DS_Real_Renamed_gfx11<0x03b, DS_READ_I16, "ds_load_i16">; +defm DS_LOAD_U16 : DS_Real_Renamed_gfx11<0x03c, DS_READ_U16, "ds_load_u16">; 
+defm DS_STORE_B64 : DS_Real_Renamed_gfx11<0x04d, DS_WRITE_B64, "ds_store_b64">;
+defm DS_STORE_2ADDR_B64 : DS_Real_Renamed_gfx11<0x04e, DS_WRITE2_B64, "ds_store_2addr_b64">;
+defm DS_STORE_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11<0x04f, DS_WRITE2ST64_B64, "ds_store_2addr_stride64_b64">;
+defm DS_STOREXCHG_RTN_B64 : DS_Real_Renamed_gfx11<0x06d, DS_WRXCHG_RTN_B64, "ds_storexchg_rtn_b64">;
+defm DS_STOREXCHG_2ADDR_RTN_B64 : DS_Real_Renamed_gfx11<0x06e, DS_WRXCHG2_RTN_B64, "ds_storexchg_2addr_rtn_b64">;
+defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64 : DS_Real_Renamed_gfx11<0x06f, DS_WRXCHG2ST64_RTN_B64, "ds_storexchg_2addr_stride64_rtn_b64">;
+defm DS_LOAD_B64 : DS_Real_Renamed_gfx11<0x076, DS_READ_B64, "ds_load_b64">;
+defm DS_LOAD_2ADDR_B64 : DS_Real_Renamed_gfx11<0x077, DS_READ2_B64, "ds_load_2addr_b64">;
+defm DS_LOAD_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11<0x078, DS_READ2ST64_B64, "ds_load_2addr_stride64_b64">;
+defm DS_STORE_B8_D16_HI : DS_Real_Renamed_gfx11<0x0a0, DS_WRITE_B8_D16_HI, "ds_store_b8_d16_hi">;
+defm DS_STORE_B16_D16_HI : DS_Real_Renamed_gfx11<0x0a1, DS_WRITE_B16_D16_HI, "ds_store_b16_d16_hi">;
+defm DS_LOAD_U8_D16 : DS_Real_Renamed_gfx11<0x0a2, DS_READ_U8_D16, "ds_load_u8_d16">;
+defm DS_LOAD_U8_D16_HI : DS_Real_Renamed_gfx11<0x0a3, DS_READ_U8_D16_HI, "ds_load_u8_d16_hi">;
+defm DS_LOAD_I8_D16 : DS_Real_Renamed_gfx11<0x0a4, DS_READ_I8_D16, "ds_load_i8_d16">;
+defm DS_LOAD_I8_D16_HI : DS_Real_Renamed_gfx11<0x0a5, DS_READ_I8_D16_HI, "ds_load_i8_d16_hi">;
+defm DS_LOAD_U16_D16 : DS_Real_Renamed_gfx11<0x0a6, DS_READ_U16_D16, "ds_load_u16_d16">;
+defm DS_LOAD_U16_D16_HI : DS_Real_Renamed_gfx11<0x0a7, DS_READ_U16_D16_HI, "ds_load_u16_d16_hi">;
+defm DS_STORE_ADDTID_B32 : DS_Real_Renamed_gfx11<0x0b0, DS_WRITE_ADDTID_B32, "ds_store_addtid_b32">;
+defm DS_LOAD_ADDTID_B32 : DS_Real_Renamed_gfx11<0x0b1, DS_READ_ADDTID_B32, "ds_load_addtid_b32">;
+defm DS_STORE_B96 : DS_Real_Renamed_gfx11<0x0de, DS_WRITE_B96, "ds_store_b96">;
+defm DS_STORE_B128 : DS_Real_Renamed_gfx11<0x0df, DS_WRITE_B128, "ds_store_b128">;
+defm DS_LOAD_B96 : DS_Real_Renamed_gfx11<0x0fe, DS_READ_B96, "ds_load_b96">;
+defm DS_LOAD_B128 : DS_Real_Renamed_gfx11<0x0ff, DS_READ_B128, "ds_load_b128">;
+
+// DS_CMPST_* are renamed to DS_CMPSTORE_* in GFX11, and the data operands
+// (src and cmp) are also swapped compared with pre-GFX11.
+// Note: no mnemonic alias is generated, to avoid a potential ambiguity due to
+// the change in semantics.
+
+defm DS_CMPSTORE_B32 : DS_Real_gfx11<0x010>;
+defm DS_CMPSTORE_F32 : DS_Real_gfx11<0x011>;
+defm DS_CMPSTORE_RTN_B32 : DS_Real_gfx11<0x030>;
+defm DS_CMPSTORE_RTN_F32 : DS_Real_gfx11<0x031>;
+defm DS_CMPSTORE_B64 : DS_Real_gfx11<0x050>;
+defm DS_CMPSTORE_F64 : DS_Real_gfx11<0x051>;
+defm DS_CMPSTORE_RTN_B64 : DS_Real_gfx11<0x070>;
+defm DS_CMPSTORE_RTN_F64 : DS_Real_gfx11<0x071>;
+
+defm DS_ADD_RTN_F32 : DS_Real_gfx11<0x079>;
+defm DS_ADD_GS_REG_RTN : DS_Real_gfx11<0x07a>;
+defm DS_SUB_GS_REG_RTN : DS_Real_gfx11<0x07b>;
+
+//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass DS_Real_gfx10<bits<8> op> { - def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME), + def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME), SIEncodingFamily.GFX10>; } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" -defm DS_ADD_F32 : DS_Real_gfx10<0x015>; defm DS_ADD_RTN_F32 : DS_Real_gfx10<0x055>; -defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>; defm DS_WRITE_B8_D16_HI : DS_Real_gfx10<0x0a0>; defm DS_WRITE_B16_D16_HI : DS_Real_gfx10<0x0a1>; defm DS_READ_U8_D16 : DS_Real_gfx10<0x0a2>; @@ -1020,95 +1256,118 @@ defm DS_READ_U16_D16 : DS_Real_gfx10<0x0a6>; defm DS_READ_U16_D16_HI : DS_Real_gfx10<0x0a7>; defm DS_WRITE_ADDTID_B32 : DS_Real_gfx10<0x0b0>; defm DS_READ_ADDTID_B32 : DS_Real_gfx10<0x0b1>; -defm DS_PERMUTE_B32 : DS_Real_gfx10<0x0b2>; -defm DS_BPERMUTE_B32 : DS_Real_gfx10<0x0b3>; //===----------------------------------------------------------------------===// -// GFX7, GFX10. +// GFX10, GFX11. +//===----------------------------------------------------------------------===// + +multiclass DS_Real_gfx10_gfx11<bits<8> op> : + DS_Real_gfx10<op>, DS_Real_gfx11<op>; + +defm DS_ADD_F32 : DS_Real_gfx10_gfx11<0x015>; +defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>; +defm DS_PERMUTE_B32 : DS_Real_gfx10_gfx11<0x0b2>; +defm DS_BPERMUTE_B32 : DS_Real_gfx10_gfx11<0x0b3>; + +//===----------------------------------------------------------------------===// +// GFX7, GFX10, GFX11. //===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in { multiclass DS_Real_gfx7<bits<8> op> { - def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME), + def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME), SIEncodingFamily.SI>; } } // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" +multiclass DS_Real_gfx7_gfx10_gfx11<bits<8> op> : + DS_Real_gfx7<op>, DS_Real_gfx10_gfx11<op>; + multiclass DS_Real_gfx7_gfx10<bits<8> op> : DS_Real_gfx7<op>, DS_Real_gfx10<op>; // FIXME-GFX7: Add tests when upstreaming this part. -defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10<0x018>; -defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10<0x034>; -defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10<0x07e>; +defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10_gfx11<0x018>; +defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10_gfx11<0x034>; +defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10_gfx11<0x07e>; defm DS_WRITE_B96 : DS_Real_gfx7_gfx10<0x0de>; defm DS_WRITE_B128 : DS_Real_gfx7_gfx10<0x0df>; defm DS_READ_B96 : DS_Real_gfx7_gfx10<0x0fe>; defm DS_READ_B128 : DS_Real_gfx7_gfx10<0x0ff>; //===----------------------------------------------------------------------===// -// GFX6, GFX7, GFX10. +// GFX6, GFX7, GFX10, GFX11. 
//===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { multiclass DS_Real_gfx6_gfx7<bits<8> op> { - def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10<op, !cast<DS_Pseudo>(NAME), + def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME), SIEncodingFamily.SI>; } } // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" +multiclass DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op> : + DS_Real_gfx6_gfx7<op>, DS_Real_gfx10_gfx11<op>; + multiclass DS_Real_gfx6_gfx7_gfx10<bits<8> op> : DS_Real_gfx6_gfx7<op>, DS_Real_gfx10<op>; -defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10<0x000>; -defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x001>; -defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10<0x002>; -defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10<0x003>; -defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10<0x004>; -defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10<0x005>; -defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10<0x006>; -defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10<0x007>; -defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10<0x008>; -defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10<0x009>; -defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00a>; -defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00b>; -defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10<0x00c>; +defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x000>; +defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x001>; +defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x002>; +defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x003>; +defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x004>; +defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x005>; +defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x006>; +defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x007>; +defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x008>; +defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x009>; +defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00a>; +defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00b>; +defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00c>; + defm DS_WRITE_B32 : DS_Real_gfx6_gfx7_gfx10<0x00d>; defm DS_WRITE2_B32 : DS_Real_gfx6_gfx7_gfx10<0x00e>; defm DS_WRITE2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x00f>; defm DS_CMPST_B32 : DS_Real_gfx6_gfx7_gfx10<0x010>; defm DS_CMPST_F32 : DS_Real_gfx6_gfx7_gfx10<0x011>; -defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10<0x012>; -defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10<0x013>; -defm DS_NOP : DS_Real_gfx6_gfx7_gfx10<0x014>; -defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10<0x019>; -defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10<0x01a>; -defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10<0x01b>; -defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10<0x01c>; -defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10<0x01d>; + +defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x012>; +defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x013>; +defm DS_NOP : DS_Real_gfx6_gfx7_gfx10_gfx11<0x014>; +defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x019>; +defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01a>; +defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01b>; +defm DS_GWS_SEMA_P : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01c>; +defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01d>; + defm DS_WRITE_B8 : DS_Real_gfx6_gfx7_gfx10<0x01e>; defm DS_WRITE_B16 : DS_Real_gfx6_gfx7_gfx10<0x01f>; -defm DS_ADD_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x020>; -defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x021>; -defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x022>; -defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x023>; -defm 
DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x024>; -defm DS_MIN_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x025>; -defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10<0x026>; -defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x027>; -defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10<0x028>; -defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x029>; -defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02a>; -defm DS_XOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02b>; -defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02c>; + +defm DS_ADD_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x020>; +defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x021>; +defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x022>; +defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x023>; +defm DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x024>; +defm DS_MIN_RTN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x025>; +defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x026>; +defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x027>; +defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x028>; +defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x029>; +defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02a>; +defm DS_XOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02b>; +defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02c>; + defm DS_WRXCHG_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02d>; defm DS_WRXCHG2_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02e>; defm DS_WRXCHG2ST64_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02f>; defm DS_CMPST_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x030>; defm DS_CMPST_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x031>; -defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x032>; -defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x033>; -defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10<0x035>; + +defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x032>; +defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x033>; +defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x035>; + defm DS_READ_B32 : DS_Real_gfx6_gfx7_gfx10<0x036>; defm DS_READ2_B32 : DS_Real_gfx6_gfx7_gfx10<0x037>; defm DS_READ2ST64_B32 : DS_Real_gfx6_gfx7_gfx10<0x038>; @@ -1116,49 +1375,55 @@ defm DS_READ_I8 : DS_Real_gfx6_gfx7_gfx10<0x039>; defm DS_READ_U8 : DS_Real_gfx6_gfx7_gfx10<0x03a>; defm DS_READ_I16 : DS_Real_gfx6_gfx7_gfx10<0x03b>; defm DS_READ_U16 : DS_Real_gfx6_gfx7_gfx10<0x03c>; -defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10<0x03d>; -defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10<0x03e>; -defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10<0x03f>; -defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10<0x040>; -defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x041>; -defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10<0x042>; -defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10<0x043>; -defm DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10<0x044>; -defm DS_MIN_I64 : DS_Real_gfx6_gfx7_gfx10<0x045>; -defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10<0x046>; -defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10<0x047>; -defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10<0x048>; -defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10<0x049>; -defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04a>; -defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04b>; -defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10<0x04c>; + +defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03d>; +defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03e>; +defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03f>; +defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x040>; +defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x041>; +defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x042>; +defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x043>; +defm 
DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x044>; +defm DS_MIN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x045>; +defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x046>; +defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x047>; +defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x048>; +defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x049>; +defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04a>; +defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04b>; +defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04c>; + defm DS_WRITE_B64 : DS_Real_gfx6_gfx7_gfx10<0x04d>; defm DS_WRITE2_B64 : DS_Real_gfx6_gfx7_gfx10<0x04e>; defm DS_WRITE2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x04f>; defm DS_CMPST_B64 : DS_Real_gfx6_gfx7_gfx10<0x050>; defm DS_CMPST_F64 : DS_Real_gfx6_gfx7_gfx10<0x051>; -defm DS_MIN_F64 : DS_Real_gfx6_gfx7_gfx10<0x052>; -defm DS_MAX_F64 : DS_Real_gfx6_gfx7_gfx10<0x053>; -defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x060>; -defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x061>; -defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x062>; -defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x063>; -defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x064>; -defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x065>; -defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10<0x066>; -defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x067>; -defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10<0x068>; -defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x069>; -defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06a>; -defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06b>; -defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06c>; + +defm DS_MIN_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x052>; +defm DS_MAX_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x053>; +defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x060>; +defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x061>; +defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x062>; +defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x063>; +defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x064>; +defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x065>; +defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x066>; +defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x067>; +defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x068>; +defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x069>; +defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06a>; +defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06b>; +defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06c>; + defm DS_WRXCHG_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06d>; defm DS_WRXCHG2_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06e>; defm DS_WRXCHG2ST64_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06f>; defm DS_CMPST_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x070>; defm DS_CMPST_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x071>; -defm DS_MIN_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x072>; -defm DS_MAX_RTN_F64 : DS_Real_gfx6_gfx7_gfx10<0x073>; + +defm DS_MIN_RTN_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x072>; +defm DS_MAX_RTN_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x073>; + defm DS_READ_B64 : DS_Real_gfx6_gfx7_gfx10<0x076>; defm DS_READ2_B64 : DS_Real_gfx6_gfx7_gfx10<0x077>; defm DS_READ2ST64_B64 : DS_Real_gfx6_gfx7_gfx10<0x078>; @@ -1381,3 +1646,10 @@ let SubtargetPredicate = isGFX90APlus in { def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>; def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>; } // End SubtargetPredicate = isGFX90APlus + +let SubtargetPredicate = isGFX940Plus in { + def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>; + def DS_PK_ADD_RTN_F16_vi : 
DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>; + def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>; + def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>; +} // End SubtargetPredicate = isGFX940Plus diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index e2186d4d533e..ccaf646008b1 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -18,15 +18,20 @@ #include "Disassembler/AMDGPUDisassembler.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "SIRegisterInfo.h" #include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm-c/DisassemblerTypes.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCFixedLenDisassembler.h" -#include "llvm/MC/TargetRegistry.h" #include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/TargetRegistry.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" using namespace llvm; @@ -70,7 +75,8 @@ static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op, } static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); // Our branches take a simm16, but we need two extra bits to account for the @@ -78,13 +84,13 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, APInt SignedOffset(18, Imm * 4, true); int64_t Offset = (SignedOffset.sext(64) + 4 + Addr).getSExtValue(); - if (DAsm->tryAddingSymbolicOperand(Inst, Offset, Addr, true, 2, 2)) + if (DAsm->tryAddingSymbolicOperand(Inst, Offset, Addr, true, 2, 2, 0)) return MCDisassembler::Success; return addOperand(Inst, MCOperand::createImm(Imm)); } -static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { +static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); int64_t Offset; if (DAsm->isVI()) { // VI supports 20-bit unsigned offsets. 
@@ -95,20 +101,19 @@ static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, return addOperand(Inst, MCOperand::createImm(Offset)); } -static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, - uint64_t Addr, const void *Decoder) { +static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr, + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeBoolReg(Val)); } -#define DECODE_OPERAND(StaticDecoderName, DecoderName) \ -static DecodeStatus StaticDecoderName(MCInst &Inst, \ - unsigned Imm, \ - uint64_t /*Addr*/, \ - const void *Decoder) { \ - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); \ - return addOperand(Inst, DAsm->DecoderName(Imm)); \ -} +#define DECODE_OPERAND(StaticDecoderName, DecoderName) \ + static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm, \ + uint64_t /*Addr*/, \ + const MCDisassembler *Decoder) { \ + auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); \ + return addOperand(Inst, DAsm->DecoderName(Imm)); \ + } #define DECODE_OPERAND_REG(RegClass) \ DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) @@ -144,155 +149,151 @@ DECODE_OPERAND_REG(AReg_512) DECODE_OPERAND_REG(AReg_1024) DECODE_OPERAND_REG(AV_32) DECODE_OPERAND_REG(AV_64) +DECODE_OPERAND_REG(AV_128) +DECODE_OPERAND_REG(AVDst_128) +DECODE_OPERAND_REG(AVDst_512) -static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm)); } -static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, - unsigned Imm, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, unsigned Imm, + uint64_t Addr, + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); } -static DecodeStatus decodeOperand_VSrcV232(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VSrcV232(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeOperand_VSrcV232(Imm)); } -static DecodeStatus decodeOperand_VS_16(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VS_16(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm)); } -static DecodeStatus decodeOperand_VS_32(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VS_32(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeOperand_VS_32(Imm)); } -static DecodeStatus decodeOperand_AReg_64(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_AReg_64(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm | 512)); } -static DecodeStatus decodeOperand_AReg_128(MCInst 
&Inst, - unsigned Imm, +static DecodeStatus decodeOperand_AReg_128(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm | 512)); } -static DecodeStatus decodeOperand_AReg_256(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_AReg_256(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm | 512)); } -static DecodeStatus decodeOperand_AReg_512(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_AReg_512(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm | 512)); } -static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm | 512)); } -static DecodeStatus decodeOperand_VReg_64(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VReg_64(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm)); } -static DecodeStatus decodeOperand_VReg_128(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VReg_128(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm)); } -static DecodeStatus decodeOperand_VReg_256(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VReg_256(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm)); } -static DecodeStatus decodeOperand_VReg_512(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VReg_512(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm)); } -static DecodeStatus decodeOperand_VReg_1024(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_VReg_1024(MCInst &Inst, unsigned Imm, uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm)); } static DecodeStatus decodeOperand_f32kimm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm)); } static DecodeStatus 
decodeOperand_f16kimm(MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { + uint64_t Addr, + const MCDisassembler *Decoder) { const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm)); } -static DecodeStatus decodeOperand_VS_16_Deferred(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +decodeOperand_VS_16_Deferred(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); return addOperand( Inst, DAsm->decodeSrcOp(llvm::AMDGPUDisassembler::OPW16, Imm, true)); } -static DecodeStatus decodeOperand_VS_32_Deferred(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +decodeOperand_VS_32_Deferred(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); return addOperand( Inst, DAsm->decodeSrcOp(llvm::AMDGPUDisassembler::OPW32, Imm, true)); } +static DecodeStatus decodeOperandVOPDDstY(MCInst &Inst, unsigned Val, + uint64_t Addr, const void *Decoder) { + const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); + return addOperand(Inst, DAsm->decodeVOPDDstYOp(Inst, Val)); +} + static bool IsAGPROperand(const MCInst &Inst, int OpIdx, const MCRegisterInfo *MRI) { if (OpIdx < 0) @@ -307,10 +308,9 @@ static bool IsAGPROperand(const MCInst &Inst, int OpIdx, return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255; } -static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm, AMDGPUDisassembler::OpWidthTy Opw, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); if (!DAsm->isGFX90A()) { Imm &= 511; @@ -342,54 +342,41 @@ static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256)); } -static DecodeStatus DecodeAVLdSt_32RegisterClass(MCInst &Inst, - unsigned Imm, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeAVLdSt_32RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW32, Decoder); } -static DecodeStatus DecodeAVLdSt_64RegisterClass(MCInst &Inst, - unsigned Imm, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeAVLdSt_64RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW64, Decoder); } -static DecodeStatus DecodeAVLdSt_96RegisterClass(MCInst &Inst, - unsigned Imm, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeAVLdSt_96RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW96, Decoder); } -static DecodeStatus DecodeAVLdSt_128RegisterClass(MCInst &Inst, - unsigned Imm, - uint64_t Addr, - const void *Decoder) { +static DecodeStatus +DecodeAVLdSt_128RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, + const MCDisassembler *Decoder) { return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW128, Decoder); } -static DecodeStatus decodeOperand_SReg_32(MCInst &Inst, - unsigned Imm, +static DecodeStatus decodeOperand_SReg_32(MCInst &Inst, unsigned Imm, 
uint64_t Addr, - const void *Decoder) { + const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); return addOperand(Inst, DAsm->decodeOperand_SReg_32(Imm)); } -static DecodeStatus decodeOperand_VGPR_32(MCInst &Inst, - unsigned Imm, - uint64_t Addr, - const void *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW32, Imm)); -} - #define DECODE_SDWA(DecName) \ DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName) @@ -410,21 +397,15 @@ template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) { return Res; } -DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table, - MCInst &MI, - uint64_t Inst, - uint64_t Address) const { - assert(MI.getOpcode() == 0); - assert(MI.getNumOperands() == 0); - MCInst TmpInst; - HasLiteral = false; - const auto SavedBytes = Bytes; - if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) { - MI = TmpInst; - return MCDisassembler::Success; - } - Bytes = SavedBytes; - return MCDisassembler::Fail; +static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) { + assert(Bytes.size() >= 12); + uint64_t Lo = support::endian::read<uint64_t, support::endianness::little>( + Bytes.data()); + Bytes = Bytes.slice(8); + uint64_t Hi = support::endian::read<uint32_t, support::endianness::little>( + Bytes.data()); + Bytes = Bytes.slice(4); + return DecoderUInt128(Lo, Hi); } // The disassembler is greedy, so we need to check FI operand value to @@ -457,6 +438,29 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2 // encodings + if (isGFX11Plus() && Bytes.size() >= 12 ) { + DecoderUInt128 DecW = eat12Bytes(Bytes); + Res = tryDecodeInst(DecoderTableDPP8GFX1196, MI, DecW, + Address); + if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + break; + MI = MCInst(); // clear + Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW, + Address); + if (Res) { + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) + convertVOP3PDPPInst(MI); + else if (AMDGPU::isVOPC64DPP(MI.getOpcode())) + convertVOPCDPPInst(MI); + break; + } + Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address); + if (Res) + break; + } + // Reinitialize Bytes + Bytes = Bytes_.slice(0, MaxInstBytesNum); + if (Bytes.size() >= 8) { const uint64_t QW = eatBytes<uint64_t>(Bytes); @@ -475,12 +479,23 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; + MI = MCInst(); // clear + Res = tryDecodeInst(DecoderTableDPP8GFX1164, MI, QW, Address); + if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + break; MI = MCInst(); // clear Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address); if (Res) break; + Res = tryDecodeInst(DecoderTableDPPGFX1164, MI, QW, Address); + if (Res) { + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) + convertVOPCDPPInst(MI); + break; + } + Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address); if (Res) { IsSDWA = true; break; } @@ -535,6 +550,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address); if (Res) break; + Res = tryDecodeInst(DecoderTableGFX1132, MI, DW, Address); + if (Res) break; + if (Bytes.size() < 4) break; const uint64_t QW = 
((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW; @@ -554,6 +572,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Res) break; Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address); + if (Res) break; + + Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address); + if (Res) + break; + + Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address); } while (false); if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || @@ -565,8 +590,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, MI.getOpcode() == AMDGPU::V_FMAC_F64_e64_gfx90a || MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi || MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx10 || + MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx11 || MI.getOpcode() == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 || - MI.getOpcode() == AMDGPU::V_FMAC_F16_e64_gfx10)) { + MI.getOpcode() == AMDGPU::V_FMAC_DX9_ZERO_F32_e64_gfx11 || + MI.getOpcode() == AMDGPU::V_FMAC_F16_e64_gfx10 || + MI.getOpcode() == AMDGPU::V_FMAC_F16_e64_gfx11)) { // Insert dummy unused src2_modifiers. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::src2_modifiers); @@ -625,8 +653,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = MCDisassembler::Fail; } else { for (unsigned i = 0; i < NSAArgs; ++i) { - MI.insert(MI.begin() + VAddr0Idx + 1 + i, - decodeOperand_VGPR_32(Bytes[i])); + const unsigned VAddrIdx = VAddr0Idx + 1 + i; + auto VAddrRCID = MCII->get(MI.getOpcode()).OpInfo[VAddrIdx].RegClass; + MI.insert(MI.begin() + VAddrIdx, + createRegOperand(VAddrRCID, Bytes[i])); } Bytes = Bytes.slice(4 * NSAWords); } @@ -636,6 +666,12 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = convertMIMGInst(MI); } + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP)) + Res = convertEXPInst(MI); + + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP)) + Res = convertVINTERPInst(MI); + if (Res && IsSDWA) Res = convertSDWAInst(MI); @@ -667,6 +703,28 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Res; } +DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const { + if (STI.getFeatureBits()[AMDGPU::FeatureGFX11]) { + // The MCInst still has these fields even though they are no longer encoded + // in the GFX11 instruction. + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm); + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::compr); + } + return MCDisassembler::Success; +} + +DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const { + if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx11 || + MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11) { + // The MCInst has this field that is not directly encoded in the + // instruction. 
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel); + } + return MCDisassembler::Success; +} + DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] || STI.getFeatureBits()[AMDGPU::FeatureGFX10]) { @@ -692,18 +750,23 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); unsigned DescNumOps = MCII->get(Opc).getNumOperands(); + if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) { + convertVOP3PDPPInst(MI); + } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) || + AMDGPU::isVOPC64DPP(Opc)) { + convertVOPCDPPInst(MI); + } else { + // Insert dummy unused src modifiers. + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src0_modifiers); - // Insert dummy unused src modifiers. - if (MI.getNumOperands() < DescNumOps && - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1) - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::src0_modifiers); - - if (MI.getNumOperands() < DescNumOps && - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1) - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::src1_modifiers); - + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src1_modifiers); + } return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail; } @@ -745,7 +808,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { bool IsNSA = false; unsigned AddrSize = Info->VAddrDwords; - if (STI.getFeatureBits()[AMDGPU::FeatureGFX10]) { + if (isGFX10Plus()) { unsigned DimIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim); int A16Idx = @@ -757,7 +820,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { AddrSize = AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, AMDGPU::hasG16(STI)); - IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA; + IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA || + Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA; if (!IsNSA) { if (AddrSize > 8) AddrSize = 16; @@ -808,9 +872,9 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { } } + // If not using NSA on GFX10+, widen address register to correct size. unsigned NewVAddr0 = AMDGPU::NoRegister; - if (STI.getFeatureBits()[AMDGPU::FeatureGFX10] && !IsNSA && - AddrSize != Info->VAddrDwords) { + if (isGFX10Plus() && !IsNSA && AddrSize != Info->VAddrDwords) { unsigned VAddr0 = MI.getOperand(VAddr0Idx).getReg(); unsigned VAddrSub0 = MRI.getSubReg(VAddr0, AMDGPU::sub0); VAddr0 = (VAddrSub0 != 0) ? VAddrSub0 : VAddr0; @@ -844,11 +908,84 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { return MCDisassembler::Success; } +// Opsel and neg bits are used in src_modifiers and standalone operands. Autogen +// decoder only adds to src_modifiers, so manually add the bits to the other +// operands. 
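// A worked sketch (bit values assumed): if OP_SEL_0 is set only in
// src1_modifiers, the loop below accumulates OpSel = 0b010, which becomes the
// standalone op_sel immediate; op_sel_hi, neg_lo and neg_hi are gathered the
// same way from the OP_SEL_1, NEG and NEG_HI bits.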
+DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const { + unsigned Opc = MI.getOpcode(); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vdst_in); + + const int ModOps[] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}; + unsigned OpSel = 0; + unsigned OpSelHi = 0; + unsigned NegLo = 0; + unsigned NegHi = 0; + for (int J = 0; J < 3; ++J) { + int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); + if (OpIdx == -1) + break; + unsigned Val = MI.getOperand(OpIdx).getImm(); + + OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J; + OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J; + NegLo |= !!(Val & SISrcMods::NEG) << J; + NegHi |= !!(Val & SISrcMods::NEG_HI) << J; + } + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(OpSel), + AMDGPU::OpName::op_sel); + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(OpSelHi), + AMDGPU::OpName::op_sel_hi); + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(NegLo), + AMDGPU::OpName::neg_lo); + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(NegHi), + AMDGPU::OpName::neg_hi); + + return MCDisassembler::Success; +} + +// Create dummy old operand and insert optional operands +DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const { + unsigned Opc = MI.getOpcode(); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::old) != -1) + insertNamedMCOperand(MI, MCOperand::createReg(0), AMDGPU::OpName::old); + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src0_modifiers); + + if (MI.getNumOperands() < DescNumOps && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src1_modifiers); + return MCDisassembler::Success; +} + DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI, int ImmLitIdx) const { assert(HasLiteral && "Should have decoded a literal"); const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); unsigned DescNumOps = Desc.getNumOperands(); + insertNamedMCOperand(MI, MCOperand::createImm(Literal), + AMDGPU::OpName::immDeferred); assert(DescNumOps == MI.getNumOperands()); for (unsigned I = 0; I < DescNumOps; ++I) { auto &Op = MI.getOperand(I); @@ -1001,6 +1138,22 @@ MCOperand AMDGPUDisassembler::decodeOperand_AV_64(unsigned Val) const { return decodeSrcOp(OPW64, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_AV_128(unsigned Val) const { + return decodeSrcOp(OPW128, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AVDst_128(unsigned Val) const { + using namespace AMDGPU::EncValues; + assert((Val & IS_VGPR) == 0); // Val{8} is not encoded but assumed to be 1. 
+  return decodeSrcOp(OPW128, Val | IS_VGPR);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_AVDst_512(unsigned Val) const {
+  using namespace AMDGPU::EncValues;
+  assert((Val & IS_VGPR) == 0); // Val{8} is not encoded but assumed to be 1.
+  return decodeSrcOp(OPW512, Val | IS_VGPR);
+}
+
 MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const {
   return createRegOperand(AMDGPU::VReg_64RegClassID, Val);
 }
@@ -1075,6 +1228,9 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const {
 MCOperand
 AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
   if (HasLiteral) {
+    assert(
+        AMDGPU::hasVOPD(STI) &&
+        "Should only decode multiple kimm with VOPD, check VSrc operand types");
     if (Literal != Val)
       return errOperand(Val, "More than one unique literal is illegal");
   }
@@ -1367,6 +1523,20 @@ MCOperand AMDGPUDisassembler::decodeDstOp(const OpWidthTy Width, unsigned Val) c
   llvm_unreachable("unknown dst register");
 }
 
+// Bit 0 of DstY isn't stored in the instruction, because it's always the
+// opposite of bit 0 of DstX.
+MCOperand AMDGPUDisassembler::decodeVOPDDstYOp(MCInst &Inst,
+                                               unsigned Val) const {
+  int VDstXInd =
+      AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdstX);
+  assert(VDstXInd != -1);
+  assert(Inst.getOperand(VDstXInd).isReg());
+  unsigned XDstReg = MRI.getEncodingValue(Inst.getOperand(VDstXInd).getReg());
+  Val |= ~XDstReg & 1;
+  auto Width = llvm::AMDGPUDisassembler::OPW32;
+  return createRegOperand(getVgprClassId(Width), Val);
+}
+
 MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
   using namespace AMDGPU;
 
@@ -1381,8 +1551,10 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
   case 109: return createRegOperand(TBA_HI);
   case 110: return createRegOperand(TMA_LO);
   case 111: return createRegOperand(TMA_HI);
-  case 124: return createRegOperand(M0);
-  case 125: return createRegOperand(SGPR_NULL);
+  case 124:
+    return isGFX11Plus() ? createRegOperand(SGPR_NULL) : createRegOperand(M0);
+  case 125:
+    return isGFX11Plus() ? createRegOperand(M0) : createRegOperand(SGPR_NULL);
   case 126: return createRegOperand(EXEC_LO);
   case 127: return createRegOperand(EXEC_HI);
   case 235: return createRegOperand(SRC_SHARED_BASE);
@@ -1408,7 +1580,14 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
   case 106: return createRegOperand(VCC);
   case 108: return createRegOperand(TBA);
   case 110: return createRegOperand(TMA);
-  case 125: return createRegOperand(SGPR_NULL);
+  case 124:
+    if (isGFX11Plus())
+      return createRegOperand(SGPR_NULL);
+    break;
+  case 125:
+    if (!isGFX11Plus())
+      return createRegOperand(SGPR_NULL);
+    break;
   case 126: return createRegOperand(EXEC);
   case 235: return createRegOperand(SRC_SHARED_BASE);
   case 236: return createRegOperand(SRC_SHARED_LIMIT);
@@ -1522,6 +1701,15 @@ bool AMDGPUDisassembler::isGFX10Plus() const {
   return AMDGPU::isGFX10Plus(STI);
 }
 
+bool AMDGPUDisassembler::isGFX11() const {
+  return STI.getFeatureBits()[AMDGPU::FeatureGFX11];
+}
+
+bool AMDGPUDisassembler::isGFX11Plus() const {
+  return AMDGPU::isGFX11Plus(STI);
+}
+
+
 bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
   return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
 }
@@ -1888,10 +2076,10 @@ AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
 //===----------------------------------------------------------------------===//
 
 // Try to find symbol name for specified label
-bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
-                                raw_ostream &/*cStream*/, int64_t Value,
-                                uint64_t /*Address*/, bool IsBranch,
-                                uint64_t /*Offset*/, uint64_t /*InstSize*/) {
+bool AMDGPUSymbolizer::tryAddingSymbolicOperand(
+    MCInst &Inst, raw_ostream & /*cStream*/, int64_t Value,
+    uint64_t /*Address*/, bool IsBranch, uint64_t /*Offset*/,
+    uint64_t /*OpSize*/, uint64_t /*InstSize*/) {
 
   if (!IsBranch) {
     return false;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index eea6074d5281..31869f0917ae 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -15,8 +15,10 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
 #define LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
 
+#include "llvm/ADT/APInt.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/Support/DataExtractor.h"
 
 #include <memory>
@@ -27,6 +29,60 @@ class MCOperand;
 class MCSubtargetInfo;
 class Twine;
 
+// Exposes an interface expected by autogenerated code in
+// FixedLenDecoderEmitter
+class DecoderUInt128 {
+private:
+  uint64_t Lo = 0;
+  uint64_t Hi = 0;
+
+public:
+  DecoderUInt128() = default;
+  DecoderUInt128(uint64_t Lo, uint64_t Hi = 0) : Lo(Lo), Hi(Hi) {}
+  operator bool() const { return Lo || Hi; }
+  void insertBits(uint64_t SubBits, unsigned BitPosition, unsigned NumBits) {
+    assert(NumBits && NumBits <= 64);
+    // The double shift avoids undefined behavior when NumBits == 64.
+    assert(SubBits >> 1 >> (NumBits - 1) == 0);
+    assert(BitPosition < 128);
+    if (BitPosition < 64) {
+      Lo |= SubBits << BitPosition;
+      Hi |= SubBits >> 1 >> (63 - BitPosition);
+    } else {
+      Hi |= SubBits << (BitPosition - 64);
+    }
+  }
+  uint64_t extractBitsAsZExtValue(unsigned NumBits,
+                                  unsigned BitPosition) const {
+    assert(NumBits && NumBits <= 64);
+    assert(BitPosition < 128);
+    uint64_t Val;
+    if (BitPosition < 64)
+      Val = Lo >> BitPosition | Hi << 1 << (63 - BitPosition);
+    else
+      Val = Hi >> (BitPosition - 64);
+    return Val & ((uint64_t(2) << (NumBits - 1)) - 1);
+  }
+  DecoderUInt128 operator&(const DecoderUInt128 &RHS) const {
+    return DecoderUInt128(Lo & RHS.Lo, Hi & RHS.Hi);
+  }
+  DecoderUInt128 operator&(const uint64_t &RHS) const {
+    return *this & DecoderUInt128(RHS);
+  }
+  DecoderUInt128 operator~() const { return DecoderUInt128(~Lo, ~Hi); }
+  bool operator==(const DecoderUInt128 &RHS) {
+    return Lo == RHS.Lo && Hi == RHS.Hi;
+  }
+  bool operator!=(const DecoderUInt128 &RHS) {
+    return Lo != RHS.Lo || Hi != RHS.Hi;
+  }
+  bool operator!=(const int &RHS) { return *this != DecoderUInt128(RHS); }
+  friend raw_ostream &operator<<(raw_ostream &OS, const DecoderUInt128 &RHS) {
+    return OS << APInt(128, {RHS.Lo, RHS.Hi});
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // AMDGPUDisassembler
 //===----------------------------------------------------------------------===//
@@ -57,8 +113,21 @@ public:
 
   MCOperand errOperand(unsigned V, const Twine& ErrMsg) const;
 
-  DecodeStatus tryDecodeInst(const uint8_t* Table, MCInst &MI, uint64_t Inst,
-                             uint64_t Address) const;
+  template <typename InsnType>
+  DecodeStatus tryDecodeInst(const uint8_t *Table, MCInst &MI, InsnType Inst,
+                             uint64_t Address) const {
+    assert(MI.getOpcode() == 0);
+    assert(MI.getNumOperands() == 0);
+    MCInst TmpInst;
+    HasLiteral = false;
+    const auto SavedBytes = Bytes;
+    if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) {
+      MI = TmpInst;
+      return MCDisassembler::Success;
+    }
+    Bytes = SavedBytes;
+    return MCDisassembler::Fail;
+  }
 
   Optional<DecodeStatus> onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
                                        ArrayRef<uint8_t> Bytes,
@@ -87,10 +156,14 @@ public:
   DecodeStatus decodeCOMPUTE_PGM_RSRC2(uint32_t FourByteBuffer,
                                        raw_string_ostream &KdStream) const;
 
+  DecodeStatus convertEXPInst(MCInst &MI) const;
+  DecodeStatus convertVINTERPInst(MCInst &MI) const;
   DecodeStatus convertFMAanyK(MCInst &MI, int ImmLitIdx) const;
   DecodeStatus convertSDWAInst(MCInst &MI) const;
   DecodeStatus convertDPP8Inst(MCInst &MI) const;
   DecodeStatus convertMIMGInst(MCInst &MI) const;
+  DecodeStatus convertVOP3PDPPInst(MCInst &MI) const;
+  DecodeStatus convertVOPCDPPInst(MCInst &MI) const;
 
   MCOperand decodeOperand_VGPR_32(unsigned Val) const;
   MCOperand decodeOperand_VRegOrLds_32(unsigned Val) const;
@@ -127,6 +200,9 @@ public:
   MCOperand decodeOperand_AReg_1024(unsigned Val) const;
   MCOperand decodeOperand_AV_32(unsigned Val) const;
   MCOperand decodeOperand_AV_64(unsigned Val) const;
+  MCOperand decodeOperand_AV_128(unsigned Val) const;
+  MCOperand decodeOperand_AVDst_128(unsigned Val) const;
+  MCOperand decodeOperand_AVDst_512(unsigned Val) const;
 
   enum OpWidthTy {
     OPW32,
@@ -157,6 +233,7 @@ public:
   MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val,
                         bool MandatoryLiteral = false) const;
   MCOperand decodeDstOp(const OpWidthTy Width, unsigned Val) const;
+  MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const;
   MCOperand decodeSpecialReg32(unsigned Val) const;
   MCOperand decodeSpecialReg64(unsigned Val) const;
 
@@ -177,6 +254,8 @@ public:
   bool isGFX9Plus() const;
   bool isGFX10() const;
   bool isGFX10Plus() const;
+  bool isGFX11() const;
+  bool isGFX11Plus() const;
 
   bool hasArchitectedFlatScratch() const;
 };
@@ -196,8 +275,8 @@ public:
       : MCSymbolizer(Ctx, std::move(RelInfo)), DisInfo(disInfo) {}
 
   bool tryAddingSymbolicOperand(MCInst &Inst, raw_ostream &cStream,
-                                int64_t Value, uint64_t Address,
-                                bool IsBranch, uint64_t Offset,
+                                int64_t Value, uint64_t Address, bool IsBranch,
+                                uint64_t Offset, uint64_t OpSize,
                                 uint64_t InstSize) override;
 
   void tryAddingPcLoadReferenceComment(raw_ostream &cStream,
diff --git a/llvm/lib/Target/AMDGPU/EXPInstructions.td b/llvm/lib/Target/AMDGPU/EXPInstructions.td
index b3b55ddd2c97..14ba01f0d67c 100644
--- a/llvm/lib/Target/AMDGPU/EXPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EXPInstructions.td
@@ -10,7 +10,7 @@
 // EXP classes
 //===----------------------------------------------------------------------===//
 
-class EXPCommon<bit done, string asm = ""> : InstSI<
+class EXPCommon<bit row, bit done, string asm = ""> : InstSI<
   (outs),
   (ins exp_tgt:$tgt,
        ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3,
@@ -21,21 +21,30 @@ class EXPCommon<bit done, string asm = ""> : InstSI<
   let mayLoad = done;
   let mayStore = 1;
   let UseNamedOperandTable = 1;
-  let Uses = [EXEC];
+  let Uses = !if(row, [EXEC, M0], [EXEC]);
   let SchedRW = [WriteExport];
   let DisableWQM = 1;
 }
 
-class EXP_Pseudo<bit done> : EXPCommon<done>,
-  SIMCInstr <NAME, SIEncodingFamily.NONE> {
+class EXP_Pseudo<bit row, bit done>
+  : EXPCommon<row, done>, SIMCInstr<NAME, SIEncodingFamily.NONE> {
   let isPseudo = 1;
   let isCodeGenOnly = 1;
 }
 
-class EXP_Real<bit done, string pseudo, int subtarget>
-  : EXPCommon<done, "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")
-              #"$compr$vm">,
-    SIMCInstr <pseudo, subtarget> {
+// Real instruction with optional asm operands "compr" and "vm".
+class EXP_Real_ComprVM<bit done, string pseudo, int subtarget>
+  : EXPCommon<0, done, "exp$tgt $src0, $src1, $src2, $src3"
+                       #!if(done, " done", "")#"$compr$vm">,
+    SIMCInstr<pseudo, subtarget> {
+  let AsmMatchConverter = "cvtExp";
+}
+
+// Real instruction with optional asm operand "row_en".
+class EXP_Real_Row<bit row, bit done, string pseudo, int subtarget>
+  : EXPCommon<row, done, "exp$tgt $src0, $src1, $src2, $src3"
+                         #!if(done, " done", "")#!if(row, " row_en", "")>,
+    SIMCInstr<pseudo, subtarget> {
   let AsmMatchConverter = "cvtExp";
 }
 
@@ -43,17 +52,21 @@ class EXP_Real<bit done, string pseudo, int subtarget>
 // EXP Instructions
 //===----------------------------------------------------------------------===//
 
-// Split EXP instruction into EXP and EXP_DONE so we can set
-// mayLoad for done=1.
-def EXP : EXP_Pseudo<0>;
-def EXP_DONE : EXP_Pseudo<1>;
+// DONE variants have mayLoad = 1.
+// ROW variants have an implicit use of M0.
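+// M0 carries the row index for row_en exports; it is matched as the last
+// operand of int_amdgcn_exp_row in the ExpRowPattern class below.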
+let SubtargetPredicate = isNotGFX90APlus in {
+def EXP : EXP_Pseudo<0, 0>;
+def EXP_DONE : EXP_Pseudo<0, 1>;
+def EXP_ROW : EXP_Pseudo<1, 0>;
+def EXP_ROW_DONE : EXP_Pseudo<1, 1>;
+} // let SubtargetPredicate = isNotGFX90APlus
 
 //===----------------------------------------------------------------------===//
 // SI
 //===----------------------------------------------------------------------===//
 
 class EXP_Real_si<bit _done, string pseudo>
-  : EXP_Real<_done, pseudo, SIEncodingFamily.SI>, EXPe {
+  : EXP_Real_ComprVM<_done, pseudo, SIEncodingFamily.SI>, EXPe_ComprVM {
   let AssemblerPredicate = isGFX6GFX7;
   let DecoderNamespace = "GFX6GFX7";
   let done = _done;
@@ -67,8 +80,9 @@ def EXP_DONE_si : EXP_Real_si<1, "EXP_DONE">;
 //===----------------------------------------------------------------------===//
 
 class EXP_Real_vi<bit _done, string pseudo>
-  : EXP_Real<_done, pseudo, SIEncodingFamily.VI>, EXPe_vi {
+  : EXP_Real_ComprVM<_done, pseudo, SIEncodingFamily.VI>, EXPe_vi {
   let AssemblerPredicate = isGFX8GFX9;
+  let SubtargetPredicate = isNotGFX90APlus;
   let DecoderNamespace = "GFX8";
   let done = _done;
 }
@@ -77,12 +91,12 @@ def EXP_vi : EXP_Real_vi<0, "EXP">;
 def EXP_DONE_vi : EXP_Real_vi<1, "EXP_DONE">;
 
 //===----------------------------------------------------------------------===//
-// GFX10+
+// GFX10
 //===----------------------------------------------------------------------===//
 
 class EXP_Real_gfx10<bit _done, string pseudo>
-  : EXP_Real<_done, pseudo, SIEncodingFamily.GFX10>, EXPe {
-  let AssemblerPredicate = isGFX10Plus;
+  : EXP_Real_ComprVM<_done, pseudo, SIEncodingFamily.GFX10>, EXPe_ComprVM {
+  let AssemblerPredicate = isGFX10Only;
   let DecoderNamespace = "GFX10";
   let done = _done;
 }
@@ -91,6 +105,23 @@ def EXP_gfx10 : EXP_Real_gfx10<0, "EXP">;
 def EXP_DONE_gfx10 : EXP_Real_gfx10<1, "EXP_DONE">;
 
 //===----------------------------------------------------------------------===//
+// GFX11+
+//===----------------------------------------------------------------------===//
+
+class EXP_Real_gfx11<bit _row, bit _done, string pseudo>
+  : EXP_Real_Row<_row, _done, pseudo, SIEncodingFamily.GFX11>, EXPe_Row {
+  let AssemblerPredicate = isGFX11Plus;
+  let DecoderNamespace = "GFX11";
+  let row = _row;
+  let done = _done;
+}
+
+def EXP_gfx11 : EXP_Real_gfx11<0, 0, "EXP">;
+def EXP_DONE_gfx11 : EXP_Real_gfx11<0, 1, "EXP_DONE">;
+def EXP_ROW_gfx11 : EXP_Real_gfx11<1, 0, "EXP_ROW">;
+def EXP_ROW_DONE_gfx11 : EXP_Real_gfx11<1, 1, "EXP_ROW_DONE">;
+
+//===----------------------------------------------------------------------===//
 // EXP Patterns
 //===----------------------------------------------------------------------===//
 
@@ -103,6 +134,15 @@ class ExpPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
         ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en)
 >;
 
+class ExpRowPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
+  (int_amdgcn_exp_row timm:$tgt, timm:$en,
+                      (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
+                      (vt ExpSrc2:$src2), (vt ExpSrc3:$src3),
+                      done_val, M0),
+  (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
+        ExpSrc2:$src2, ExpSrc3:$src3, 0, 0, timm:$en)
+>;
+
 class ExpComprPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
   (int_amdgcn_exp_compr timm:$tgt, timm:$en,
                         (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
@@ -119,6 +159,11 @@ def : ExpPattern<i32, EXP_DONE, -1>;
 def : ExpPattern<f32, EXP, 0>;
 def : ExpPattern<f32, EXP_DONE, -1>;
 
+def : ExpRowPattern<i32, EXP_ROW, 0>;
+def : ExpRowPattern<i32, EXP_ROW_DONE, -1>;
+def : ExpRowPattern<f32, EXP_ROW, 0>;
+def : ExpRowPattern<f32, EXP_ROW_DONE, -1>;
+
 def : ExpComprPattern<v2i16, EXP, 0>;
 def : ExpComprPattern<v2i16, EXP_DONE, -1>;
 def : ExpComprPattern<v2f16, EXP, 0>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c530d3cb49f0..cb2822818549 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -12,6 +12,7 @@ def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [SDNPWant
 
 def GlobalSAddr : ComplexPattern<iPTR, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>;
 def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>;
+def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [SDNPWantRoot], -10>;
 
 //===----------------------------------------------------------------------===//
 // FLAT classes
@@ -56,6 +57,9 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
   bits<1> dlcValue = 0;
   bits<1> has_sccb = 1;
   bits<1> sccbValue = 0;
+  bits<1> has_sve = 0; // Scratch VGPR Enable
+  bits<1> lds = 0;
+  bits<1> sve = 0;
 
   let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts,
     !if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace));
@@ -74,8 +78,8 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
   let FlatScratch = is_flat_scratch;
 }
 
-class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
-  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
+  InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands, []>,
   Enc64 {
 
   let isPseudo = 0;
@@ -96,6 +100,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
   let IsAtomicNoRet = ps.IsAtomicNoRet;
   let VM_CNT = ps.VM_CNT;
   let LGKM_CNT = ps.LGKM_CNT;
+  let VALU = ps.VALU;
 
   // encoding fields
   bits<8> vaddr;
@@ -106,7 +111,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
   bits<5> cpol;
 
   // Only valid on gfx9
-  bits<1> lds = 0; // XXX - What does this actually do?
+  bits<1> lds = ps.lds; // LDS DMA for global and scratch
 
   // Segment, 00=flat, 01=scratch, 10=global, 11=reserved
   bits<2> seg = !if(ps.is_flat_global, 0b10,
@@ -123,7 +128,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
 
   // Only valid on GFX9+
   let Inst{12-0} = offset;
-  let Inst{13} = lds;
+  let Inst{13} = !if(ps.has_sve, ps.sve, lds);
   let Inst{15-14} = seg;
 
   let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glcValue);
@@ -240,6 +245,35 @@ multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
   }
 }
 
+class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
+  opName,
+  (outs ),
+  !con(
+      !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
+      (ins flat_offset:$offset, CPol_0:$cpol)),
+  " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
+  let LGKM_CNT = 1;
+  let is_flat_global = 1;
+  let lds = 1;
+  let has_data = 0;
+  let has_vdst = 0;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let has_saddr = 1;
+  let enabled_saddr = EnableSaddr;
+  let VALU = 1;
+  let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
+  let Uses = [M0, EXEC];
+  let SchedRW = [WriteVMEM, WriteLDS];
+}
+
+multiclass FLAT_Global_Load_LDS_Pseudo<string opName> {
+  def "" : FLAT_Global_Load_LDS_Pseudo<opName>,
+    GlobalSaddrTable<0, opName>;
+  def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1>,
+    GlobalSaddrTable<1, opName>;
+}
+
 class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
                                        bit EnableSaddr = 0> : FLAT_Pseudo<
   opName,
@@ -273,16 +307,19 @@ class FlatScratchInst <string sv_op, string mode> {
 
 class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
                                 bit HasTiedOutput = 0,
                                 bit EnableSaddr = 0,
-                                bit EnableVaddr = !not(EnableSaddr)>
+                                bit EnableSVE = 0,
+                                bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr))>
   : FLAT_Pseudo<
   opName,
   (outs getLdStRegisterOperand<regClass>.ret:$vdst),
   !con(
-     !if(EnableSaddr,
-       (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
-       !if(EnableVaddr,
-         (ins VGPR_32:$vaddr, flat_offset:$offset),
-         (ins flat_offset:$offset))),
+     !if(EnableSVE,
+       (ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
+       !if(EnableSaddr,
+         (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset),
+         !if(EnableVaddr,
+           (ins VGPR_32:$vaddr, flat_offset:$offset),
+           (ins flat_offset:$offset)))),
     !if(HasTiedOutput, (ins CPol:$cpol, getLdStRegisterOperand<regClass>.ret:$vdst_in),
                        (ins CPol_0:$cpol))),
   " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
@@ -291,7 +328,9 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
   let has_saddr = 1;
   let enabled_saddr = EnableSaddr;
   let has_vaddr = EnableVaddr;
-  let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST"));
+  let has_sve = EnableSVE;
+  let sve = EnableVaddr;
+  let PseudoInstr = opName#!if(EnableSVE, "_SVS", !if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST")));
   let maybeAtomic = 1;
 
   let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
@@ -299,15 +338,18 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
 
 class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit EnableSaddr = 0,
-                                 bit EnableVaddr = !not(EnableSaddr),
+                                 bit EnableSVE = 0,
+                                 bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr)),
                                  RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret> : FLAT_Pseudo<
   opName,
   (outs),
-  !if(EnableSaddr,
-    (ins vdata_op:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol),
-    !if(EnableVaddr,
-      (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, CPol_0:$cpol),
-      (ins vdata_op:$vdata, flat_offset:$offset, CPol_0:$cpol))),
+  !if(EnableSVE,
+    (ins vdata_op:$vdata, VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol),
+    !if(EnableSaddr,
+      (ins vdata_op:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol_0:$cpol),
+      !if(EnableVaddr,
+        (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, CPol_0:$cpol),
+        (ins vdata_op:$vdata, flat_offset:$offset, CPol_0:$cpol)))),
   " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
   let mayLoad = 0;
   let mayStore = 1;
@@ -315,7 +357,9 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En
   let has_saddr = 1;
   let enabled_saddr = EnableSaddr;
   let has_vaddr = EnableVaddr;
-  let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST"));
+  let has_sve = EnableSVE;
+  let sve = EnableVaddr;
+  let PseudoInstr = opName#!if(EnableSVE, "_SVS", !if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST")));
   let maybeAtomic = 1;
 }
 
@@ -326,8 +370,12 @@ multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass, bit H
   def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1>,
                FlatScratchInst<opName, "SS">;
 
+  let SubtargetPredicate = HasFlatScratchSVSMode in
+  def _SVS : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1, 1>,
+             FlatScratchInst<opName, "SVS">;
+
   let SubtargetPredicate = HasFlatScratchSTMode in
-  def _ST : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 0, 0>,
+  def _ST : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 0, 0, 0>,
             FlatScratchInst<opName, "ST">;
   }
 }
@@ -339,12 +387,59 @@ multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> {
   def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>,
                FlatScratchInst<opName, "SS">;
 
+  let SubtargetPredicate = HasFlatScratchSVSMode in
+  def _SVS : FLAT_Scratch_Store_Pseudo<opName, regClass, 1, 1>,
+             FlatScratchInst<opName, "SVS">;
+
   let SubtargetPredicate = HasFlatScratchSTMode in
-  def _ST : FLAT_Scratch_Store_Pseudo<opName, regClass, 0, 0>,
+  def _ST : FLAT_Scratch_Store_Pseudo<opName, regClass, 0, 0, 0>,
            FlatScratchInst<opName, "ST">;
   }
 }
 
+class FLAT_Scratch_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0,
+                                    bit EnableSVE = 0,
+                                    bit EnableVaddr = !or(EnableSVE, !not(EnableSaddr))> : FLAT_Pseudo<
+  opName,
+  (outs ),
+  !if(EnableSVE,
+    (ins VGPR_32:$vaddr, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol),
+    !if(EnableSaddr,
+      (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, CPol:$cpol),
+      !if(EnableVaddr,
+        (ins VGPR_32:$vaddr, flat_offset:$offset, CPol:$cpol),
+        (ins flat_offset:$offset, CPol:$cpol)))),
+  " "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
+
+  let LGKM_CNT = 1;
+  let is_flat_scratch = 1;
+  let lds = 1;
+  let has_data = 0;
+  let has_vdst = 0;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let has_saddr = 1;
+  let enabled_saddr = EnableSaddr;
+  let has_vaddr = EnableVaddr;
+  let has_sve = EnableSVE;
+  let sve = EnableVaddr;
+  let VALU = 1;
+  let PseudoInstr = opName#!if(EnableSVE, "_SVS", !if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST")));
+  let Uses = [M0, EXEC];
+  let SchedRW = [WriteVMEM, WriteLDS];
+}
+
+multiclass FLAT_Scratch_Load_LDS_Pseudo<string opName> {
+  def "" : FLAT_Scratch_Load_LDS_Pseudo<opName>,
+           FlatScratchInst<opName, "SV">;
+  def _SADDR : FLAT_Scratch_Load_LDS_Pseudo<opName, 1>,
+               FlatScratchInst<opName, "SS">;
+  def _SVS : FLAT_Scratch_Load_LDS_Pseudo<opName, 1, 1>,
+             FlatScratchInst<opName, "SVS">;
+  def _ST : FLAT_Scratch_Load_LDS_Pseudo<opName, 0, 0, 0>,
+            FlatScratchInst<opName, "ST">;
+}
+
 class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins,
                               string asm, list<dag> pattern = []> :
   FLAT_Pseudo<opName, outs, ins, asm, pattern> {
@@ -375,7 +470,6 @@ multiclass FLAT_Atomic_Pseudo<
   string opName,
   RegisterClass vdst_rc,
   ValueType vt,
-  SDPatternOperator atomic = null_frag,
   ValueType data_vt = vt,
   RegisterClass data_rc = vdst_rc,
   bit isFP = isFloatType<data_vt>.ret,
@@ -394,11 +488,9 @@ multiclass FLAT_Atomic_Pseudo<
   def _RTN : FLAT_AtomicRet_Pseudo <opName,
     (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst),
     (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
-    " $vdst, $vaddr, $vdata$offset$cpol",
-    [(set vt:$vdst,
-      (atomic (FlatOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
-       GlobalSaddrTable<0, opName#"_rtn">,
-       AtomicNoRet <opName, 1>{
+    " $vdst, $vaddr, $vdata$offset$cpol">,
+    GlobalSaddrTable<0, opName#"_rtn">,
+    AtomicNoRet <opName, 1> {
     let FPAtomic = isFP;
     let AddedComplexity = -1; // Prefer global atomics if available
   }
@@ -441,7 +533,6 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
   string opName,
   RegisterClass vdst_rc,
   ValueType vt,
-  SDPatternOperator atomic = null_frag,
   ValueType data_vt = vt,
   RegisterClass data_rc = vdst_rc,
   bit isFP = isFloatType<data_vt>.ret,
@@ -451,11 +542,9 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
   def _RTN : FLAT_AtomicRet_Pseudo <opName,
     (outs vdst_op:$vdst),
       (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
-    " $vdst, $vaddr, $vdata, off$offset$cpol",
-    [(set vt:$vdst,
-      (atomic (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>,
-      GlobalSaddrTable<0, opName#"_rtn">,
-      AtomicNoRet <opName, 1> {
+    " $vdst, $vaddr, $vdata, off$offset$cpol">,
+    GlobalSaddrTable<0, opName#"_rtn">,
+    AtomicNoRet <opName, 1> {
     let has_saddr = 1;
     let FPAtomic = isFP;
   }
@@ -477,12 +566,11 @@ multiclass FLAT_Global_Atomic_Pseudo<
   string opName,
   RegisterClass vdst_rc,
   ValueType vt,
-  SDPatternOperator atomic_rtn = null_frag,
   ValueType data_vt = vt,
   RegisterClass data_rc = vdst_rc> {
   let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
     defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
-    defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic_rtn, data_vt, data_rc>;
+    defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
  }
 }
 
@@ -519,99 +607,88 @@ def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR
 
 defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap",
-                                VGPR_32, i32, AMDGPUatomic_cmp_swap_flat_32,
-                                v2i32, VReg_64>;
+                                VGPR_32, i32, v2i32, VReg_64>;
 
 defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap_x2",
-                                VReg_64, i64, AMDGPUatomic_cmp_swap_flat_64,
-                                v2i64, VReg_128>;
+                                VReg_64, i64, v2i64, VReg_128>;
 
 defm FLAT_ATOMIC_SWAP : FLAT_Atomic_Pseudo <"flat_atomic_swap",
-                                VGPR_32, i32, atomic_swap_flat_32>;
+                                VGPR_32, i32>;
 
 defm FLAT_ATOMIC_SWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_swap_x2",
-                                VReg_64, i64, atomic_swap_flat_64>;
+                                VReg_64, i64>;
 
 defm FLAT_ATOMIC_ADD : FLAT_Atomic_Pseudo <"flat_atomic_add",
-                                VGPR_32, i32, atomic_load_add_flat_32>;
+                                VGPR_32, i32>;
 
 defm FLAT_ATOMIC_SUB : FLAT_Atomic_Pseudo <"flat_atomic_sub",
-                                VGPR_32, i32, atomic_load_sub_flat_32>;
+                                VGPR_32, i32>;
 
<"flat_atomic_smin", - VGPR_32, i32, atomic_load_min_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_UMIN : FLAT_Atomic_Pseudo <"flat_atomic_umin", - VGPR_32, i32, atomic_load_umin_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_SMAX : FLAT_Atomic_Pseudo <"flat_atomic_smax", - VGPR_32, i32, atomic_load_max_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_UMAX : FLAT_Atomic_Pseudo <"flat_atomic_umax", - VGPR_32, i32, atomic_load_umax_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_AND : FLAT_Atomic_Pseudo <"flat_atomic_and", - VGPR_32, i32, atomic_load_and_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_OR : FLAT_Atomic_Pseudo <"flat_atomic_or", - VGPR_32, i32, atomic_load_or_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_XOR : FLAT_Atomic_Pseudo <"flat_atomic_xor", - VGPR_32, i32, atomic_load_xor_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_INC : FLAT_Atomic_Pseudo <"flat_atomic_inc", - VGPR_32, i32, atomic_inc_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_DEC : FLAT_Atomic_Pseudo <"flat_atomic_dec", - VGPR_32, i32, atomic_dec_flat_32>; + VGPR_32, i32>; defm FLAT_ATOMIC_ADD_X2 : FLAT_Atomic_Pseudo <"flat_atomic_add_x2", - VReg_64, i64, atomic_load_add_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_SUB_X2 : FLAT_Atomic_Pseudo <"flat_atomic_sub_x2", - VReg_64, i64, atomic_load_sub_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_SMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smin_x2", - VReg_64, i64, atomic_load_min_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_UMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umin_x2", - VReg_64, i64, atomic_load_umin_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_SMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_smax_x2", - VReg_64, i64, atomic_load_max_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_UMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_umax_x2", - VReg_64, i64, atomic_load_umax_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_AND_X2 : FLAT_Atomic_Pseudo <"flat_atomic_and_x2", - VReg_64, i64, atomic_load_and_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_OR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_or_x2", - VReg_64, i64, atomic_load_or_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_XOR_X2 : FLAT_Atomic_Pseudo <"flat_atomic_xor_x2", - VReg_64, i64, atomic_load_xor_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_INC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_inc_x2", - VReg_64, i64, atomic_inc_flat_64>; + VReg_64, i64>; defm FLAT_ATOMIC_DEC_X2 : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2", - VReg_64, i64, atomic_dec_flat_64>; + VReg_64, i64>; // GFX7-, GFX10-only flat instructions. 
 let SubtargetPredicate = isGFX7GFX10 in {
 
-defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap",
-                                VGPR_32, f32, null_frag, v2f32, VReg_64>;
-
 defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2",
-                                VReg_64, f64, null_frag, v2f64, VReg_128>;
-
-defm FLAT_ATOMIC_FMIN : FLAT_Atomic_Pseudo <"flat_atomic_fmin",
-                                VGPR_32, f32>;
-
-defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax",
-                                VGPR_32, f32>;
+                                VReg_64, f64, v2f64, VReg_128>;
 
 defm FLAT_ATOMIC_FMIN_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2",
                                 VReg_64, f64>;
@@ -622,14 +699,39 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
 
 } // End SubtargetPredicate = isGFX7GFX10
 
 let SubtargetPredicate = isGFX90APlus in {
-  defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64, int_amdgcn_flat_atomic_fadd>;
-  defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmin>;
-  defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmax>;
-  defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
-  defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
-  defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
+  defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>;
+  defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64>;
+  defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64>;
+  defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>;
+  defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
+  defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
 } // End SubtargetPredicate = isGFX90APlus
 
+let SubtargetPredicate = isGFX940Plus in {
+  defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16>;
+  defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2f16>;
+  defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2f16>;
+} // End SubtargetPredicate = isGFX940Plus
+
+// GFX7-, GFX10-, GFX11-only flat instructions.
+let SubtargetPredicate = isGFX7GFX10GFX11 in {
+
+defm FLAT_ATOMIC_FCMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap",
+                                VGPR_32, f32, v2f32, VReg_64>;
+
+defm FLAT_ATOMIC_FMIN : FLAT_Atomic_Pseudo <"flat_atomic_fmin",
+                                VGPR_32, f32>;
+
+defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax",
+                                VGPR_32, f32>;
+
+} // End SubtargetPredicate = isGFX7GFX10GFX11
+
+// GFX940-, GFX11-only flat instructions.
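+// Currently this is just flat_atomic_add_f32.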
+let SubtargetPredicate = isGFX940GFX11Plus in {
+  defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>;
+} // End SubtargetPredicate = isGFX940GFX11Plus
+
 defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
 defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
 defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
@@ -662,88 +764,93 @@ defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d
 
 let is_flat_global = 1 in {
 defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap",
-                               VGPR_32, i32, AMDGPUatomic_cmp_swap_global_32,
-                               v2i32, VReg_64>;
+                               VGPR_32, i32, v2i32, VReg_64>;
 
 defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap_x2",
-                                  VReg_64, i64, AMDGPUatomic_cmp_swap_global_64,
-                                  v2i64, VReg_128>;
+                                  VReg_64, i64, v2i64, VReg_128>;
 
 defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap",
-                            VGPR_32, i32, atomic_swap_global_32>;
+                            VGPR_32, i32>;
 
 defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_swap_x2",
-                               VReg_64, i64, atomic_swap_global_64>;
+                               VReg_64, i64>;
 
 defm GLOBAL_ATOMIC_ADD : FLAT_Global_Atomic_Pseudo <"global_atomic_add",
-                           VGPR_32, i32, atomic_load_add_global_32>;
+                           VGPR_32, i32>;
 
 defm GLOBAL_ATOMIC_SUB : FLAT_Global_Atomic_Pseudo <"global_atomic_sub",
-                           VGPR_32, i32, atomic_load_sub_global_32>;
+                           VGPR_32, i32>;
 
 defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_smin",
-                            VGPR_32, i32, atomic_load_min_global_32>;
+                            VGPR_32, i32>;
 
 defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_umin",
-                            VGPR_32, i32, atomic_load_umin_global_32>;
+                            VGPR_32, i32>;
 
 defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_smax",
-                            VGPR_32, i32, atomic_load_max_global_32>;
+                            VGPR_32, i32>;
 
 defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_umax",
-                            VGPR_32, i32, atomic_load_umax_global_32>;
+                            VGPR_32, i32>;
 
 defm GLOBAL_ATOMIC_AND : FLAT_Global_Atomic_Pseudo <"global_atomic_and",
-                           VGPR_32, i32, atomic_load_and_global_32>;
+                           VGPR_32, i32>;
 
 defm GLOBAL_ATOMIC_OR : FLAT_Global_Atomic_Pseudo <"global_atomic_or",
-                          VGPR_32, i32, atomic_load_or_global_32>;
+                          VGPR_32, i32>;
 
 defm GLOBAL_ATOMIC_XOR : FLAT_Global_Atomic_Pseudo <"global_atomic_xor",
-                           VGPR_32, i32, atomic_load_xor_global_32>;
+                           VGPR_32, i32>;
 
 defm GLOBAL_ATOMIC_INC : FLAT_Global_Atomic_Pseudo <"global_atomic_inc",
-                           VGPR_32, i32, atomic_inc_global_32>;
+                           VGPR_32, i32>;
 
 defm GLOBAL_ATOMIC_DEC : FLAT_Global_Atomic_Pseudo <"global_atomic_dec",
-                           VGPR_32, i32, atomic_dec_global_32>;
+                           VGPR_32, i32>;
 
 defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_add_x2",
-                              VReg_64, i64, atomic_load_add_global_64>;
+                              VReg_64, i64>;
 
 defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_sub_x2",
-                              VReg_64, i64, atomic_load_sub_global_64>;
+                              VReg_64, i64>;
 
 defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smin_x2",
-                               VReg_64, i64, atomic_load_min_global_64>;
+                               VReg_64, i64>;
 
 defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umin_x2",
-                               VReg_64, i64, atomic_load_umin_global_64>;
+                               VReg_64, i64>;
 
 defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smax_x2",
-                               VReg_64, i64, atomic_load_max_global_64>;
+                               VReg_64, i64>;
 
 defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umax_x2",
-                               VReg_64, i64, atomic_load_umax_global_64>;
+                               VReg_64, i64>;
 
FLAT_Global_Atomic_Pseudo <"global_atomic_and_x2", - VReg_64, i64, atomic_load_and_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_or_x2", - VReg_64, i64, atomic_load_or_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_xor_x2", - VReg_64, i64, atomic_load_xor_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2", - VReg_64, i64, atomic_inc_global_64>; + VReg_64, i64>; defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2", - VReg_64, i64, atomic_dec_global_64>; + VReg_64, i64>; let SubtargetPredicate = HasGFX10_BEncoding in defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo_RTN <"global_atomic_csub", - VGPR_32, i32, int_amdgcn_global_atomic_csub>; + VGPR_32, i32>; + +defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte">; +defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sbyte">; +defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ushort">; +defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">; +defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">; + } // End is_flat_global = 1 @@ -775,41 +882,46 @@ defm SCRATCH_STORE_DWORDX4 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx4", defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_byte_d16_hi", VGPR_32>; defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_short_d16_hi", VGPR_32>; +defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_ubyte">; +defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sbyte">; +defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_ushort">; +defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">; +defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">; + } // End SubtargetPredicate = HasFlatScratchInsts let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in { defm GLOBAL_ATOMIC_FCMPSWAP : - FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, null_frag, v2f32, VReg_64>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>; defm GLOBAL_ATOMIC_FMIN : - FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32, int_amdgcn_global_atomic_fmin>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>; defm GLOBAL_ATOMIC_FMAX : - FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32, int_amdgcn_global_atomic_fmax>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>; defm GLOBAL_ATOMIC_FCMPSWAP_X2 : - FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, null_frag, v2f64, VReg_128>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, v2f64, VReg_128>; defm GLOBAL_ATOMIC_FMIN_X2 : - FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64, int_amdgcn_global_atomic_fmin>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>; defm GLOBAL_ATOMIC_FMAX_X2 : - FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64, int_amdgcn_global_atomic_fmax>; + FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>; } // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1 let is_flat_global = 1 in { -let OtherPredicates = [HasAtomicFaddInsts] in { +let OtherPredicates = 
+let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
 defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
   "global_atomic_add_f32", VGPR_32, f32
 >;
+let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in
 defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
   "global_atomic_pk_add_f16", VGPR_32, v2f16
 >;
-} // End OtherPredicates = [HasAtomicFaddInsts]
-
-let OtherPredicates = [isGFX90APlus] in {
+let OtherPredicates = [HasAtomicFaddRtnInsts] in
 defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN <
-  "global_atomic_add_f32", VGPR_32, f32, int_amdgcn_global_atomic_fadd
+  "global_atomic_add_f32", VGPR_32, f32
 >;
+let OtherPredicates = [isGFX90APlus] in
 defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
-  "global_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_global_atomic_fadd
+  "global_atomic_pk_add_f16", VGPR_32, v2f16
 >;
-} // End OtherPredicates = [isGFX90APlus]
 } // End is_flat_global = 1
 
 //===----------------------------------------------------------------------===//
@@ -896,24 +1008,47 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
   (inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
 >;
 
-class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
-                     ValueType data_vt = vt> : GCNPat <
-  (vt (node (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)),
-  (inst $vaddr, $data, $offset)
->;
-
 class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node (FlatOffset i64:$vaddr, i16:$offset), vt:$data),
   (inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
 
+multiclass FlatAtomicPat <string inst, string node, ValueType vt,
+                          ValueType data_vt = vt> {
+  defvar rtnNode = !cast<PatFrags>(node#"_ret_"#vt.Size);
+  defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size);
+
+  def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+    (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+
+  def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+    (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+}
+
+multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
+                                ValueType data_vt = vt, bit isIntr = 0> {
+  defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
+  defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
+
+  def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+    (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+
+  def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+    (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+}
+
+multiclass FlatSignedAtomicIntrPat <string inst, string node, ValueType vt,
+                                    ValueType data_vt = vt> {
+  defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* isIntr */ 1>;
+}
+
 class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node (GlobalOffset i64:$vaddr, i16:$offset), vt:$data),
   (inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
 >;
 
-class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
-                           ValueType data_vt = vt> : GCNPat <
+class FlatSignedAtomicPatRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
+                              ValueType data_vt = vt> : GCNPat <
   (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
   (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
 >;
 
@@ -949,8 +1084,28 @@ class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
   (inst getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
 >;
 
+class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset))),
+  (inst $vaddr, $saddr, $offset, 0)
+>;
+
+class ScratchStoreSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
+                             ValueType vt> : GCNPat <
+  (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset)),
+  (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset)
+>;
+
+class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset), vt:$in)),
+  (inst $vaddr, $saddr, $offset, 0, $in)
+>;
+
 let OtherPredicates = [HasFlatAddressSpace] in {
 
+def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_8_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_8_flat, i16>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_16_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_16_flat, i16>;
 def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
 def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
 def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
@@ -986,44 +1141,52 @@ def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, vt>;
 def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>;
 }
 
-def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat_32, i32>;
-def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat_64, i64>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
+def : FlatStoreAtomicPat <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
+def : FlatStoreAtomicPat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
+def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
+def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
 
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_load_add_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_load_sub_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_load_and_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_load_max_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_load_umax_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_load_min_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_load_umin_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global_32, i32, v2i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>;
+foreach as = [ "flat", "global" ] in {
+defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_SUB", "atomic_load_sub_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_INC", "atomic_inc_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_DEC", "atomic_dec_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_AND", "atomic_load_and_"#as, i32>;
<"FLAT_ATOMIC_SMAX", "atomic_load_max_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_UMAX", "atomic_load_umax_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_SMIN", "atomic_load_min_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_UMIN", "atomic_load_umin_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_OR", "atomic_load_or_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_SWAP", "atomic_swap_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_"#as, i32, v2i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_XOR", "atomic_load_xor_"#as, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global_64, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global_64, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_load_and_global_64, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_load_max_global_64, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_load_umax_global_64, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_load_min_global_64, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_load_umin_global_64, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_load_or_global_64, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global_64, i64>; -def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global_64, i64, v2i64>; -def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_load_xor_global_64, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_X2", "atomic_load_add_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_SUB_X2", "atomic_load_sub_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_INC_X2", "atomic_inc_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_DEC_X2", "atomic_dec_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_AND_X2", "atomic_load_and_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_SMAX_X2", "atomic_load_max_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_UMAX_X2", "atomic_load_umax_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_SMIN_X2", "atomic_load_min_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_UMIN_X2", "atomic_load_umin_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_OR_X2", "atomic_load_or_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_SWAP_X2", "atomic_swap_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_"#as, i64, v2i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_XOR_X2", "atomic_load_xor_"#as, i64>; +} // end foreach as def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>; def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>; -let OtherPredicates = [D16PreservesUnusedBits] in { +let OtherPredicates = [HasD16LoadStore] in { def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; +} +let OtherPredicates = [D16PreservesUnusedBits] in { def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>; def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>; def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>; @@ -1084,9 +1247,9 @@ multiclass GlobalFLATAtomicStorePats<FLAT_Pseudo inst, SDPatternOperator node, V } } -multiclass GlobalFLATAtomicPats<string nortn_inst_name, SDPatternOperator node, - ValueType vt, ValueType data_vt = vt> { - def : FlatSignedAtomicPat 
-  def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
+multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator node,
+                                   ValueType vt, ValueType data_vt = vt> {
+  def : FlatSignedAtomicPatRtn <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
     let AddedComplexity = 10;
   }
 
@@ -1095,6 +1258,26 @@ multiclass GlobalFLATAtomicPats<string nortn_inst_name, SDPatternOperator node,
   }
 }
 
+multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
+                                ValueType data_vt = vt, bit isIntr = 0> {
+  defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
+  defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
+
+  let AddedComplexity = 10 in {
+    defm : FlatSignedAtomicPat <inst, node, vt, data_vt, isIntr>;
+  }
+
+  let AddedComplexity = 11 in {
+    def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), noRtnNode, vt, data_vt>;
+    def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>;
+  }
+}
+
+multiclass GlobalFLATAtomicIntrPats<string inst, string node, ValueType vt,
+                                    ValueType data_vt = vt> {
+  defm : GlobalFLATAtomicPats<inst, node, vt, data_vt, /* isIntr */ 1>;
+}
+
 multiclass GlobalFLATNoRtnAtomicPats<FLAT_Pseudo inst, SDPatternOperator node,
                                      ValueType vt> {
   def : FlatSignedAtomicPatNoRtn <inst, node, vt> {
@@ -1114,6 +1297,11 @@ multiclass ScratchFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTy
   def : ScratchLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
     let AddedComplexity = 26;
   }
+
+  def : ScratchLoadSVaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt> {
+    let SubtargetPredicate = HasFlatScratchSVSMode;
+    let AddedComplexity = 27;
+  }
 }
 
 multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
@@ -1125,6 +1313,11 @@ multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
   def : ScratchStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
     let AddedComplexity = 26;
   }
+
+  def : ScratchStoreSVaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt> {
+    let SubtargetPredicate = HasFlatScratchSVSMode;
+    let AddedComplexity = 27;
+  }
 }
 
 multiclass ScratchFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
@@ -1135,10 +1328,19 @@ multiclass ScratchFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, Val
   def : ScratchLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
     let AddedComplexity = 26;
   }
+
+  def : ScratchLoadSVaddrPat_D16 <!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt> {
+    let SubtargetPredicate = HasFlatScratchSVSMode;
+    let AddedComplexity = 27;
+  }
 }
 
 let OtherPredicates = [HasFlatGlobalInsts] in {
 
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_8_global, i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_8_global, i16>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_16_global, i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_16_global, i16>;
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i32>;
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, i32>;
@@ -1179,10 +1381,12 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, store_global, i16>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX3, store_global, v3i32>;
 
-let OtherPredicates = [D16PreservesUnusedBits] in {
+let OtherPredicates = [HasD16LoadStore] in {
 defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
+}
 
+let OtherPredicates = [D16PreservesUnusedBits] in {
 defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2i16>;
 defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2f16>;
 defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2i16>;
@@ -1198,59 +1402,84 @@ defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2i16>
 defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
 }
 
-defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORD, atomic_store_global_32, i32>;
-defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORDX2, atomic_store_global_64, i64>;
+defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>;
+defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>;
+defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
+defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>;
+defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
+defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;
 
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", atomic_load_add_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", atomic_load_sub_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC", atomic_inc_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC", atomic_dec_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND", atomic_load_and_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX", atomic_load_max_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX", atomic_load_umax_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN", atomic_load_min_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN", atomic_load_umin_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", atomic_load_or_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", atomic_swap_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", AMDGPUatomic_cmp_swap_global_32, i32, v2i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", atomic_load_xor_global_32, i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CSUB", int_amdgcn_global_atomic_csub, i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC", "atomic_inc_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC", "atomic_dec_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND", "atomic_load_and_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX", "atomic_load_max_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX", "atomic_load_umax_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN", "atomic_load_min_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN", "atomic_load_umin_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>;
GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>; +defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", int_amdgcn_global_atomic_csub, i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", atomic_load_add_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", atomic_load_sub_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", atomic_inc_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC_X2", atomic_dec_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND_X2", atomic_load_and_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX_X2", atomic_load_max_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX_X2", atomic_load_umax_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN_X2", atomic_load_min_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN_X2", atomic_load_umin_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR_X2", atomic_load_or_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", atomic_swap_global_64, i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", AMDGPUatomic_cmp_swap_global_64, i64, v2i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", atomic_load_xor_global_64, i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", "atomic_inc_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC_X2", "atomic_dec_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND_X2", "atomic_load_and_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX_X2", "atomic_load_max_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX_X2", "atomic_load_umax_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN_X2", "atomic_load_min_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN_X2", "atomic_load_umin_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR_X2", "atomic_load_or_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", "atomic_swap_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_global", i64, v2i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>; let OtherPredicates = [isGFX10Plus] in { -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", atomic_load_fmin_global_32, f32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", atomic_load_fmax_global_32, f32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", atomic_load_fmin_global_64, f64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", atomic_load_fmax_global_64, f64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", 
f64>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>; } -let OtherPredicates = [HasAtomicFaddInsts] in { +let OtherPredicates = [HasAtomicFaddNoRtnInsts] in defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_ADD_F32, atomic_load_fadd_global_noret_32, f32>; +let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in defm : GlobalFLATNoRtnAtomicPats <GLOBAL_ATOMIC_PK_ADD_F16, atomic_load_fadd_v2f16_global_noret_32, v2f16>; -} let OtherPredicates = [isGFX90APlus] in { -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", atomic_load_fadd_global_32, f32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", atomic_load_fadd_v2f16_global_32, v2f16>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", atomic_load_fadd_global_64, f64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", atomic_load_fmin_global_64, f64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", atomic_load_fmax_global_64, f64>; -def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F64_RTN, atomic_load_fadd_flat_64, f64>; -def : FlatSignedAtomicPat <FLAT_ATOMIC_MIN_F64_RTN, atomic_load_fmin_flat_64, f64>; -def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64_RTN, atomic_load_fmax_flat_64, f64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_global", v2f16>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", f64>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>; +} + +let OtherPredicates = [isGFX940Plus] in { +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_flat", v2f16>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; } } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 @@ -1291,10 +1520,12 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, truncstorei16_private, i32>; defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, 
store_private, i16>; defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX3, store_private, v3i32>; -let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in { +let OtherPredicates = [HasD16LoadStore, HasFlatScratchInsts, EnableFlatScratch] in { defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT_D16_HI, truncstorei16_hi16_private, i32>; defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE_D16_HI, truncstorei8_hi16_private, i32>; +} +let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in { defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2i16>; defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2f16>; defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_private, v2i16>; @@ -1405,6 +1636,57 @@ multiclass FLAT_Real_AllAddr_vi<bits<7> op, def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>; } +class FLAT_Real_gfx940 <bits<7> op, FLAT_Pseudo ps> : + FLAT_Real <op, ps>, + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX940> { + let AssemblerPredicate = isGFX940Plus; + let DecoderNamespace = "GFX9"; + let Inst{13} = ps.sve; + let Inst{25} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccbValue); +} + +multiclass FLAT_Real_AllAddr_SVE_vi<bits<7> op> { + def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME)> { + let AssemblerPredicate = isGFX8GFX9NotGFX940; + let OtherPredicates = [isGFX8GFX9NotGFX940]; + } + def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")> { + let DecoderNamespace = "GFX9"; + } + let AssemblerPredicate = isGFX940Plus, SubtargetPredicate = isGFX940Plus in { + def _VE_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>; + def _SVS_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SVS")>; + def _ST_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_ST")>; + } +} + +multiclass FLAT_Real_AllAddr_LDS<bits<7> op, bits<7> pre_gfx940_op, + string pre_gfx940_name = !subst("_lds", "", !cast<FLAT_Pseudo>(NAME).PseudoInstr), + bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> { + + let OtherPredicates = [isGFX8GFX9NotGFX940] in { + def _vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb> { + let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds"; + } + def _SADDR_vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb> { + let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds"; + } + } + + let SubtargetPredicate = isGFX940Plus in { + def _gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>; + def _SADDR_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>; + } +} + +multiclass FLAT_Real_AllAddr_SVE_LDS<bits<7> op, bits<7> pre_gfx940_op> { + defm "" : FLAT_Real_AllAddr_LDS<op, pre_gfx940_op>; + let SubtargetPredicate = isGFX940Plus in { + def _SVS_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SVS")>; + def _ST_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_ST")>; + } +} + def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>; def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>; def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>; @@ -1496,6 +1778,11 @@ defm GLOBAL_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>; defm GLOBAL_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>; defm GLOBAL_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>; +defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_LDS <0x026, 0x10>; +defm GLOBAL_LOAD_LDS_SBYTE : 
FLAT_Real_AllAddr_LDS <0x027, 0x11>; +defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS <0x028, 0x12>; +defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>; +defm GLOBAL_LOAD_LDS_DWORD : FLAT_Real_AllAddr_LDS <0x02a, 0x14>; defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Real_Atomics_vi <0x40>; defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Real_Atomics_vi <0x41>; @@ -1524,32 +1811,39 @@ defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Real_Atomics_vi <0x6a>; defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Real_Atomics_vi <0x6b>; defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Real_Atomics_vi <0x6c>; -defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_vi <0x10>; -defm SCRATCH_LOAD_SBYTE : FLAT_Real_AllAddr_vi <0x11>; -defm SCRATCH_LOAD_USHORT : FLAT_Real_AllAddr_vi <0x12>; -defm SCRATCH_LOAD_SSHORT : FLAT_Real_AllAddr_vi <0x13>; -defm SCRATCH_LOAD_DWORD : FLAT_Real_AllAddr_vi <0x14>; -defm SCRATCH_LOAD_DWORDX2 : FLAT_Real_AllAddr_vi <0x15>; -defm SCRATCH_LOAD_DWORDX3 : FLAT_Real_AllAddr_vi <0x16>; -defm SCRATCH_LOAD_DWORDX4 : FLAT_Real_AllAddr_vi <0x17>; -defm SCRATCH_STORE_BYTE : FLAT_Real_AllAddr_vi <0x18>; -defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_vi <0x19>; -defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_vi <0x20>; -defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_vi <0x21>; -defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_vi <0x22>; -defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_vi <0x23>; -defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_AllAddr_vi <0x24>; -defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_vi <0x25>; -defm SCRATCH_STORE_SHORT : FLAT_Real_AllAddr_vi <0x1a>; -defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_vi <0x1b>; -defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_vi <0x1c>; -defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>; -defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>; -defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>; +defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_SVE_LDS <0x026, 0x10>; +defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Real_AllAddr_SVE_LDS <0x027, 0x11>; +defm SCRATCH_LOAD_LDS_USHORT : FLAT_Real_AllAddr_SVE_LDS <0x028, 0x12>; +defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_SVE_LDS <0x029, 0x13>; +defm SCRATCH_LOAD_LDS_DWORD : FLAT_Real_AllAddr_SVE_LDS <0x02a, 0x14>; + +defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_SVE_vi <0x10>; +defm SCRATCH_LOAD_SBYTE : FLAT_Real_AllAddr_SVE_vi <0x11>; +defm SCRATCH_LOAD_USHORT : FLAT_Real_AllAddr_SVE_vi <0x12>; +defm SCRATCH_LOAD_SSHORT : FLAT_Real_AllAddr_SVE_vi <0x13>; +defm SCRATCH_LOAD_DWORD : FLAT_Real_AllAddr_SVE_vi <0x14>; +defm SCRATCH_LOAD_DWORDX2 : FLAT_Real_AllAddr_SVE_vi <0x15>; +defm SCRATCH_LOAD_DWORDX3 : FLAT_Real_AllAddr_SVE_vi <0x16>; +defm SCRATCH_LOAD_DWORDX4 : FLAT_Real_AllAddr_SVE_vi <0x17>; +defm SCRATCH_STORE_BYTE : FLAT_Real_AllAddr_SVE_vi <0x18>; +defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x19>; +defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_SVE_vi <0x20>; +defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x21>; +defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_SVE_vi <0x22>; +defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x23>; +defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_AllAddr_SVE_vi <0x24>; +defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x25>; +defm SCRATCH_STORE_SHORT : FLAT_Real_AllAddr_SVE_vi <0x1a>; +defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_SVE_vi <0x1b>; +defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_SVE_vi <0x1c>; +defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_SVE_vi <0x1d>; +defm SCRATCH_STORE_DWORDX3 : 
FLAT_Real_AllAddr_SVE_vi <0x1e>; +defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_SVE_vi <0x1f>; -let SubtargetPredicate = HasAtomicFaddInsts in { -defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>; -defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>; +let SubtargetPredicate = isGFX8GFX9NotGFX940 in { + // These instructions are encoded differently on gfx90* and gfx940. + defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>; + defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>; } let SubtargetPredicate = isGFX90AOnly in { @@ -1561,13 +1855,46 @@ let SubtargetPredicate = isGFX90AOnly in { defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_vi<0x51, 0>; } // End SubtargetPredicate = isGFX90AOnly +multiclass FLAT_Real_AllAddr_gfx940<bits<7> op> { + def _gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>; + def _SADDR_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>; +} + +multiclass FLAT_Real_Atomics_gfx940 <bits<7> op, FLAT_Pseudo ps> { + def _gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>; + def _RTN_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>; +} + +multiclass FLAT_Global_Real_Atomics_gfx940<bits<7> op> : + FLAT_Real_AllAddr_gfx940<op> { + def _RTN_gfx940 : FLAT_Real_gfx940 <op, !cast<FLAT_Pseudo>(NAME#"_RTN")>; + def _SADDR_RTN_gfx940 : FLAT_Real_gfx940 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>; +} + +let SubtargetPredicate = isGFX940Plus in { + // These instructions are encoded differently on gfx90* and gfx940. + defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_gfx940 <0x04d>; + defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_gfx940 <0x04e>; + + defm FLAT_ATOMIC_ADD_F64 : FLAT_Real_Atomics_gfx940<0x4f, FLAT_ATOMIC_ADD_F64>; + defm FLAT_ATOMIC_MIN_F64 : FLAT_Real_Atomics_gfx940<0x50, FLAT_ATOMIC_MIN_F64>; + defm FLAT_ATOMIC_MAX_F64 : FLAT_Real_Atomics_gfx940<0x51, FLAT_ATOMIC_MAX_F64>; + defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_gfx940<0x4f>; + defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_gfx940<0x50>; + defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_gfx940<0x51>; + defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_vi<0x4d, FLAT_ATOMIC_ADD_F32>; + defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Real_Atomics_vi<0x4e, FLAT_ATOMIC_PK_ADD_F16>; + defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Real_Atomics_vi<0x52, FLAT_ATOMIC_PK_ADD_BF16>; + defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>; +} // End SubtargetPredicate = isGFX940Plus + //===----------------------------------------------------------------------===// // GFX10. 
//===----------------------------------------------------------------------===// class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> : FLAT_Real<op, ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10> { - let AssemblerPredicate = isGFX10Plus; + let AssemblerPredicate = isGFX10Only; let DecoderNamespace = "GFX10"; let Inst{11-0} = offset{11-0}; @@ -1627,6 +1954,23 @@ multiclass FLAT_Real_ScratchAllAddr_gfx10<bits<7> op> : FLAT_Real_SADDR_gfx10<op>, FLAT_Real_ST_gfx10<op>; +multiclass FLAT_Real_AllAddr_LDS_gfx10<bits<7> op, + string opname = !subst("_lds", "", !cast<FLAT_Pseudo>(NAME).PseudoInstr)> { + let AsmString = opname # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds" in + defm "" : FLAT_Real_Base_gfx10<op>; + + let AsmString = opname # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds" in + defm "" : FLAT_Real_SADDR_gfx10<op>; +} + +multiclass FLAT_Real_ScratchAllAddr_LDS_gfx10<bits<7> op, + string opname = !subst("_lds", "", !cast<FLAT_Pseudo>(NAME).PseudoInstr)> { + defm "" : FLAT_Real_AllAddr_LDS_gfx10<op>; + + let AsmString = opname # !cast<FLAT_Pseudo>(NAME#"_ST").AsmOperands # " lds" in + defm "" : FLAT_Real_ST_gfx10<op>; +} + // ENC_FLAT. defm FLAT_LOAD_UBYTE : FLAT_Real_Base_gfx10<0x008>; defm FLAT_LOAD_SBYTE : FLAT_Real_Base_gfx10<0x009>; @@ -1743,6 +2087,12 @@ defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>; defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x016>; defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_AllAddr_gfx10<0x017>; +defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Real_AllAddr_LDS_gfx10 <0x008>; +defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Real_AllAddr_LDS_gfx10 <0x009>; +defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS_gfx10 <0x00a>; +defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS_gfx10 <0x00b>; +defm GLOBAL_LOAD_LDS_DWORD : FLAT_Real_AllAddr_LDS_gfx10 <0x00c>; + // ENC_FLAT_SCRATCH. 
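+// Note: the SCRATCH_LOAD_LDS_* variants added at the end of this section reuse the same opcode numbers (0x008-0x00c) as the plain loads and print with an " lds" suffix appended to the mnemonic.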
defm SCRATCH_LOAD_UBYTE : FLAT_Real_ScratchAllAddr_gfx10<0x008>; defm SCRATCH_LOAD_SBYTE : FLAT_Real_ScratchAllAddr_gfx10<0x009>; @@ -1766,3 +2116,219 @@ defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x022>; defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x023>; defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_ScratchAllAddr_gfx10<0x024>; defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_ScratchAllAddr_gfx10<0x025>; + +defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x008>; +defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x009>; +defm SCRATCH_LOAD_LDS_USHORT : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00a>; +defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00b>; +defm SCRATCH_LOAD_LDS_DWORD : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00c>; + +//===----------------------------------------------------------------------===// +// GFX11 +//===----------------------------------------------------------------------===// + +class FLAT_Real_gfx11 <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : + FLAT_Real <op, ps, opName>, + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX11> { + let AssemblerPredicate = isGFX11Plus; + let DecoderNamespace = "GFX11"; + + let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlcValue); + let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glcValue); + let Inst{15} = cpol{CPolBit.SLC}; + let Inst{17-16} = seg; + let Inst{55} = ps.sve; +} + +multiclass FLAT_Real_Base_gfx11<bits<7> op, string ps, string opName, int renamed = false> { + def _gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps), opName> { + let Inst{54-48} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + } + if renamed then + def _renamed_gfx11 : MnemonicAlias<!cast<FLAT_Pseudo>(ps).Mnemonic, opName>, Requires<[isGFX11Plus]>; +} + +multiclass FLAT_Real_RTN_gfx11<bits<7> op, string ps, string opName> { + def _RTN_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_RTN"), opName> { + let Inst{54-48} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + } +} + +multiclass FLAT_Real_SADDR_gfx11<bits<7> op, string ps, string opName> { + def _SADDR_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_SADDR"), opName>; +} + +multiclass FLAT_Real_SADDR_RTN_gfx11<bits<7> op, string ps, string opName> { + def _SADDR_RTN_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_SADDR_RTN"), opName>; +} + +multiclass FLAT_Real_ST_gfx11<bits<7> op, string ps, string opName> { + def _ST_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_ST"), opName> { + let Inst{54-48} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); + let OtherPredicates = [HasFlatScratchSTMode]; + } +} + +multiclass FLAT_Real_SVS_gfx11<bits<7> op, string ps, string opName> { + def _SVS_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_SVS"), opName> { + let OtherPredicates = [HasFlatScratchSVSMode]; + } +} + +multiclass FLAT_Real_AllAddr_gfx11<bits<7> op, string ps, string opName, int renamed = false> : + FLAT_Real_Base_gfx11<op, ps, opName, renamed>, + FLAT_Real_SADDR_gfx11<op, ps, opName>; + +multiclass FLAT_Real_Atomics_gfx11<bits<7> op, string ps, string opName, int renamed = false> : + FLAT_Real_Base_gfx11<op, ps, opName, renamed>, + FLAT_Real_RTN_gfx11<op, ps, opName>; + +multiclass FLAT_Real_GlblAtomics_gfx11<bits<7> op, string ps, string opName, int renamed = false> : + FLAT_Real_AllAddr_gfx11<op, ps, opName, renamed>, + FLAT_Real_RTN_gfx11<op, ps, opName>, + FLAT_Real_SADDR_RTN_gfx11<op, ps, opName>; + +multiclass FLAT_Real_GlblAtomics_RTN_gfx11<bits<7> op, string ps, string 
opName> : + FLAT_Real_RTN_gfx11<op, ps, opName>, + FLAT_Real_SADDR_RTN_gfx11<op, ps, opName>; + +multiclass FLAT_Real_ScratchAllAddr_gfx11<bits<7> op, string ps, string opName, int renamed = false> : + FLAT_Real_Base_gfx11<op, ps, opName, renamed>, + FLAT_Real_SADDR_gfx11<op, ps, opName>, + FLAT_Real_ST_gfx11<op, ps, opName>, + FLAT_Real_SVS_gfx11<op, ps, opName>; + +// ENC_FLAT. +defm FLAT_LOAD_U8 : FLAT_Real_Base_gfx11<0x010, "FLAT_LOAD_UBYTE", "flat_load_u8", true>; +defm FLAT_LOAD_I8 : FLAT_Real_Base_gfx11<0x011, "FLAT_LOAD_SBYTE", "flat_load_i8", true>; +defm FLAT_LOAD_U16 : FLAT_Real_Base_gfx11<0x012, "FLAT_LOAD_USHORT", "flat_load_u16", true>; +defm FLAT_LOAD_I16 : FLAT_Real_Base_gfx11<0x013, "FLAT_LOAD_SSHORT", "flat_load_i16", true>; +defm FLAT_LOAD_B32 : FLAT_Real_Base_gfx11<0x014, "FLAT_LOAD_DWORD", "flat_load_b32", true>; +defm FLAT_LOAD_B64 : FLAT_Real_Base_gfx11<0x015, "FLAT_LOAD_DWORDX2", "flat_load_b64", true>; +defm FLAT_LOAD_B96 : FLAT_Real_Base_gfx11<0x016, "FLAT_LOAD_DWORDX3", "flat_load_b96", true>; +defm FLAT_LOAD_B128 : FLAT_Real_Base_gfx11<0x017, "FLAT_LOAD_DWORDX4", "flat_load_b128", true>; +defm FLAT_STORE_B8 : FLAT_Real_Base_gfx11<0x018, "FLAT_STORE_BYTE", "flat_store_b8", true>; +defm FLAT_STORE_B16 : FLAT_Real_Base_gfx11<0x019, "FLAT_STORE_SHORT", "flat_store_b16", true>; +defm FLAT_STORE_B32 : FLAT_Real_Base_gfx11<0x01a, "FLAT_STORE_DWORD", "flat_store_b32", true>; +defm FLAT_STORE_B64 : FLAT_Real_Base_gfx11<0x01b, "FLAT_STORE_DWORDX2", "flat_store_b64", true>; +defm FLAT_STORE_B96 : FLAT_Real_Base_gfx11<0x01c, "FLAT_STORE_DWORDX3", "flat_store_b96", true>; +defm FLAT_STORE_B128 : FLAT_Real_Base_gfx11<0x01d, "FLAT_STORE_DWORDX4", "flat_store_b128", true>; +defm FLAT_LOAD_D16_U8 : FLAT_Real_Base_gfx11<0x01e, "FLAT_LOAD_UBYTE_D16", "flat_load_d16_u8">; +defm FLAT_LOAD_D16_I8 : FLAT_Real_Base_gfx11<0x01f, "FLAT_LOAD_SBYTE_D16", "flat_load_d16_i8">; +defm FLAT_LOAD_D16_B16 : FLAT_Real_Base_gfx11<0x020, "FLAT_LOAD_SHORT_D16", "flat_load_d16_b16">; +defm FLAT_LOAD_D16_HI_U8 : FLAT_Real_Base_gfx11<0x021, "FLAT_LOAD_UBYTE_D16_HI", "flat_load_d16_hi_u8">; +defm FLAT_LOAD_D16_HI_I8 : FLAT_Real_Base_gfx11<0x022, "FLAT_LOAD_SBYTE_D16_HI", "flat_load_d16_hi_i8">; +defm FLAT_LOAD_D16_HI_B16 : FLAT_Real_Base_gfx11<0x023, "FLAT_LOAD_SHORT_D16_HI", "flat_load_d16_hi_b16">; +defm FLAT_STORE_D16_HI_B8 : FLAT_Real_Base_gfx11<0x024, "FLAT_STORE_BYTE_D16_HI", "flat_store_d16_hi_b8">; +defm FLAT_STORE_D16_HI_B16 : FLAT_Real_Base_gfx11<0x025, "FLAT_STORE_SHORT_D16_HI", "flat_store_d16_hi_b16">; +defm FLAT_ATOMIC_SWAP_B32 : FLAT_Real_Atomics_gfx11<0x033, "FLAT_ATOMIC_SWAP", "flat_atomic_swap_b32", true>; +defm FLAT_ATOMIC_CMPSWAP_B32 : FLAT_Real_Atomics_gfx11<0x034, "FLAT_ATOMIC_CMPSWAP", "flat_atomic_cmpswap_b32", true>; +defm FLAT_ATOMIC_ADD_U32 : FLAT_Real_Atomics_gfx11<0x035, "FLAT_ATOMIC_ADD", "flat_atomic_add_u32", true>; +defm FLAT_ATOMIC_SUB_U32 : FLAT_Real_Atomics_gfx11<0x036, "FLAT_ATOMIC_SUB", "flat_atomic_sub_u32", true>; +defm FLAT_ATOMIC_MIN_I32 : FLAT_Real_Atomics_gfx11<0x038, "FLAT_ATOMIC_SMIN", "flat_atomic_min_i32", true>; +defm FLAT_ATOMIC_MIN_U32 : FLAT_Real_Atomics_gfx11<0x039, "FLAT_ATOMIC_UMIN", "flat_atomic_min_u32", true>; +defm FLAT_ATOMIC_MAX_I32 : FLAT_Real_Atomics_gfx11<0x03a, "FLAT_ATOMIC_SMAX", "flat_atomic_max_i32", true>; +defm FLAT_ATOMIC_MAX_U32 : FLAT_Real_Atomics_gfx11<0x03b, "FLAT_ATOMIC_UMAX", "flat_atomic_max_u32", true>; +defm FLAT_ATOMIC_AND_B32 : FLAT_Real_Atomics_gfx11<0x03c, "FLAT_ATOMIC_AND", "flat_atomic_and_b32", true>; +defm 
FLAT_ATOMIC_OR_B32 : FLAT_Real_Atomics_gfx11<0x03d, "FLAT_ATOMIC_OR", "flat_atomic_or_b32", true>; +defm FLAT_ATOMIC_XOR_B32 : FLAT_Real_Atomics_gfx11<0x03e, "FLAT_ATOMIC_XOR", "flat_atomic_xor_b32", true>; +defm FLAT_ATOMIC_INC_U32 : FLAT_Real_Atomics_gfx11<0x03f, "FLAT_ATOMIC_INC", "flat_atomic_inc_u32", true>; +defm FLAT_ATOMIC_DEC_U32 : FLAT_Real_Atomics_gfx11<0x040, "FLAT_ATOMIC_DEC", "flat_atomic_dec_u32", true>; +defm FLAT_ATOMIC_SWAP_B64 : FLAT_Real_Atomics_gfx11<0x041, "FLAT_ATOMIC_SWAP_X2", "flat_atomic_swap_b64", true>; +defm FLAT_ATOMIC_CMPSWAP_B64 : FLAT_Real_Atomics_gfx11<0x042, "FLAT_ATOMIC_CMPSWAP_X2", "flat_atomic_cmpswap_b64", true>; +defm FLAT_ATOMIC_ADD_U64 : FLAT_Real_Atomics_gfx11<0x043, "FLAT_ATOMIC_ADD_X2", "flat_atomic_add_u64", true>; +defm FLAT_ATOMIC_SUB_U64 : FLAT_Real_Atomics_gfx11<0x044, "FLAT_ATOMIC_SUB_X2", "flat_atomic_sub_u64", true>; +defm FLAT_ATOMIC_MIN_I64 : FLAT_Real_Atomics_gfx11<0x045, "FLAT_ATOMIC_SMIN_X2", "flat_atomic_min_i64", true>; +defm FLAT_ATOMIC_MIN_U64 : FLAT_Real_Atomics_gfx11<0x046, "FLAT_ATOMIC_UMIN_X2", "flat_atomic_min_u64", true>; +defm FLAT_ATOMIC_MAX_I64 : FLAT_Real_Atomics_gfx11<0x047, "FLAT_ATOMIC_SMAX_X2", "flat_atomic_max_i64", true>; +defm FLAT_ATOMIC_MAX_U64 : FLAT_Real_Atomics_gfx11<0x048, "FLAT_ATOMIC_UMAX_X2", "flat_atomic_max_u64", true>; +defm FLAT_ATOMIC_AND_B64 : FLAT_Real_Atomics_gfx11<0x049, "FLAT_ATOMIC_AND_X2", "flat_atomic_and_b64", true>; +defm FLAT_ATOMIC_OR_B64 : FLAT_Real_Atomics_gfx11<0x04a, "FLAT_ATOMIC_OR_X2", "flat_atomic_or_b64", true>; +defm FLAT_ATOMIC_XOR_B64 : FLAT_Real_Atomics_gfx11<0x04b, "FLAT_ATOMIC_XOR_X2", "flat_atomic_xor_b64", true>; +defm FLAT_ATOMIC_INC_U64 : FLAT_Real_Atomics_gfx11<0x04c, "FLAT_ATOMIC_INC_X2", "flat_atomic_inc_u64", true>; +defm FLAT_ATOMIC_DEC_U64 : FLAT_Real_Atomics_gfx11<0x04d, "FLAT_ATOMIC_DEC_X2", "flat_atomic_dec_u64", true>; +defm FLAT_ATOMIC_CMPSWAP_F32 : FLAT_Real_Atomics_gfx11<0x050, "FLAT_ATOMIC_FCMPSWAP", "flat_atomic_cmpswap_f32">; +defm FLAT_ATOMIC_MIN_F32 : FLAT_Real_Atomics_gfx11<0x051, "FLAT_ATOMIC_FMIN", "flat_atomic_min_f32">; +defm FLAT_ATOMIC_MAX_F32 : FLAT_Real_Atomics_gfx11<0x052, "FLAT_ATOMIC_FMAX", "flat_atomic_max_f32">; +defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_gfx11<0x056, "FLAT_ATOMIC_ADD_F32", "flat_atomic_add_f32">; + +// ENC_FLAT_GLBL. 
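+// As with the flat encodings above, GFX11 renames most mnemonics (e.g. global_load_dword becomes global_load_b32); entries passing true for 'renamed' also emit a MnemonicAlias so the legacy spelling keeps assembling on GFX11.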
+defm GLOBAL_LOAD_U8 : FLAT_Real_AllAddr_gfx11<0x010, "GLOBAL_LOAD_UBYTE", "global_load_u8", true>; +defm GLOBAL_LOAD_I8 : FLAT_Real_AllAddr_gfx11<0x011, "GLOBAL_LOAD_SBYTE", "global_load_i8", true>; +defm GLOBAL_LOAD_U16 : FLAT_Real_AllAddr_gfx11<0x012, "GLOBAL_LOAD_USHORT", "global_load_u16", true>; +defm GLOBAL_LOAD_I16 : FLAT_Real_AllAddr_gfx11<0x013, "GLOBAL_LOAD_SSHORT", "global_load_i16", true>; +defm GLOBAL_LOAD_B32 : FLAT_Real_AllAddr_gfx11<0x014, "GLOBAL_LOAD_DWORD", "global_load_b32", true>; +defm GLOBAL_LOAD_B64 : FLAT_Real_AllAddr_gfx11<0x015, "GLOBAL_LOAD_DWORDX2", "global_load_b64", true>; +defm GLOBAL_LOAD_B96 : FLAT_Real_AllAddr_gfx11<0x016, "GLOBAL_LOAD_DWORDX3", "global_load_b96", true>; +defm GLOBAL_LOAD_B128 : FLAT_Real_AllAddr_gfx11<0x017, "GLOBAL_LOAD_DWORDX4", "global_load_b128", true>; +defm GLOBAL_STORE_B8 : FLAT_Real_AllAddr_gfx11<0x018, "GLOBAL_STORE_BYTE", "global_store_b8", true>; +defm GLOBAL_STORE_B16 : FLAT_Real_AllAddr_gfx11<0x019, "GLOBAL_STORE_SHORT", "global_store_b16", true>; +defm GLOBAL_STORE_B32 : FLAT_Real_AllAddr_gfx11<0x01a, "GLOBAL_STORE_DWORD", "global_store_b32", true>; +defm GLOBAL_STORE_B64 : FLAT_Real_AllAddr_gfx11<0x01b, "GLOBAL_STORE_DWORDX2", "global_store_b64", true>; +defm GLOBAL_STORE_B96 : FLAT_Real_AllAddr_gfx11<0x01c, "GLOBAL_STORE_DWORDX3", "global_store_b96", true>; +defm GLOBAL_STORE_B128 : FLAT_Real_AllAddr_gfx11<0x01d, "GLOBAL_STORE_DWORDX4", "global_store_b128", true>; +defm GLOBAL_LOAD_D16_U8 : FLAT_Real_AllAddr_gfx11<0x01e, "GLOBAL_LOAD_UBYTE_D16", "global_load_d16_u8">; +defm GLOBAL_LOAD_D16_I8 : FLAT_Real_AllAddr_gfx11<0x01f, "GLOBAL_LOAD_SBYTE_D16", "global_load_d16_i8">; +defm GLOBAL_LOAD_D16_B16 : FLAT_Real_AllAddr_gfx11<0x020, "GLOBAL_LOAD_SHORT_D16", "global_load_d16_b16">; +defm GLOBAL_LOAD_D16_HI_U8 : FLAT_Real_AllAddr_gfx11<0x021, "GLOBAL_LOAD_UBYTE_D16_HI", "global_load_d16_hi_u8">; +defm GLOBAL_LOAD_D16_HI_I8 : FLAT_Real_AllAddr_gfx11<0x022, "GLOBAL_LOAD_SBYTE_D16_HI", "global_load_d16_hi_i8">; +defm GLOBAL_LOAD_D16_HI_B16 : FLAT_Real_AllAddr_gfx11<0x023, "GLOBAL_LOAD_SHORT_D16_HI", "global_load_d16_hi_b16">; +defm GLOBAL_STORE_D16_HI_B8 : FLAT_Real_AllAddr_gfx11<0x024, "GLOBAL_STORE_BYTE_D16_HI", "global_store_d16_hi_b8">; +defm GLOBAL_STORE_D16_HI_B16 : FLAT_Real_AllAddr_gfx11<0x025, "GLOBAL_STORE_SHORT_D16_HI", "global_store_d16_hi_b16">; +defm GLOBAL_LOAD_ADDTID_B32 : FLAT_Real_AllAddr_gfx11<0x028, "GLOBAL_LOAD_DWORD_ADDTID", "global_load_addtid_b32">; +defm GLOBAL_STORE_ADDTID_B32 : FLAT_Real_AllAddr_gfx11<0x029, "GLOBAL_STORE_DWORD_ADDTID", "global_store_addtid_b32">; +defm GLOBAL_ATOMIC_SWAP_B32 : FLAT_Real_GlblAtomics_gfx11<0x033, "GLOBAL_ATOMIC_SWAP", "global_atomic_swap_b32", true>; +defm GLOBAL_ATOMIC_CMPSWAP_B32 : FLAT_Real_GlblAtomics_gfx11<0x034, "GLOBAL_ATOMIC_CMPSWAP", "global_atomic_cmpswap_b32", true>; +defm GLOBAL_ATOMIC_ADD_U32 : FLAT_Real_GlblAtomics_gfx11<0x035, "GLOBAL_ATOMIC_ADD", "global_atomic_add_u32", true>; +defm GLOBAL_ATOMIC_SUB_U32 : FLAT_Real_GlblAtomics_gfx11<0x036, "GLOBAL_ATOMIC_SUB", "global_atomic_sub_u32", true>; +defm GLOBAL_ATOMIC_CSUB_U32 : FLAT_Real_GlblAtomics_RTN_gfx11<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_csub_u32">; +defm GLOBAL_ATOMIC_MIN_I32 : FLAT_Real_GlblAtomics_gfx11<0x038, "GLOBAL_ATOMIC_SMIN", "global_atomic_min_i32", true>; +defm GLOBAL_ATOMIC_MIN_U32 : FLAT_Real_GlblAtomics_gfx11<0x039, "GLOBAL_ATOMIC_UMIN", "global_atomic_min_u32", true>; +defm GLOBAL_ATOMIC_MAX_I32 : FLAT_Real_GlblAtomics_gfx11<0x03a, "GLOBAL_ATOMIC_SMAX", "global_atomic_max_i32", 
true>; +defm GLOBAL_ATOMIC_MAX_U32 : FLAT_Real_GlblAtomics_gfx11<0x03b, "GLOBAL_ATOMIC_UMAX", "global_atomic_max_u32", true>; +defm GLOBAL_ATOMIC_AND_B32 : FLAT_Real_GlblAtomics_gfx11<0x03c, "GLOBAL_ATOMIC_AND", "global_atomic_and_b32", true>; +defm GLOBAL_ATOMIC_OR_B32 : FLAT_Real_GlblAtomics_gfx11<0x03d, "GLOBAL_ATOMIC_OR", "global_atomic_or_b32", true>; +defm GLOBAL_ATOMIC_XOR_B32 : FLAT_Real_GlblAtomics_gfx11<0x03e, "GLOBAL_ATOMIC_XOR", "global_atomic_xor_b32", true>; +defm GLOBAL_ATOMIC_INC_U32 : FLAT_Real_GlblAtomics_gfx11<0x03f, "GLOBAL_ATOMIC_INC", "global_atomic_inc_u32", true>; +defm GLOBAL_ATOMIC_DEC_U32 : FLAT_Real_GlblAtomics_gfx11<0x040, "GLOBAL_ATOMIC_DEC", "global_atomic_dec_u32", true>; +defm GLOBAL_ATOMIC_SWAP_B64 : FLAT_Real_GlblAtomics_gfx11<0x041, "GLOBAL_ATOMIC_SWAP_X2", "global_atomic_swap_b64", true>; +defm GLOBAL_ATOMIC_CMPSWAP_B64 : FLAT_Real_GlblAtomics_gfx11<0x042, "GLOBAL_ATOMIC_CMPSWAP_X2", "global_atomic_cmpswap_b64", true>; +defm GLOBAL_ATOMIC_ADD_U64 : FLAT_Real_GlblAtomics_gfx11<0x043, "GLOBAL_ATOMIC_ADD_X2", "global_atomic_add_u64", true>; +defm GLOBAL_ATOMIC_SUB_U64 : FLAT_Real_GlblAtomics_gfx11<0x044, "GLOBAL_ATOMIC_SUB_X2", "global_atomic_sub_u64", true>; +defm GLOBAL_ATOMIC_MIN_I64 : FLAT_Real_GlblAtomics_gfx11<0x045, "GLOBAL_ATOMIC_SMIN_X2", "global_atomic_min_i64", true>; +defm GLOBAL_ATOMIC_MIN_U64 : FLAT_Real_GlblAtomics_gfx11<0x046, "GLOBAL_ATOMIC_UMIN_X2", "global_atomic_min_u64", true>; +defm GLOBAL_ATOMIC_MAX_I64 : FLAT_Real_GlblAtomics_gfx11<0x047, "GLOBAL_ATOMIC_SMAX_X2", "global_atomic_max_i64", true>; +defm GLOBAL_ATOMIC_MAX_U64 : FLAT_Real_GlblAtomics_gfx11<0x048, "GLOBAL_ATOMIC_UMAX_X2", "global_atomic_max_u64", true>; +defm GLOBAL_ATOMIC_AND_B64 : FLAT_Real_GlblAtomics_gfx11<0x049, "GLOBAL_ATOMIC_AND_X2", "global_atomic_and_b64", true>; +defm GLOBAL_ATOMIC_OR_B64 : FLAT_Real_GlblAtomics_gfx11<0x04a, "GLOBAL_ATOMIC_OR_X2", "global_atomic_or_b64", true>; +defm GLOBAL_ATOMIC_XOR_B64 : FLAT_Real_GlblAtomics_gfx11<0x04b, "GLOBAL_ATOMIC_XOR_X2", "global_atomic_xor_b64", true>; +defm GLOBAL_ATOMIC_INC_U64 : FLAT_Real_GlblAtomics_gfx11<0x04c, "GLOBAL_ATOMIC_INC_X2", "global_atomic_inc_u64", true>; +defm GLOBAL_ATOMIC_DEC_U64 : FLAT_Real_GlblAtomics_gfx11<0x04d, "GLOBAL_ATOMIC_DEC_X2", "global_atomic_dec_u64", true>; +defm GLOBAL_ATOMIC_CMPSWAP_F32 : FLAT_Real_GlblAtomics_gfx11<0x050, "GLOBAL_ATOMIC_FCMPSWAP", "global_atomic_cmpswap_f32">; +defm GLOBAL_ATOMIC_MIN_F32 : FLAT_Real_GlblAtomics_gfx11<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_f32">; +defm GLOBAL_ATOMIC_MAX_F32 : FLAT_Real_GlblAtomics_gfx11<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_f32">; +defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Real_GlblAtomics_gfx11<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">; + +// ENC_FLAT_SCRATCH. 
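+// Each scratch opcode below is instantiated in the base, SADDR, ST and SVS addressing forms; the ST and SVS encodings are additionally predicated on HasFlatScratchSTMode and HasFlatScratchSVSMode respectively.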
+defm SCRATCH_LOAD_U8 : FLAT_Real_ScratchAllAddr_gfx11<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>; +defm SCRATCH_LOAD_I8 : FLAT_Real_ScratchAllAddr_gfx11<0x11, "SCRATCH_LOAD_SBYTE", "scratch_load_i8", true>; +defm SCRATCH_LOAD_U16 : FLAT_Real_ScratchAllAddr_gfx11<0x12, "SCRATCH_LOAD_USHORT", "scratch_load_u16", true>; +defm SCRATCH_LOAD_I16 : FLAT_Real_ScratchAllAddr_gfx11<0x13, "SCRATCH_LOAD_SSHORT", "scratch_load_i16", true>; +defm SCRATCH_LOAD_B32 : FLAT_Real_ScratchAllAddr_gfx11<0x14, "SCRATCH_LOAD_DWORD", "scratch_load_b32", true>; +defm SCRATCH_LOAD_B64 : FLAT_Real_ScratchAllAddr_gfx11<0x15, "SCRATCH_LOAD_DWORDX2", "scratch_load_b64", true>; +defm SCRATCH_LOAD_B96 : FLAT_Real_ScratchAllAddr_gfx11<0x16, "SCRATCH_LOAD_DWORDX3", "scratch_load_b96", true>; +defm SCRATCH_LOAD_B128 : FLAT_Real_ScratchAllAddr_gfx11<0x17, "SCRATCH_LOAD_DWORDX4", "scratch_load_b128", true>; +defm SCRATCH_STORE_B8 : FLAT_Real_ScratchAllAddr_gfx11<0x18, "SCRATCH_STORE_BYTE", "scratch_store_b8", true>; +defm SCRATCH_STORE_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x19, "SCRATCH_STORE_SHORT", "scratch_store_b16", true>; +defm SCRATCH_STORE_B32 : FLAT_Real_ScratchAllAddr_gfx11<0x1a, "SCRATCH_STORE_DWORD", "scratch_store_b32", true>; +defm SCRATCH_STORE_B64 : FLAT_Real_ScratchAllAddr_gfx11<0x1b, "SCRATCH_STORE_DWORDX2", "scratch_store_b64", true>; +defm SCRATCH_STORE_B96 : FLAT_Real_ScratchAllAddr_gfx11<0x1c, "SCRATCH_STORE_DWORDX3", "scratch_store_b96", true>; +defm SCRATCH_STORE_B128 : FLAT_Real_ScratchAllAddr_gfx11<0x1d, "SCRATCH_STORE_DWORDX4", "scratch_store_b128", true>; +defm SCRATCH_LOAD_D16_U8 : FLAT_Real_ScratchAllAddr_gfx11<0x1e, "SCRATCH_LOAD_UBYTE_D16", "scratch_load_d16_u8">; +defm SCRATCH_LOAD_D16_I8 : FLAT_Real_ScratchAllAddr_gfx11<0x1f, "SCRATCH_LOAD_SBYTE_D16", "scratch_load_d16_i8">; +defm SCRATCH_LOAD_D16_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x20, "SCRATCH_LOAD_SHORT_D16", "scratch_load_d16_b16">; +defm SCRATCH_LOAD_D16_HI_U8 : FLAT_Real_ScratchAllAddr_gfx11<0x21, "SCRATCH_LOAD_UBYTE_D16_HI", "scratch_load_d16_hi_u8">; +defm SCRATCH_LOAD_D16_HI_I8 : FLAT_Real_ScratchAllAddr_gfx11<0x22, "SCRATCH_LOAD_SBYTE_D16_HI", "scratch_load_d16_hi_i8">; +defm SCRATCH_LOAD_D16_HI_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x23, "SCRATCH_LOAD_SHORT_D16_HI", "scratch_load_d16_hi_b16">; +defm SCRATCH_STORE_D16_HI_B8 : FLAT_Real_ScratchAllAddr_gfx11<0x24, "SCRATCH_STORE_BYTE_D16_HI", "scratch_store_d16_hi_b8">; +defm SCRATCH_STORE_D16_HI_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x25, "SCRATCH_STORE_SHORT_D16_HI", "scratch_store_d16_hi_b16">; diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index a8c85ec4e5ea..1cd880eaa48e 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -167,7 +167,9 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { return nullptr; case AMDGPU::COPY: case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B64_PSEUDO: { + case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::V_MOV_B64_e32: + case AMDGPU::V_MOV_B64_e64: { auto &Op1 = Def->getOperand(1); if (Op1.isImm()) return &Op1; @@ -183,6 +185,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, bool CombBCZ, bool IsShrinkable) const { assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp || + MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp || MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); auto OrigOp = OrigMI.getOpcode(); @@ -383,6 +386,7 @@ bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, 
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp || + MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp || MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); @@ -399,7 +403,8 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { return false; } - if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) { + if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || + MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) { auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl); assert(DppCtrl && DppCtrl->isImm()); if (!AMDGPU::isLegal64BitDPPControl(DppCtrl->getImm())) { @@ -447,12 +452,6 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { return false; } - if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) { - LLVM_DEBUG(dbgs() << - " failed: old reg def and mov should be in the same BB\n"); - return false; - } - if (OldOpndValue->getImm() == 0) { if (MaskAllLanes) { assert(!BoundCtrlZero); // by check [1] @@ -616,7 +615,8 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) { if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) { Changed = true; ++NumDPPMovsCombined; - } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) { + } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || + MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) { if (ST->has64BitDPP() && combineDPPMov(MI)) { Changed = true; ++NumDPPMovsCombined; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index c0592f6f3c7a..b6d16009e776 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -13,14 +13,38 @@ #include "GCNHazardRecognizer.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/Support/TargetParser.h" using namespace llvm; +namespace { + +struct MFMAPaddingRatioParser : public cl::parser<unsigned> { + MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {} + + bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) { + if (Arg.getAsInteger(0, Value)) + return O.error("'" + Arg + "' value invalid for uint argument!"); + + if (Value > 100) + return O.error("'" + Arg + "' value must be in the range [0, 100]!"); + + return false; + } +}; + +} // end anonymous namespace + +static cl::opt<unsigned, false, MFMAPaddingRatioParser> + MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, + cl::desc("Fill a percentage of the latency between " + "neighboring MFMA with s_nops.")); + //===----------------------------------------------------------------------===// -// Hazard Recoginizer Implementation +// Hazard Recognizer Implementation //===----------------------------------------------------------------------===// static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, @@ -92,12 +116,7 @@ static bool isSMovRel(unsigned Opcode) { } static bool isDGEMM(unsigned Opcode) { - return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || - Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 || - Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 || - Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64 || - Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64 || - Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64; + return 
AMDGPU::getMAIIsDGEMM(Opcode); } static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { @@ -109,7 +128,10 @@ static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) return false; - return true; + if (!ST.hasGFX940Insts()) + return true; + + return AMDGPU::getMAIIsGFX940XDL(Opcode); } static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, @@ -144,6 +166,11 @@ static bool isPermlane(const MachineInstr &MI) { Opcode == AMDGPU::V_PERMLANEX16_B32_e64; } +static bool isLdsDma(const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI) && + (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)); +} + static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16); @@ -204,12 +231,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) return HazardType; - if (ST.hasReadM0MovRelInterpHazard() && - (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) && - checkReadM0Hazards(MI) > 0) - return HazardType; - - if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) && + if (((ST.hasReadM0MovRelInterpHazard() && + (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))) || + (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) || + (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) || + (ST.hasReadM0LdsDirectHazard() && + MI->readsRegister(AMDGPU::LDS_DIRECT))) && checkReadM0Hazards(MI) > 0) return HazardType; @@ -237,6 +264,14 @@ static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, } } +unsigned +GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const { + const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI); + assert(TSchedModel.getWriteProcResBegin(SC) != + TSchedModel.getWriteProcResEnd(SC)); + return TSchedModel.getWriteProcResBegin(SC)->Cycles; +} + void GCNHazardRecognizer::processBundle() { MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator()); MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); @@ -321,11 +356,11 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { if (isRFE(MI->getOpcode())) return std::max(WaitStates, checkRFEHazards(MI)); - if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) || - isSMovRel(MI->getOpcode()))) - return std::max(WaitStates, checkReadM0Hazards(MI)); - - if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) + if ((ST.hasReadM0MovRelInterpHazard() && + (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))) || + (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) || + (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) || + (ST.hasReadM0LdsDirectHazard() && MI->readsRegister(AMDGPU::LDS_DIRECT))) return std::max(WaitStates, checkReadM0Hazards(MI)); if (SIInstrInfo::isMAI(*MI)) @@ -389,16 +424,61 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// +typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult; + typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn; +typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn; + +// Search for a hazard in a block and its predecessors. 
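+// The walk threads the caller's StateT through UpdateState while scanning instructions in reverse; IsHazard answers HazardFound to report a hazard, HazardExpired to prune the current path, or NoHazardFound to keep scanning. StateT is passed by value, so each predecessor path explored by the recursion sees an independent copy of the state, and Visited keeps loops from being re-entered.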
+template <typename StateT> +static bool +hasHazard(StateT State, + function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard, + function_ref<void(StateT &, const MachineInstr &)> UpdateState, + const MachineBasicBlock *MBB, + MachineBasicBlock::const_reverse_instr_iterator I, + DenseSet<const MachineBasicBlock *> &Visited) { + for (auto E = MBB->instr_rend(); I != E; ++I) { + // No need to look at parent BUNDLE instructions. + if (I->isBundle()) + continue; + + switch (IsHazard(State, *I)) { + case HazardFound: + return true; + case HazardExpired: + return false; + default: + // Continue search + break; + } + + if (I->isInlineAsm() || I->isMetaInstruction()) + continue; + + UpdateState(State, *I); + } + + for (MachineBasicBlock *Pred : MBB->predecessors()) { + if (!Visited.insert(Pred).second) + continue; + + if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(), + Visited)) + return true; + } + + return false; +} // Returns a minimum wait states since \p I walking all predecessors. // Only scans until \p IsExpired does not return true. // Can only be run in a hazard recognizer mode. -static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, - const MachineBasicBlock *MBB, - MachineBasicBlock::const_reverse_instr_iterator I, - int WaitStates, IsExpiredFn IsExpired, - DenseSet<const MachineBasicBlock *> &Visited) { +static int getWaitStatesSince( + GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, + MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, + IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited, + GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) { for (auto E = MBB->instr_rend(); I != E; ++I) { // Don't add WaitStates for parent BUNDLE instructions. if (I->isBundle()) @@ -410,7 +490,7 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, if (I->isInlineAsm()) continue; - WaitStates += SIInstrInfo::getNumWaitStates(*I); + WaitStates += GetNumWaitStates(*I); if (IsExpired(*I, WaitStates)) return std::numeric_limits<int>::max(); @@ -421,8 +501,8 @@ static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, if (!Visited.insert(Pred).second) continue; - int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), - WaitStates, IsExpired, Visited); + int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates, + IsExpired, Visited, GetNumWaitStates); MinWaitStates = std::min(MinWaitStates, W); } @@ -534,7 +614,7 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { // In order to handle these situations correctly we need to make sure that // when a clause has more than one instruction, no instruction in the clause // writes to a register that is read by another instruction in the clause - // (including itself). If we encounter this situaion, we need to break the + // (including itself). If we encounter this situation, we need to break the // clause by inserting a non SMEM instruction. for (MachineInstr *MI : EmittedInstrs) { @@ -764,7 +844,7 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, // 8 bytes can have there store data over written by the next instruction. const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const int VALUWaitStates = 1; + const int VALUWaitStates = ST.hasGFX940Insts() ? 
2 : 1; int WaitStatesNeeded = 0; if (!TRI->isVectorRegister(MRI, Def.getReg())) @@ -783,13 +863,136 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, } int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { + int WaitStatesNeeded = 0; + + if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) { + const int TransDefWaitstates = 1; + + auto IsTransDefFn = [this, VALU](const MachineInstr &MI) { + if (!SIInstrInfo::isTRANS(MI)) + return false; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg(); + + for (const MachineOperand &Use : VALU->explicit_uses()) { + if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) + return true; + } + + return false; + }; + + int WaitStatesNeededForDef = + TransDefWaitstates - + getWaitStatesSince(IsTransDefFn, TransDefWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + + if (ST.hasDstSelForwardingHazard()) { + const int Shift16DefWaitstates = 1; + + auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) { + if (!SIInstrInfo::isVALU(MI)) + return false; + const SIInstrInfo *TII = ST.getInstrInfo(); + if (SIInstrInfo::isSDWA(MI)) { + if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel)) + if (DstSel->getImm() == AMDGPU::SDWA::DWORD) + return false; + } else { + if ((AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::op_sel) == -1) || + !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers) + ->getImm() & + SISrcMods::DST_OP_SEL)) + return false; + } + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { + Register Def = Dst->getReg(); + + for (const MachineOperand &Use : VALU->explicit_uses()) { + if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg())) + return true; + } + } + + return false; + }; + + int WaitStatesNeededForDef = + Shift16DefWaitstates - + getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + + if (ST.hasVDecCoExecHazard()) { + const int VALUWriteSGPRVALUReadWaitstates = 2; + const int VALUWriteEXECRWLane = 4; + const int VALUWriteVGPRReadlaneRead = 1; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + Register UseReg; + auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) { + if (!SIInstrInfo::isVALU(MI)) + return false; + return MI.modifiesRegister(UseReg, TRI); + }; + + for (const MachineOperand &Use : VALU->explicit_uses()) { + if (!Use.isReg()) + continue; + + UseReg = Use.getReg(); + if (TRI->isSGPRReg(MRI, UseReg)) { + int WaitStatesNeededForDef = + VALUWriteSGPRVALUReadWaitstates - + getWaitStatesSince(IsVALUDefSGPRFn, + VALUWriteSGPRVALUReadWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + } + + if (VALU->readsRegister(AMDGPU::VCC, TRI)) { + UseReg = AMDGPU::VCC; + int WaitStatesNeededForDef = + VALUWriteSGPRVALUReadWaitstates - + getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + + switch (VALU->getOpcode()) { + case AMDGPU::V_READLANE_B32: + case AMDGPU::V_READFIRSTLANE_B32: { + MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0); + UseReg = Src->getReg(); + int WaitStatesNeededForDef = + VALUWriteVGPRReadlaneRead - + 
getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + } + LLVM_FALLTHROUGH; + case AMDGPU::V_WRITELANE_B32: { + UseReg = AMDGPU::EXEC; + int WaitStatesNeededForDef = + VALUWriteEXECRWLane - + getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + break; + } + default: + break; + } + } + // This checks for the hazard where VMEM instructions that store more than // 8 bytes can have there store data over written by the next instruction. if (!ST.has12DWordStoreHazard()) - return 0; + return WaitStatesNeeded; const MachineRegisterInfo &MRI = MF.getRegInfo(); - int WaitStatesNeeded = 0; for (const MachineOperand &Def : VALU->defs()) { WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI)); @@ -861,10 +1064,10 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { const SIInstrInfo *TII = ST.getInstrInfo(); - const int SMovRelWaitStates = 1; + const int ReadM0WaitStates = 1; auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); }; - return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, - SMovRelWaitStates); + return ReadM0WaitStates - + getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates); } void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { @@ -873,6 +1076,13 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixSMEMtoVectorWriteHazards(MI); fixVcmpxExecWARHazard(MI); fixLdsBranchVmemWARHazard(MI); + if (ST.hasLdsDirect()) { + fixLdsDirectVALUHazard(MI); + fixLdsDirectVMEMHazard(MI); + } + fixVALUPartialForwardingHazard(MI); + fixVALUTransUseHazard(MI); + fixWMMAHazards(MI); } bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { @@ -880,7 +1090,12 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { return false; const SIInstrInfo *TII = ST.getInstrInfo(); - auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); }; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + auto IsHazardFn = [TII, TRI](const MachineInstr &MI) { + return (TII->isVOPC(MI) || + ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) && + MI.modifiesRegister(AMDGPU::EXEC, TRI); + }; auto IsExpiredFn = [](const MachineInstr &MI, int) { unsigned Opc = MI.getOpcode(); @@ -893,7 +1108,7 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { return false; // V_NOP will be discarded by SQ. - // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* + // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* // which is always a VGPR and available. 
auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); Register Reg = Src0->getReg(); @@ -1157,6 +1372,369 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { return true; } +bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) { + if (!SIInstrInfo::isLDSDIR(*MI)) + return false; + + const int NoHazardWaitStates = 15; + const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); + const Register VDSTReg = VDST->getReg(); + + bool VisitedTrans = false; + auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) { + if (!SIInstrInfo::isVALU(I)) + return false; + VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I); + // Cover both WAR and WAW + return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); + }; + auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) { + if (WaitStates >= NoHazardWaitStates) + return true; + // Instructions which cause va_vdst==0 expire hazard + return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || + SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I); + }; + auto GetWaitStatesFn = [](const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI) ? 1 : 0; + }; + + DenseSet<const MachineBasicBlock *> Visited; + auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(), + std::next(MI->getReverseIterator()), 0, + IsExpiredFn, Visited, GetWaitStatesFn); + + // Transcendentals can execute in parallel to other VALUs. + // This makes va_vdst count unusable with a mixture of VALU and TRANS. + if (VisitedTrans) + Count = 0; + + MachineOperand *WaitVdstOp = + TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst); + WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates)); + + return true; +} + +bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { + if (!SIInstrInfo::isLDSDIR(*MI)) + return false; + + const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); + const Register VDSTReg = VDST->getReg(); + + auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) { + if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) && + !SIInstrInfo::isDS(I)) + return false; + return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI); + }; + auto IsExpiredFn = [](const MachineInstr &I, int) { + return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) || + (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) || + (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + I.getOperand(0).getImm() == 0xffe3); + }; + + if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == + std::numeric_limits<int>::max()) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0xffe3); + + return true; +} + +bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { + if (!ST.isWave64()) + return false; + if (!ST.hasVALUPartialForwardingHazard()) + return false; + if (!SIInstrInfo::isVALU(*MI)) + return false; + + SmallSetVector<Register, 4> SrcVGPRs; + + for (const MachineOperand &Use : MI->explicit_uses()) { + if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg())) + SrcVGPRs.insert(Use.getReg()); + } + + // Only applies with >= 2 unique VGPR sources + if (SrcVGPRs.size() <= 1) + return false; + + // Look for the following pattern: + // Va <- VALU [PreExecPos] + // intv1 + // Exec <- SALU [ExecPos] + // intv2 + // Vb <- VALU [PostExecPos] + // intv3 + // MI Va, Vb (WaitState = 0) + // + // Where: + // intv1 + intv2 <= 2 VALUs + // intv3 <= 4 VALUs + // + // If found, 
insert an appropriate S_WAITCNT_DEPCTR before MI. + + const int Intv1plus2MaxVALUs = 2; + const int Intv3MaxVALUs = 4; + const int IntvMaxVALUs = 6; + const int NoHazardVALUWaitStates = IntvMaxVALUs + 2; + + struct StateType { + SmallDenseMap<Register, int, 4> DefPos; + int ExecPos = std::numeric_limits<int>::max(); + int VALUs = 0; + }; + + StateType State; + + // This overloads expiry testing with all the hazard detection + auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { + // Too many VALU states have passed + if (State.VALUs > NoHazardVALUWaitStates) + return HazardExpired; + + // Instructions which cause va_vdst==0 expire hazard + if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || + SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || + (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + I.getOperand(0).getImm() == 0x0fff)) + return HazardExpired; + + // Track register writes + bool Changed = false; + if (SIInstrInfo::isVALU(I)) { + for (Register Src : SrcVGPRs) { + if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) { + State.DefPos[Src] = State.VALUs; + Changed = true; + } + } + } else if (SIInstrInfo::isSALU(I)) { + if (State.ExecPos == std::numeric_limits<int>::max()) { + if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) { + State.ExecPos = State.VALUs; + Changed = true; + } + } + } + + // Early expiration: too many VALUs in intv3 + if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty()) + return HazardExpired; + + // Only evaluate state if something changed + if (!Changed) + return NoHazardFound; + + // Determine positions of VALUs pre/post exec change + if (State.ExecPos == std::numeric_limits<int>::max()) + return NoHazardFound; + + int PreExecPos = std::numeric_limits<int>::max(); + int PostExecPos = std::numeric_limits<int>::max(); + + for (auto Entry : State.DefPos) { + int DefVALUs = Entry.second; + if (DefVALUs != std::numeric_limits<int>::max()) { + if (DefVALUs >= State.ExecPos) + PreExecPos = std::min(PreExecPos, DefVALUs); + else if (DefVALUs < State.ExecPos) + PostExecPos = std::min(PostExecPos, DefVALUs); + } + } + + // Need a VALU def post exec change + if (PostExecPos == std::numeric_limits<int>::max()) + return NoHazardFound; + + // Too many VALUs in intv3? + int Intv3VALUs = PostExecPos; + if (Intv3VALUs > Intv3MaxVALUs) + return HazardExpired; + + // Too many VALUs in intv2? + int Intv2VALUs = (State.ExecPos - PostExecPos) - 1; + if (Intv2VALUs > Intv1plus2MaxVALUs) + return HazardExpired; + + // Need a VALU def pre exec change + if (PreExecPos == std::numeric_limits<int>::max()) + return NoHazardFound; + + // Too many VALUs in intv1?
+ int Intv1VALUs = PreExecPos - State.ExecPos; + if (Intv1VALUs > Intv1plus2MaxVALUs) + return HazardExpired; + + // Too many VALUs in intv1 + intv2 + if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs) + return HazardExpired; + + return HazardFound; + }; + auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { + if (SIInstrInfo::isVALU(MI)) + State.VALUs += 1; + }; + + DenseSet<const MachineBasicBlock *> Visited; + if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(), + std::next(MI->getReverseIterator()), Visited)) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0x0fff); + + return true; +} + +bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { + if (!ST.hasVALUTransUseHazard()) + return false; + if (!SIInstrInfo::isVALU(*MI)) + return false; + + SmallSet<Register, 4> SrcVGPRs; + + for (const MachineOperand &Use : MI->explicit_uses()) { + if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg())) + SrcVGPRs.insert(Use.getReg()); + } + + // Look for the following pattern: + // Va <- TRANS VALU + // intv + // MI Va (WaitState = 0) + // + // Where: + // intv <= 5 VALUs / 1 TRANS + // + // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. + + const int IntvMaxVALUs = 5; + const int IntvMaxTRANS = 1; + + struct StateType { + int VALUs = 0; + int TRANS = 0; + }; + + StateType State; + + // This overloads expiry testing with all the hazard detection + auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { + // Too many VALU states have passed + if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS) + return HazardExpired; + + // Instructions which cause va_vdst==0 expire hazard + if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || + SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || + (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + I.getOperand(0).getImm() == 0x0fff)) + return HazardExpired; + + // Track register writes + if (SIInstrInfo::isTRANS(I)) { + for (Register Src : SrcVGPRs) { + if (I.modifiesRegister(Src, &TRI)) { + return HazardFound; + } + } + } + + return NoHazardFound; + }; + auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { + if (SIInstrInfo::isVALU(MI)) + State.VALUs += 1; + if (SIInstrInfo::isTRANS(MI)) + State.TRANS += 1; + }; + + DenseSet<const MachineBasicBlock *> Visited; + if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(), + std::next(MI->getReverseIterator()), Visited)) + return false; + + // Hazard is observed - insert a wait on va_vdst counter to ensure hazard is + // avoided (mask 0x0fff achieves this). + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII.get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0x0fff); + + return true; +} + +bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { + if (!SIInstrInfo::isWMMA(*MI)) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) { + if (!SIInstrInfo::isWMMA(I)) + return false; + + // Src0 or Src1 of the current wmma instruction overlaps with the dest of + // the previous wmma.
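+ // Hypothetical example of the overlap (register ranges invented for illustration): + // v_wmma_f32_16x16x16_f16 v[0:7], v[8:15], v[16:23], v[0:7] + // v_wmma_f32_16x16x16_f16 v[24:31], v[0:7], v[16:23], v[24:31] + // The second wmma reads v[0:7], the vdst of the first, so a V_NOP (or any + // other intervening VALU) is needed between them.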
+ const Register CurSrc0Reg = + TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg(); + const Register CurSrc1Reg = + TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg(); + + const Register PrevDstReg = + TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); + + if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) || + TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) { + return true; + } + + // Src2 of the current wmma instruction overlaps with the dest of the + // previous wmma. + const MachineOperand *Src2 = + TII->getNamedOperand(*MI, AMDGPU::OpName::src2); + const Register CurSrc2Reg = Src2->isReg() ? Src2->getReg() : Register(); + + if (CurSrc2Reg != AMDGPU::NoRegister && + TRI->regsOverlap(PrevDstReg, CurSrc2Reg)) { + + const MachineOperand *Src2Mods = + TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers); + const bool NoSrc2Mods = + (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0; + // Exception: there is no hazard if the wmma instructions are of the same + // type and there is no input modifier on src2 of the current instruction. + return !(NoSrc2Mods && (TII->pseudoToMCOpcode(I.getOpcode()) == + TII->pseudoToMCOpcode(MI->getOpcode()))); + } + + return false; + }; + + auto IsExpiredFn = [](const MachineInstr &I, int) { + return SIInstrInfo::isVALU(I); + }; + + if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == + std::numeric_limits<int>::max()) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); + + return true; +} + int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { int NSAtoVMEMWaitStates = 1; @@ -1223,6 +1801,36 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI); } +int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) { + // Early exit if no padding is requested. 
+ if (MFMAPaddingRatio == 0) + return 0; + + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2) + return 0; + + int NeighborMFMALatency = 0; + auto IsNeighboringMFMA = [&NeighborMFMALatency, + this](const MachineInstr &MI) { + if (!SIInstrInfo::isMFMA(MI)) + return false; + + NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI); + return true; + }; + + const int MaxMFMAPipelineWaitStates = 16; + int WaitStatesSinceNeighborMFMA = + getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates); + + int NeighborMFMAPaddingNeeded = + (NeighborMFMALatency * MFMAPaddingRatio / 100) - + WaitStatesSinceNeighborMFMA; + + return std::max(0, NeighborMFMAPaddingNeeded); +} + int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { int WaitStatesNeeded = 0; unsigned Opc = MI->getOpcode(); @@ -1257,12 +1865,6 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { } } - auto IsMFMAFn = [](const MachineInstr &MI) { - return SIInstrInfo::isMAI(MI) && - MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && - MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; - }; - for (const MachineOperand &Op : MI->explicit_operands()) { if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg())) continue; @@ -1282,9 +1884,9 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { Register Reg = Op.getReg(); unsigned HazardDefLatency = 0; - auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, + auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency, this](const MachineInstr &MI) { - if (!IsMFMAFn(MI)) + if (!SIInstrInfo::isMFMA(MI)) return false; Register DstReg = MI.getOperand(0).getReg(); if (DstReg == Reg) @@ -1361,9 +1963,9 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { Register DstReg = MI->getOperand(0).getReg(); unsigned HazardDefLatency = 0; - auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, + auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency, this](const MachineInstr &MI) { - if (!IsMFMAFn(MI)) + if (!SIInstrInfo::isMFMA(MI)) return false; Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg(); HazardDefLatency = @@ -1387,6 +1989,9 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } + // Pad neighboring MFMA with noops for better inter-wave performance. 
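+ // Worked example (numbers hypothetical): with -amdgpu-mfma-padding-ratio=50 + // and a neighboring MFMA of latency 16, checkMFMAPadding() requests + // 16 * 50 / 100 - WaitStatesSinceNeighborMFMA wait states, so an MFMA issued + // 3 wait states after its neighbor receives 5 states of s_nop padding.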
+ WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI)); + return WaitStatesNeeded; } @@ -1394,21 +1999,16 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { int WaitStatesNeeded = 0; unsigned Opc = MI->getOpcode(); - auto IsMFMAFn = [](const MachineInstr &MI) { - return SIInstrInfo::isMAI(MI) && - MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && - MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; - }; - - auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) { - return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI); + auto IsLegacyVALUFn = [](const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI); }; - auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) { - return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI); + auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) && + !SIInstrInfo::isDOT(MI); }; - if (!IsMFMAFn(*MI)) + if (!SIInstrInfo::isMFMA(*MI)) return WaitStatesNeeded; const int VALUWritesExecWaitStates = 4; @@ -1423,6 +2023,13 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { for (const MachineOperand &Use : MI->explicit_uses()) { const int LegacyVALUNotDotWritesVGPRWaitStates = 2; const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2; + const int GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates = 3; + const int GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates = 5; + const int GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates = 4; + const int GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates = 9; + const int GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates = 8; + const int GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates = 17; + const int GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates = 16; const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8; const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16; const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3; @@ -1433,9 +2040,18 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5; const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11; const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19; + const int GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates = 4; + const int GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates = 6; + const int GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates = 10; + const int GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates = 18; + const int GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates = 5; + const int GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates = 7; + const int GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates = 11; + const int GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates = 19; const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6; const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11; const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4; + const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2; const int MaxWaitStates = 19; if (!Use.isReg()) @@ -1444,9 +2060,9 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { bool FullReg; const MachineInstr *MI1; - auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &FullReg, &MI1, + auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1, this](const MachineInstr &MI) { - if (!IsMFMAFn(MI)) + if (!SIInstrInfo::isMFMA(MI)) return false; Register DstReg = 
MI.getOperand(0).getReg(); FullReg = (DstReg == Reg); @@ -1467,7 +2083,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { unsigned Opc1 = MI1->getOpcode(); int NeedWaitStates = 0; if (OpNo == SrcCIdx) { - if (!isDGEMM(Opc) && isDGEMM(Opc1)) { + if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) { NeedWaitStates = 0; } else if (FullReg) { if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || @@ -1475,6 +2091,9 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64)) NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates; + else if (ST.hasGFX940Insts() && + TSchedModel.computeInstrLatency(MI1) == 2) + NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates; } else { switch (Opc1) { case AMDGPU::V_MFMA_F64_16X16X4F64_e64: @@ -1490,22 +2109,42 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates; break; default: + if (ST.hasGFX940Insts() && isXDL(ST, *MI) && !isXDL(ST, *MI1)) + break; switch (TSchedModel.computeInstrLatency(MI1)) { case 2: - NeedWaitStates = isDGEMM(Opc) - ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates - : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MI1) + ? GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates + : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates + : isDGEMM(Opc) + ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates + : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates; + break; + case 4: + assert(ST.hasGFX940Insts()); + NeedWaitStates = isXDL(ST, *MI1) + ? GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates + : GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates; break; case 8: - NeedWaitStates = isDGEMM(Opc) - ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates - : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MI1) + ? GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates + : GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates + : isDGEMM(Opc) + ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates + : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates; break; case 16: LLVM_FALLTHROUGH; default: - NeedWaitStates = isDGEMM(Opc) - ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates - : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MI1) + ? GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates + : GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates + : isDGEMM(Opc) + ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates + : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates; } } } @@ -1524,14 +2163,32 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { default: switch (TSchedModel.computeInstrLatency(MI1)) { case 2: - NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MI1) + ? GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates + : GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates + : SMFMA4x4WritesVGPROverlappedSrcABWaitStates; + break; + case 4: + assert(ST.hasGFX940Insts()); + NeedWaitStates = isXDL(ST, *MI1) + ? GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates + : GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates; break; case 8: - NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MI1) + ? 
GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates + : GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates + : SMFMA16x16WritesVGPROverlappedSrcABWaitStates; break; case 16: LLVM_FALLTHROUGH; default: - NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MI1) + ? GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates + : GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates + : SMFMA32x32WritesVGPROverlappedSrcABWaitStates; } } } @@ -1599,18 +2256,12 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { if (!ST.hasGFX90AInsts()) return 0; - auto IsMFMAFn = [](const MachineInstr &MI) -> bool { - return SIInstrInfo::isMAI(MI) && - MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && - MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; - }; - auto IsDGEMMFn = [](const MachineInstr &MI) -> bool { return isDGEMM(MI.getOpcode()); }; // This is checked in checkMAIHazards90A() - if (IsMFMAFn(*MI)) + if (SIInstrInfo::isMFMA(*MI)) return 0; int WaitStatesNeeded = 0; @@ -1623,8 +2274,9 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { const MachineInstr *MFMA = nullptr; unsigned Reg; - auto IsMFMAWriteFn = [&Reg, &IsMFMAFn, &MFMA, this](const MachineInstr &MI) { - if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg)) + auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) { + if (!SIInstrInfo::isMFMA(MI) || + !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg)) return false; MFMA = &MI; return true; @@ -1646,6 +2298,14 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5; const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11; const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19; + const int GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates = 4; + const int GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates = 6; + const int GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates = 10; + const int GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates = 18; + const int GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates = 5; + const int GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates = 7; + const int GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates = 11; + const int GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates = 19; const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9; const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18; const int DMFMA4x4WriteVgprVALUReadWaitStates = 6; @@ -1685,16 +2345,30 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { int NeedWaitStates = MaxWaitStates; switch (HazardDefLatency) { case 2: - NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates; + NeedWaitStates = + ST.hasGFX940Insts() + ? isXDL(ST, *MFMA) + ? GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates + : GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates + : SMFMA4x4WriteVgprVALUMemExpReadWaitStates; break; case 4: - assert(isDGEMM(MFMA->getOpcode())); + assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts()); NeedWaitStates = - IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates - : DMFMA4x4WriteVgprVALUReadWaitStates; + isDGEMM(MFMA->getOpcode()) + ? IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates + : DMFMA4x4WriteVgprVALUReadWaitStates + : isXDL(ST, *MFMA) + ? GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates + : GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates; break; case 8: - NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates; + NeedWaitStates = + ST.hasGFX940Insts() + ? 
isXDL(ST, *MFMA) + ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates + : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates + : SMFMA16x16WriteVgprVALUMemExpReadWaitStates; break; case 16: LLVM_FALLTHROUGH; default: @@ -1702,7 +2376,11 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { isDGEMM(MFMA->getOpcode()) ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates : DMFMA16x16WriteVgprVALUReadWaitStates - : SMFMA32x32WriteVgprVALUMemExpReadWaitStates; + : ST.hasGFX940Insts() + ? isXDL(ST, *MFMA) + ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates + : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates + : SMFMA32x32WriteVgprVALUMemExpReadWaitStates; break; } @@ -1732,7 +2410,16 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { const int SMFMA4x4WriteVgprVALUWawWaitStates = 5; const int SMFMA16x16WriteVgprVALUWawWaitStates = 11; const int SMFMA32x32WriteVgprVALUWawWaitStates = 19; + const int GFX940_SMFMA2PassWriteVgprVALUWawWaitStates = 4; + const int GFX940_SMFMA4PassWriteVgprVALUWawWaitStates = 6; + const int GFX940_SMFMA8PassWriteVgprVALUWawWaitStates = 10; + const int GFX940_SMFMA16PassWriteVgprVALUWawWaitStates = 18; + const int GFX940_XDL2PassWriteVgprVALUWawWaitStates = 5; + const int GFX940_XDL4PassWriteVgprVALUWawWaitStates = 7; + const int GFX940_XDL8PassWriteVgprVALUWawWaitStates = 11; + const int GFX940_XDL16PassWriteVgprVALUWawWaitStates = 19; const int SMFMA4x4ReadVgprVALUWarWaitStates = 1; + const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3; const int SMFMA16x16ReadVgprVALUWarWaitStates = 7; const int SMFMA32x32ReadVgprVALUWarWaitStates = 15; const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6; @@ -1757,19 +2444,35 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { int NeedWaitStates = MaxWaitStates; switch (TSchedModel.computeInstrLatency(MFMA)) { case 2: - NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MFMA) + ? GFX940_XDL2PassWriteVgprVALUWawWaitStates + : GFX940_SMFMA2PassWriteVgprVALUWawWaitStates + : SMFMA4x4WriteVgprVALUWawWaitStates; break; case 4: - assert(isDGEMM(MFMA->getOpcode())); - NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates; + assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts()); + NeedWaitStates = isDGEMM(MFMA->getOpcode()) + ? DMFMA4x4WriteVgprVALUWriteWaitStates + : isXDL(ST, *MFMA) + ? GFX940_XDL4PassWriteVgprVALUWawWaitStates + : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates; break; case 8: - NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates; + NeedWaitStates = ST.hasGFX940Insts() + ? isXDL(ST, *MFMA) + ? GFX940_XDL8PassWriteVgprVALUWawWaitStates + : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates + : SMFMA16x16WriteVgprVALUWawWaitStates; break; case 16: LLVM_FALLTHROUGH; default: NeedWaitStates = isDGEMM(MFMA->getOpcode()) ? DMFMA16x16WriteVgprVALUWriteWaitStates + : ST.hasGFX940Insts() + ? isXDL(ST, *MFMA) + ? 
GFX940_XDL16PassWriteVgprVALUWawWaitStates + : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates : SMFMA32x32WriteVgprVALUWawWaitStates; break; } @@ -1781,12 +2484,14 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { break; } - auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA, - this](const MachineInstr &MI) { - if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) || + auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) { + if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) || !MI.readsRegister(Reg, &TRI)) return false; + if (ST.hasGFX940Insts() && !isXDL(ST, MI)) + return false; + const MachineOperand *SrcC = TII.getNamedOperand(MI, AMDGPU::OpName::src2); assert(SrcC); @@ -1808,6 +2513,9 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { switch (HazardDefLatency) { case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates; break; + case 4: assert(ST.hasGFX940Insts()); + NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates; + break; case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates; break; case 16: LLVM_FALLTHROUGH; @@ -1827,11 +2535,10 @@ bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { return false; const MachineInstr *MAI = nullptr; + auto IsMFMAFn = [&MAI](const MachineInstr &MI) { MAI = nullptr; - if (SIInstrInfo::isMAI(MI) && - MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && - MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64) + if (SIInstrInfo::isMFMA(MI)) MAI = &MI; return MAI != nullptr; }; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 716bc027a894..57f5a04c6eda 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -62,6 +62,10 @@ private: void addClauseInst(const MachineInstr &MI); + /// \returns the number of wait states before another MFMA instruction can be + /// issued after \p MI. + unsigned getMFMAPipelineWaitStates(const MachineInstr &MI) const; + // Advance over a MachineInstr bundle. Look for hazards in the bundled // instructions. void processBundle(); @@ -92,10 +96,31 @@ private: bool fixSMEMtoVectorWriteHazards(MachineInstr *MI); bool fixVcmpxExecWARHazard(MachineInstr *MI); bool fixLdsBranchVmemWARHazard(MachineInstr *MI); + bool fixLdsDirectVALUHazard(MachineInstr *MI); + bool fixLdsDirectVMEMHazard(MachineInstr *MI); + bool fixVALUPartialForwardingHazard(MachineInstr *MI); + bool fixVALUTransUseHazard(MachineInstr *MI); + bool fixWMMAHazards(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); int checkMAIHazards908(MachineInstr *MI); int checkMAIHazards90A(MachineInstr *MI); + /// Pad the latency between neighboring MFMA instructions with s_nops. The + /// percentage of wait states to fill with s_nops is specified by the command + /// line option '-amdgpu-mfma-padding-ratio'. + /// + /// For example, with '-amdgpu-mfma-padding-ratio=100': + /// + /// 2 pass MFMA instructions have a latency of 2 wait states. Therefore, a + /// 'S_NOP 1' will be added between sequential MFMA instructions. 
+ /// + /// V_MFMA_F32_4X4X1F32 + /// V_MFMA_F32_4X4X1F32 + ///--> + /// V_MFMA_F32_4X4X1F32 + /// S_NOP 1 + /// V_MFMA_F32_4X4X1F32 + int checkMFMAPadding(MachineInstr *MI); int checkMAIVALUHazards(MachineInstr *MI); int checkMAILdStHazards(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 9f98f9ada802..6f82148854c4 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -1,4 +1,4 @@ -//===-- GCNNSAReassign.cpp - Reassign registers in NSA unstructions -------===// +//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,7 +8,7 @@ // /// \file /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential -/// in NSA image instructions. Later SIShrinkInstructions pass will relace NSA +/// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA /// with sequential versions where possible. /// //===----------------------------------------------------------------------===// @@ -16,10 +16,12 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/VirtRegMap.h" #include "llvm/InitializePasses.h" using namespace llvm; @@ -159,15 +161,23 @@ GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const { GCNNSAReassign::NSA_Status GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); - if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) + if (!Info) return NSA_Status::NOT_NSA; + switch (Info->MIMGEncoding) { + case AMDGPU::MIMGEncGfx10NSA: + case AMDGPU::MIMGEncGfx11NSA: + break; + default: + return NSA_Status::NOT_NSA; + } + int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); unsigned VgprBase = 0; bool NSA = false; - for (unsigned I = 0; I < Info->VAddrDwords; ++I) { + for (unsigned I = 0; I < Info->VAddrOperands; ++I) { const MachineOperand &Op = MI.getOperand(VAddr0Idx + I); Register Reg = Op.getReg(); if (Reg.isPhysical() || !VRM->isAssignedReg(Reg)) @@ -179,15 +189,16 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { if (!PhysReg) return NSA_Status::FIXED; + // TODO: address the below limitation to handle GFX11 BVH instructions // Bail if address is not a VGPR32. That should be possible to extend the // optimization to work with subregs of a wider register tuples, but the // logic to find free registers will be much more complicated with much // less chances for success. That seems reasonable to assume that in most // cases a tuple is used because a vector variable contains different - // parts of an address and it is either already consequitive or cannot + // parts of an address and it is either already consecutive or cannot // be reassigned if not. If needed it is better to rely on register // coalescer to process such address tuples. 
- if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg()) + if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 || Op.getSubReg()) return NSA_Status::FIXED; // InlineSpiller does not call LRM::assign() after an LI split leaving @@ -278,7 +289,7 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { SmallVector<LiveInterval *, 16> Intervals; SmallVector<MCRegister, 16> OrigRegs; SlotIndex MinInd, MaxInd; - for (unsigned I = 0; I < Info->VAddrDwords; ++I) { + for (unsigned I = 0; I < Info->VAddrOperands; ++I) { const MachineOperand &Op = MI->getOperand(VAddr0Idx + I); Register Reg = Op.getReg(); LiveInterval *LI = &LIS->getInterval(Reg); @@ -331,11 +342,11 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { } if (!Success) { - for (unsigned I = 0; I < Info->VAddrDwords; ++I) + for (unsigned I = 0; I < Info->VAddrOperands; ++I) if (VRM->hasPhys(Intervals[I]->reg())) LRM->unassign(*Intervals[I]); - for (unsigned I = 0; I < Info->VAddrDwords; ++I) + for (unsigned I = 0; I < Info->VAddrOperands; ++I) LRM->assign(*Intervals[I], OrigRegs[I]); continue; diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index 3a68ed1934e1..281474994bca 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -192,6 +192,10 @@ def : ProcessorModel<"gfx90c", SIQuarterSpeedModel, FeatureISAVersion9_0_C.Features >; +def : ProcessorModel<"gfx940", SIDPGFX940FullSpeedModel, + FeatureISAVersion9_4_0.Features +>; + //===----------------------------------------------------------------------===// // GCN GFX10. //===----------------------------------------------------------------------===// @@ -235,3 +239,27 @@ def : ProcessorModel<"gfx1034", GFX10SpeedModel, def : ProcessorModel<"gfx1035", GFX10SpeedModel, FeatureISAVersion10_3_0.Features >; + +def : ProcessorModel<"gfx1036", GFX10SpeedModel, + FeatureISAVersion10_3_0.Features +>; + +//===----------------------------------------------------------------------===// +// GCN GFX11. +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx1100", GFX11SpeedModel, + FeatureISAVersion11_0.Features +>; + +def : ProcessorModel<"gfx1101", GFX11SpeedModel, + FeatureISAVersion11_0.Features +>; + +def : ProcessorModel<"gfx1102", GFX11SpeedModel, + FeatureISAVersion11_0_2.Features +>; + +def : ProcessorModel<"gfx1103", GFX11SpeedModel, + FeatureISAVersion11_0_2.Features +>; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 257561cb8430..c41548d19c8e 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -10,7 +10,7 @@ /// This file defines the GCNRegPressure class, which tracks register pressure /// by bookkeeping number of SGPR/VGPRs used, weights for large SGPR/VGPRs. It /// also implements a compare function, which compares different register -/// pressures, and declares one with max occupance as winner. +/// pressures, and declares one with max occupancy as winner.
/// //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 75855a7a4f9c..100410bb7644 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -13,6 +13,7 @@ #include "GCNSchedStrategy.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #define DEBUG_TYPE "machine-scheduler" @@ -362,6 +363,9 @@ void GCNScheduleDAGMILive::schedule() { if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { Pressure[RegionIdx] = PressureAfter; + RegionsWithMinOcc[RegionIdx] = + PressureAfter.getOccupancy(ST) == MinOccupancy; + LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); return; } @@ -378,6 +382,7 @@ void GCNScheduleDAGMILive::schedule() { // occupancy before was higher, or if the current schedule has register // pressure higher than the excess limits which could lead to more spilling. unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); + // Allow memory bound functions to drop to 4 waves if not limited by an // attribute. if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy && @@ -390,6 +395,7 @@ if (NewOccupancy < MinOccupancy) { MinOccupancy = NewOccupancy; MFI.limitOccupancy(MinOccupancy); + RegionsWithMinOcc.reset(); LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " << MinOccupancy << ".\n"); } @@ -416,6 +422,8 @@ PressureAfter.less(ST, PressureBefore) || !RescheduleRegions[RegionIdx]) { Pressure[RegionIdx] = PressureAfter; + RegionsWithMinOcc[RegionIdx] = + PressureAfter.getOccupancy(ST) == MinOccupancy; if (!RegionsWithClusters[RegionIdx] && (Stage + 1) == UnclusteredReschedule) RescheduleRegions[RegionIdx] = false; @@ -425,13 +433,18 @@ } } + RegionsWithMinOcc[RegionIdx] = + PressureBefore.getOccupancy(ST) == MinOccupancy; LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] || (Stage + 1) != UnclusteredReschedule; RegionEnd = RegionBegin; + int SkippedDebugInstr = 0; for (MachineInstr *MI : Unsched) { - if (MI->isDebugInstr()) + if (MI->isDebugInstr()) { + ++SkippedDebugInstr; continue; + } if (MI->getIterator() != RegionEnd) { BB->remove(MI); @@ -459,10 +472,31 @@ ++RegionEnd; LLVM_DEBUG(dbgs() << "Scheduling " << *MI); } + + // After reverting the schedule, debug instrs will now be at the end of the block + // and RegionEnd will point to the first debug instr. Increment RegionEnd + // past debug instrs to the actual end of the scheduling region. + while (SkippedDebugInstr-- > 0) + ++RegionEnd; + + // If the Unsched.front() instruction is a debug instruction, this will actually + // shrink the region since we moved all debug instructions to the end of the + // block. Find the first instruction that is not a debug instruction. RegionBegin = Unsched.front()->getIterator(); - Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); + if (RegionBegin->isDebugInstr()) { + for (MachineInstr *MI : Unsched) { + if (MI->isDebugInstr()) + continue; + RegionBegin = MI->getIterator(); + break; + } + } + // Then move the debug instructions back into their correct place and set + // RegionBegin and RegionEnd if needed.
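+ // Hypothetical illustration: if the region was originally + // DBG_VALUE, I0, I1 + // the revert loop above rebuilds the block as I0, I1, DBG_VALUE, so + // RegionBegin is retargeted to I0 and placeDebugValues() below returns the + // DBG_VALUE to its original position.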
placeDebugValues(); + + Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); } GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const { @@ -493,14 +527,14 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { auto I = MBB->begin(); auto LiveInIt = MBBLiveIns.find(MBB); + auto &Rgn = Regions[CurRegion]; + auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); if (LiveInIt != MBBLiveIns.end()) { auto LiveIn = std::move(LiveInIt->second); RPTracker.reset(*MBB->begin(), &LiveIn); MBBLiveIns.erase(LiveInIt); } else { - auto &Rgn = Regions[CurRegion]; I = Rgn.first; - auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); auto LRS = BBLiveInMap.lookup(NonDbgMI); #ifdef EXPENSIVE_CHECKS assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); @@ -511,7 +545,7 @@ for ( ; ; ) { I = RPTracker.getNext(); - if (Regions[CurRegion].first == I) { + if (Regions[CurRegion].first == I || NonDbgMI == I) { LiveIns[CurRegion] = RPTracker.getLiveRegs(); RPTracker.clearMaxPressure(); } @@ -561,9 +595,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() { RescheduleRegions.resize(Regions.size()); RegionsWithClusters.resize(Regions.size()); RegionsWithHighRP.resize(Regions.size()); + RegionsWithMinOcc.resize(Regions.size()); RescheduleRegions.set(); RegionsWithClusters.reset(); RegionsWithHighRP.reset(); + RegionsWithMinOcc.reset(); if (!Regions.empty()) BBLiveInMap = getBBLiveInMap(); @@ -600,13 +636,41 @@ << "Retrying function scheduling with lowest recorded occupancy " << MinOccupancy << ".\n"); } + + if (Stage == PreRARematerialize) { + if (RegionsWithMinOcc.none() || Regions.size() == 1) + break; + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + // Check maximum occupancy + if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) == + MinOccupancy) + break; + + // FIXME: This pass will invalidate cached MBBLiveIns for regions + // in between the defs and the region we sink the def to. Cached pressure + // for regions where a def is sunk from will also be invalidated. Will + // need to be fixed if there is another pass after this pass. + static_assert(LastStage == PreRARematerialize, + "Passes after PreRARematerialize are not supported"); + + collectRematerializableInstructions(); + if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII)) + break; + + LLVM_DEBUG( + dbgs() << "Retrying function scheduling with improved occupancy of " + << MinOccupancy << " from rematerializing\n"); + } } if (Stage == UnclusteredReschedule) SavedMutations.swap(Mutations); for (auto Region : Regions) { - if ((Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) || + if (((Stage == UnclusteredReschedule || Stage == PreRARematerialize) && + !RescheduleRegions[RegionIdx]) || (Stage == ClusteredLowOccupancyReschedule && !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) { @@ -631,6 +695,7 @@ // Skip empty scheduling regions (0 or 1 schedulable instructions).
if (begin() == end() || begin() == std::prev(end())) { exitRegion(); + ++RegionIdx; continue; } @@ -653,3 +718,282 @@ SavedMutations.swap(Mutations); } while (Stage != LastStage); } + +void GCNScheduleDAGMILive::collectRematerializableInstructions() { + const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + Register Reg = Register::index2VirtReg(I); + if (!LIS->hasInterval(Reg)) + continue; + + // TODO: Handle AGPR and SGPR rematerialization + if (!SRI->isVGPRClass(MRI.getRegClass(Reg)) || !MRI.hasOneDef(Reg) || + !MRI.hasOneNonDBGUse(Reg)) + continue; + + MachineOperand *Op = MRI.getOneDef(Reg); + MachineInstr *Def = Op->getParent(); + if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def, AA)) + continue; + + MachineInstr *UseI = &*MRI.use_instr_nodbg_begin(Reg); + if (Def->getParent() == UseI->getParent()) + continue; + + // We are only collecting defs that are defined in another block and are + // live-through or used inside regions at MinOccupancy. This means that the + // register must be in the live-in set for the region. + bool AddedToRematList = false; + for (unsigned I = 0, E = Regions.size(); I != E; ++I) { + auto It = LiveIns[I].find(Reg); + if (It != LiveIns[I].end() && !It->second.none()) { + if (RegionsWithMinOcc[I]) { + RematerializableInsts[I][Def] = UseI; + AddedToRematList = true; + } + + // Collect regions with rematerializable reg as live-in to avoid + // searching later when updating RP. + RematDefToLiveInRegions[Def].push_back(I); + } + } + if (!AddedToRematList) + RematDefToLiveInRegions.erase(Def); + } +} + +bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST, + const TargetInstrInfo *TII) { + // Temporary copies of cached variables we will be modifying and replacing if + // sinking succeeds. + SmallVector< + std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32> + NewRegions; + DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns; + DenseMap<unsigned, GCNRegPressure> NewPressure; + BitVector NewRescheduleRegions; + + NewRegions.resize(Regions.size()); + NewRescheduleRegions.resize(Regions.size()); + + // Collect only regions that have a rematerializable def as a live-in. + SmallSet<unsigned, 16> ImpactedRegions; + for (const auto &It : RematDefToLiveInRegions) + ImpactedRegions.insert(It.second.begin(), It.second.end()); + + // Make copies of register pressure and live-ins cache that will be updated + // as we rematerialize. + for (auto Idx : ImpactedRegions) { + NewPressure[Idx] = Pressure[Idx]; + NewLiveIns[Idx] = LiveIns[Idx]; + } + NewRegions = Regions; + NewRescheduleRegions.reset(); + + DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef; + bool Improved = false; + for (auto I : ImpactedRegions) { + if (!RegionsWithMinOcc[I]) + continue; + + Improved = false; + int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts()); + int SGPRUsage = NewPressure[I].getSGPRNum(); + + // TODO: Handle occupancy drop due to AGPR and SGPR. + // Check if the cause of the occupancy drop is VGPR usage and not SGPR. + if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy) + break; + + // The occupancy of this region could have been improved by a previous + // iteration's sinking of defs.
+ if (NewPressure[I].getOccupancy(ST) > MinOccupancy) { + NewRescheduleRegions[I] = true; + Improved = true; + continue; + } + + // First check if we have enough trivially rematerializable instructions to + // improve occupancy. Optimistically assume all instructions we are able to + // sink decreased RP. + int TotalSinkableRegs = 0; + for (const auto &It : RematerializableInsts[I]) { + MachineInstr *Def = It.first; + Register DefReg = Def->getOperand(0).getReg(); + TotalSinkableRegs += + SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]); + } + int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs; + unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink); + // If in the most optimistic scenario, we cannot improve occupancy, then do + // not attempt to sink any instructions. + if (OptimisticOccupancy <= MinOccupancy) + break; + + unsigned ImproveOccupancy = 0; + SmallVector<MachineInstr *, 4> SinkedDefs; + for (auto &It : RematerializableInsts[I]) { + MachineInstr *Def = It.first; + MachineBasicBlock::iterator InsertPos = + MachineBasicBlock::iterator(It.second); + Register Reg = Def->getOperand(0).getReg(); + // Rematerialize MI to its use block. Since we are only rematerializing + // instructions that do not have any virtual reg uses, we do not need to + // call LiveRangeEdit::allUsesAvailableAt() and + // LiveRangeEdit::canRematerializeAt(). + TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, + Def->getOperand(0).getSubReg(), *Def, *TRI); + MachineInstr *NewMI = &*(--InsertPos); + LIS->InsertMachineInstrInMaps(*NewMI); + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + InsertedMIToOldDef[NewMI] = Def; + + // Update region boundaries in the scheduling region we sunk from since we + // may sink an instruction that was at the beginning or end of its region + updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr, + /*Removing =*/true); + + // Update region boundaries in the region we sunk to. + updateRegionBoundaries(NewRegions, InsertPos, NewMI); + + LaneBitmask PrevMask = NewLiveIns[I][Reg]; + // FIXME: Also update cached pressure for where the def was sunk from. + // Update RP for all regions that have this reg as a live-in and remove + // the reg from all regions as a live-in. + for (auto Idx : RematDefToLiveInRegions[Def]) { + NewLiveIns[Idx].erase(Reg); + if (InsertPos->getParent() != Regions[Idx].first->getParent()) { + // Def is live-through and not used in this block. + NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), MRI); + } else { + // Def is used and rematerialized into this block. + GCNDownwardRPTracker RPT(*LIS); + auto *NonDbgMI = &*skipDebugInstructionsForward( + NewRegions[Idx].first, NewRegions[Idx].second); + RPT.reset(*NonDbgMI, &NewLiveIns[Idx]); + RPT.advance(NewRegions[Idx].second); + NewPressure[Idx] = RPT.moveMaxPressure(); + } + } + + SinkedDefs.push_back(Def); + ImproveOccupancy = NewPressure[I].getOccupancy(ST); + if (ImproveOccupancy > MinOccupancy) + break; + } + + // Remove defs we just sunk from all regions' list of sinkable defs + for (auto &Def : SinkedDefs) + for (auto TrackedIdx : RematDefToLiveInRegions[Def]) + RematerializableInsts[TrackedIdx].erase(Def); + + if (ImproveOccupancy <= MinOccupancy) + break; + + NewRescheduleRegions[I] = true; + Improved = true; + } + + if (!Improved) { + // Occupancy was not improved for all regions that were at MinOccupancy. + // Undo sinking and remove newly rematerialized instructions.
+ for (auto &Entry : InsertedMIToOldDef) { + MachineInstr *MI = Entry.first; + MachineInstr *OldMI = Entry.second; + Register Reg = MI->getOperand(0).getReg(); + LIS->RemoveMachineInstrFromMaps(*MI); + MI->eraseFromParent(); + OldMI->clearRegisterDeads(Reg); + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + } + return false; + } + + // Occupancy was improved for all regions. + for (auto &Entry : InsertedMIToOldDef) { + MachineInstr *MI = Entry.first; + MachineInstr *OldMI = Entry.second; + + // Remove OldMI from BBLiveInMap since we are sinking it from its MBB. + BBLiveInMap.erase(OldMI); + + // Remove OldMI and update LIS + Register Reg = MI->getOperand(0).getReg(); + LIS->RemoveMachineInstrFromMaps(*OldMI); + OldMI->eraseFromParent(); + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + } + + // Update live-ins, register pressure, and regions caches. + for (auto Idx : ImpactedRegions) { + LiveIns[Idx] = NewLiveIns[Idx]; + Pressure[Idx] = NewPressure[Idx]; + MBBLiveIns.erase(Regions[Idx].first->getParent()); + } + Regions = NewRegions; + RescheduleRegions = NewRescheduleRegions; + + SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + MFI.increaseOccupancy(MF, ++MinOccupancy); + + return true; +} + +// Copied from MachineLICM +bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI, + AAResults *AA) { + if (!TII->isTriviallyReMaterializable(MI, AA)) + return false; + + for (const MachineOperand &MO : MI.operands()) + if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual()) + return false; + + return true; +} + +// When removing, we will have to check both the beginning and the ending of +// the region. When inserting, we will only have to check if we are inserting +// NewMI in front of a scheduling region and do not need to check the ending +// since we will only ever be inserting before an already existing MI. +void GCNScheduleDAGMILive::updateRegionBoundaries( + SmallVectorImpl<std::pair<MachineBasicBlock::iterator, + MachineBasicBlock::iterator>> &RegionBoundaries, + MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) { + unsigned I = 0, E = RegionBoundaries.size(); + // Search for the first region of the block where MI is located + while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent()) + ++I; + + for (; I != E; ++I) { + if (MI->getParent() != RegionBoundaries[I].first->getParent()) + return; + + if (Removing && MI == RegionBoundaries[I].first && + MI == RegionBoundaries[I].second) { + // MI is in a region with size 1, after removing, the region will be + // size 0, set RegionBegin and RegionEnd to the block's past-the-end iterator.
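+ // Hypothetical illustration: removing the only instruction of a + // single-instruction region [MI, MI] yields (end(), end()), which later + // stages treat as an empty region.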
+ RegionBoundaries[I] = + std::make_pair(MI->getParent()->end(), MI->getParent()->end()); + return; + } + if (MI == RegionBoundaries[I].first) { + if (Removing) + RegionBoundaries[I] = + std::make_pair(std::next(MI), RegionBoundaries[I].second); + else + // Inserted NewMI in front of region, set new RegionBegin to NewMI + RegionBoundaries[I] = std::make_pair(MachineBasicBlock::iterator(NewMI), + RegionBoundaries[I].second); + return; + } + if (Removing && MI == RegionBoundaries[I].second) { + RegionBoundaries[I] = + std::make_pair(RegionBoundaries[I].first, std::prev(MI)); + return; + } + } +} diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index a6e42ad3dfca..97f94f69b70e 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -14,6 +14,7 @@ #define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H #include "GCNRegPressure.h" +#include "llvm/ADT/MapVector.h" #include "llvm/CodeGen/MachineScheduler.h" namespace llvm { @@ -77,7 +78,8 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { InitialSchedule, UnclusteredReschedule, ClusteredLowOccupancyReschedule, - LastStage = ClusteredLowOccupancyReschedule + PreRARematerialize, + LastStage = PreRARematerialize }; const GCNSubtarget &ST; @@ -110,24 +112,56 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // Record regions with high register pressure. BitVector RegionsWithHighRP; + // Regions that have the same occupancy as the latest MinOccupancy + BitVector RegionsWithMinOcc; + // Region live-in cache. SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns; // Region pressure cache. SmallVector<GCNRegPressure, 32> Pressure; + // Each region at MinOccupancy will have its own list of trivially + // rematerializable instructions we can remat to reduce RP. The list maps an + // instruction to the position we should remat before, usually the MI using + // the rematerializable instruction. + MapVector<unsigned, MapVector<MachineInstr *, MachineInstr *>> + RematerializableInsts; + + // Map a trivially rematerializable def to a list of regions at MinOccupancy + // that have the defined reg as a live-in. + DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions; + + // Temporary basic block live-in cache. DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns; DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap; DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const; + // Collect all trivially rematerializable VGPR instructions with a single def + // and single use outside the defining block into RematerializableInsts. + void collectRematerializableInstructions(); + + bool isTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA); + + // TODO: Should also attempt to reduce RP of SGPRs and AGPRs + // Attempt to reduce RP of VGPR by sinking trivially rematerializable + // instructions. Returns true if we were able to sink instruction(s). + bool sinkTriviallyRematInsts(const GCNSubtarget &ST, + const TargetInstrInfo *TII); + // Return current region pressure. GCNRegPressure getRealRegPressure() const; // Compute and cache live-ins and pressure for all regions in block. void computeBlockPressure(const MachineBasicBlock *MBB); + // Update region boundaries when removing MI or inserting NewMI before MI.
+ void updateRegionBoundaries( + SmallVectorImpl<std::pair<MachineBasicBlock::iterator, + MachineBasicBlock::iterator>> &RegionBoundaries, + MachineBasicBlock::iterator MI, MachineInstr *NewMI, + bool Removing = false); public: GCNScheduleDAGMILive(MachineSchedContext *C, diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 0cd2cfa2f0e7..d269d0945f3b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -58,133 +58,142 @@ protected: // Basic subtarget description. Triple TargetTriple; AMDGPU::IsaInfo::AMDGPUTargetID TargetID; - unsigned Gen; + unsigned Gen = INVALID; InstrItineraryData InstrItins; - int LDSBankCount; - unsigned MaxPrivateElementSize; + int LDSBankCount = 0; + unsigned MaxPrivateElementSize = 0; // Possibly statically set by tablegen, but may want to be overridden. - bool FastFMAF32; - bool FastDenormalF32; - bool HalfRate64Ops; - bool FullRate64Ops; + bool FastFMAF32 = false; + bool FastDenormalF32 = false; + bool HalfRate64Ops = false; + bool FullRate64Ops = false; // Dynamically set bits that enable features. - bool FlatForGlobal; - bool AutoWaitcntBeforeBarrier; - bool UnalignedScratchAccess; - bool UnalignedAccessMode; - bool HasApertureRegs; - bool SupportsXNACK; + bool FlatForGlobal = false; + bool AutoWaitcntBeforeBarrier = false; + bool UnalignedScratchAccess = false; + bool UnalignedAccessMode = false; + bool HasApertureRegs = false; + bool SupportsXNACK = false; // This should not be used directly. 'TargetID' tracks the dynamic settings // for XNACK. - bool EnableXNACK; + bool EnableXNACK = false; - bool EnableTgSplit; - bool EnableCuMode; - bool TrapHandler; + bool EnableTgSplit = false; + bool EnableCuMode = false; + bool TrapHandler = false; // Used as options. 
- bool EnableLoadStoreOpt; - bool EnableUnsafeDSOffsetFolding; - bool EnableSIScheduler; - bool EnableDS128; - bool EnablePRTStrictNull; - bool DumpCode; + bool EnableLoadStoreOpt = false; + bool EnableUnsafeDSOffsetFolding = false; + bool EnableSIScheduler = false; + bool EnableDS128 = false; + bool EnablePRTStrictNull = false; + bool DumpCode = false; // Subtarget statically properties set by tablegen - bool FP64; - bool FMA; - bool MIMG_R128; - bool CIInsts; - bool GFX8Insts; - bool GFX9Insts; - bool GFX90AInsts; - bool GFX10Insts; - bool GFX10_3Insts; - bool GFX7GFX8GFX9Insts; - bool SGPRInitBug; - bool NegativeScratchOffsetBug; - bool NegativeUnalignedScratchOffsetBug; - bool HasSMemRealTime; - bool HasIntClamp; - bool HasFmaMixInsts; - bool HasMovrel; - bool HasVGPRIndexMode; - bool HasScalarStores; - bool HasScalarAtomics; - bool HasSDWAOmod; - bool HasSDWAScalar; - bool HasSDWASdst; - bool HasSDWAMac; - bool HasSDWAOutModsVOPC; - bool HasDPP; - bool HasDPP8; - bool Has64BitDPP; - bool HasPackedFP32Ops; - bool HasExtendedImageInsts; - bool HasR128A16; - bool HasGFX10A16; - bool HasG16; - bool HasNSAEncoding; - unsigned NSAMaxSize; - bool GFX10_AEncoding; - bool GFX10_BEncoding; - bool HasDLInsts; - bool HasDot1Insts; - bool HasDot2Insts; - bool HasDot3Insts; - bool HasDot4Insts; - bool HasDot5Insts; - bool HasDot6Insts; - bool HasDot7Insts; - bool HasMAIInsts; - bool HasPkFmacF16Inst; - bool HasAtomicFaddInsts; - bool SupportsSRAMECC; + bool FP64 = false; + bool FMA = false; + bool MIMG_R128 = false; + bool CIInsts = false; + bool GFX8Insts = false; + bool GFX9Insts = false; + bool GFX90AInsts = false; + bool GFX940Insts = false; + bool GFX10Insts = false; + bool GFX11Insts = false; + bool GFX10_3Insts = false; + bool GFX7GFX8GFX9Insts = false; + bool SGPRInitBug = false; + bool UserSGPRInit16Bug = false; + bool NegativeScratchOffsetBug = false; + bool NegativeUnalignedScratchOffsetBug = false; + bool HasSMemRealTime = false; + bool HasIntClamp = false; + bool HasFmaMixInsts = false; + bool HasMovrel = false; + bool HasVGPRIndexMode = false; + bool HasScalarStores = false; + bool HasScalarAtomics = false; + bool HasSDWAOmod = false; + bool HasSDWAScalar = false; + bool HasSDWASdst = false; + bool HasSDWAMac = false; + bool HasSDWAOutModsVOPC = false; + bool HasDPP = false; + bool HasDPP8 = false; + bool Has64BitDPP = false; + bool HasPackedFP32Ops = false; + bool HasImageInsts = false; + bool HasExtendedImageInsts = false; + bool HasR128A16 = false; + bool HasGFX10A16 = false; + bool HasG16 = false; + bool HasNSAEncoding = false; + unsigned NSAMaxSize = 0; + bool GFX10_AEncoding = false; + bool GFX10_BEncoding = false; + bool HasDLInsts = false; + bool HasDot1Insts = false; + bool HasDot2Insts = false; + bool HasDot3Insts = false; + bool HasDot4Insts = false; + bool HasDot5Insts = false; + bool HasDot6Insts = false; + bool HasDot7Insts = false; + bool HasDot8Insts = false; + bool HasMAIInsts = false; + bool HasPkFmacF16Inst = false; + bool HasAtomicFaddRtnInsts = false; + bool HasAtomicFaddNoRtnInsts = false; + bool HasAtomicPkFaddNoRtnInsts = false; + bool SupportsSRAMECC = false; // This should not be used directly. 'TargetID' tracks the dynamic settings // for SRAMECC. 
- bool EnableSRAMECC; + bool EnableSRAMECC = false; - bool HasNoSdstCMPX; - bool HasVscnt; - bool HasGetWaveIdInst; - bool HasSMemTimeInst; - bool HasShaderCyclesRegister; - bool HasVOP3Literal; - bool HasNoDataDepHazard; - bool FlatAddressSpace; - bool FlatInstOffsets; - bool FlatGlobalInsts; - bool FlatScratchInsts; - bool ScalarFlatScratchInsts; - bool HasArchitectedFlatScratch; - bool AddNoCarryInsts; - bool HasUnpackedD16VMem; - bool LDSMisalignedBug; - bool HasMFMAInlineLiteralBug; - bool UnalignedBufferAccess; - bool UnalignedDSAccess; - bool HasPackedTID; - bool ScalarizeGlobal; + bool HasNoSdstCMPX = false; + bool HasVscnt = false; + bool HasGetWaveIdInst = false; + bool HasSMemTimeInst = false; + bool HasShaderCyclesRegister = false; + bool HasVOP3Literal = false; + bool HasNoDataDepHazard = false; + bool FlatAddressSpace = false; + bool FlatInstOffsets = false; + bool FlatGlobalInsts = false; + bool FlatScratchInsts = false; + bool ScalarFlatScratchInsts = false; + bool HasArchitectedFlatScratch = false; + bool EnableFlatScratch = false; + bool AddNoCarryInsts = false; + bool HasUnpackedD16VMem = false; + bool LDSMisalignedBug = false; + bool HasMFMAInlineLiteralBug = false; + bool UnalignedBufferAccess = false; + bool UnalignedDSAccess = false; + bool HasPackedTID = false; + bool ScalarizeGlobal = false; - bool HasVcmpxPermlaneHazard; - bool HasVMEMtoScalarWriteHazard; - bool HasSMEMtoVectorWriteHazard; - bool HasInstFwdPrefetchBug; - bool HasVcmpxExecWARHazard; - bool HasLdsBranchVmemWARHazard; - bool HasNSAtoVMEMBug; - bool HasNSAClauseBug; - bool HasOffset3fBug; - bool HasFlatSegmentOffsetBug; - bool HasImageStoreD16Bug; - bool HasImageGather4D16Bug; + bool HasVcmpxPermlaneHazard = false; + bool HasVMEMtoScalarWriteHazard = false; + bool HasSMEMtoVectorWriteHazard = false; + bool HasInstFwdPrefetchBug = false; + bool HasVcmpxExecWARHazard = false; + bool HasLdsBranchVmemWARHazard = false; + bool HasNSAtoVMEMBug = false; + bool HasNSAClauseBug = false; + bool HasOffset3fBug = false; + bool HasFlatSegmentOffsetBug = false; + bool HasImageStoreD16Bug = false; + bool HasImageGather4D16Bug = false; + bool HasVOPDInsts = false; // Dummy feature to use for assembler in tablegen. - bool FeatureDisable; + bool FeatureDisable = false; SelectionDAGTargetInfo TSInfo; private: @@ -193,9 +202,6 @@ private: SIFrameLowering FrameLowering; public: - // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. - static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); - GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM); ~GCNSubtarget() override; @@ -258,9 +264,19 @@ public: return (Generation)Gen; } + unsigned getMaxWaveScratchSize() const { + // See COMPUTE_TMPRING_SIZE.WAVESIZE. + if (getGeneration() < GFX11) { + // 13-bit field in units of 256-dword. + return (256 * 4) * ((1 << 13) - 1); + } + // 15-bit field in units of 64-dword. + return (64 * 4) * ((1 << 15) - 1); + } + /// Return the number of high bits known to be zero for a frame index. unsigned getKnownHighZeroBitsForFrameIndex() const { - return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); + return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); } int getLDSBankCount() const { @@ -558,13 +574,20 @@ public: // The ST addressing mode means no registers are used, either VGPR or SGPR, // but only immediate offset is swizzled and added to the FLAT scratch base. 
bool hasFlatScratchSTMode() const { - return hasFlatScratchInsts() && hasGFX10_3Insts(); + return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts()); } + bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; } + bool hasScalarFlatScratchInsts() const { return ScalarFlatScratchInsts; } + bool enableFlatScratch() const { + return flatScratchIsArchitected() || + (EnableFlatScratch && hasFlatScratchInsts()); + } + bool hasGlobalAddTidInsts() const { return GFX10_BEncoding; } @@ -690,6 +713,10 @@ public: return HasDot7Insts; } + bool hasDot8Insts() const { + return HasDot8Insts; + } + bool hasMAIInsts() const { return HasMAIInsts; } @@ -699,9 +726,15 @@ public: } bool hasAtomicFaddInsts() const { - return HasAtomicFaddInsts; + return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; } + bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } + + bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } + + bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; } + bool hasNoSdstCMPX() const { return HasNoSdstCMPX; } @@ -765,8 +798,6 @@ public: return true; } - bool enableFlatScratch() const; - void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; @@ -805,6 +836,9 @@ public: /// \returns true if the subtarget has the v_permlanex16_b32 instruction. bool hasPermLaneX16() const { return getGeneration() >= GFX10; } + /// \returns true if the subtarget has the v_permlane64_b32 instruction. + bool hasPermLane64() const { return getGeneration() >= GFX11; } + bool hasDPP() const { return HasDPP; } @@ -830,7 +864,11 @@ public: } bool hasFmaakFmamkF32Insts() const { - return getGeneration() >= GFX10; + return getGeneration() >= GFX10 || hasGFX940Insts(); + } + + bool hasImageInsts() const { + return HasImageInsts; } bool hasExtendedImageInsts() const { @@ -875,6 +913,10 @@ public: bool hasMadF16() const; + bool hasMovB64() const { return GFX940Insts; } + + bool hasLshlAddB64() const { return GFX940Insts; } + bool enableSIScheduler() const { return EnableSIScheduler; } @@ -887,6 +929,10 @@ public: return SGPRInitBug; } + bool hasUserSGPRInit16Bug() const { + return UserSGPRInit16Bug; + } + bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } bool hasNegativeUnalignedScratchOffsetBug() const { @@ -915,6 +961,14 @@ public: getGeneration() <= AMDGPUSubtarget::GFX9; } + bool hasReadM0LdsDmaHazard() const { + return getGeneration() == AMDGPUSubtarget::GFX9; + } + + bool hasReadM0LdsDirectHazard() const { + return getGeneration() == AMDGPUSubtarget::GFX9; + } + bool hasVcmpxPermlaneHazard() const { return HasVcmpxPermlaneHazard; } @@ -943,6 +997,22 @@ public: return HasLdsBranchVmemWARHazard; } + // Has a one-cycle hazard on a transcendental instruction feeding a + // non-transcendental VALU. + bool hasTransForwardingHazard() const { return GFX940Insts; } + + // Has a one-cycle hazard on a VALU instruction partially writing dst with + // a shift of result bits feeding another VALU instruction. + bool hasDstSelForwardingHazard() const { return GFX940Insts; } + + // Cannot use op_sel with v_dot instructions. + bool hasDOTOpSelHazard() const { return GFX940Insts; } + + // Does not have HW interlocks for VALU writing and then reading SGPRs.
+ bool hasVDecCoExecHazard() const { + return GFX940Insts; + } + bool hasNSAtoVMEMBug() const { return HasNSAtoVMEMBug; } @@ -953,11 +1023,43 @@ public: bool hasGFX90AInsts() const { return GFX90AInsts; } + bool hasVOP3DPP() const { return getGeneration() >= GFX11; } + + bool hasLdsDirect() const { return getGeneration() >= GFX11; } + + bool hasVALUPartialForwardingHazard() const { + return getGeneration() >= GFX11; + } + + bool hasVALUTransUseHazard() const { return getGeneration() >= GFX11; } + /// Return if operations acting on VGPR tuples require even alignment. bool needsAlignedVGPRs() const { return GFX90AInsts; } + /// Return true if the target has the S_PACK_HL_B32_B16 instruction. + bool hasSPackHL() const { return GFX11Insts; } + + /// Return true if the target's EXP instruction has the COMPR flag, which + /// affects the meaning of the EN (enable) bits. + bool hasCompressedExport() const { return !GFX11Insts; } + + /// Return true if the target's EXP instruction supports the NULL export + /// target. + bool hasNullExportTarget() const { return !GFX11Insts; } + + bool hasVOPDInsts() const { return HasVOPDInsts; } + + bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } + + /// Return true if the target has the S_DELAY_ALU instruction. + bool hasDelayAlu() const { return GFX11Insts; } + bool hasPackedTID() const { return HasPackedTID; } + // GFX940 is a derivative of GFX90A. hasGFX940Insts() being true implies that + // hasGFX90AInsts() is also true. + bool hasGFX940Insts() const { return GFX940Insts; } + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; @@ -989,6 +1091,9 @@ public: return getGeneration() >= GFX9; } + // \returns true if the target supports the pre-NGG legacy geometry path. + bool hasLegacyGeometry() const { return getGeneration() < GFX11; } + /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { return AMDGPU::IsaInfo::getSGPRAllocGranule(this); } @@ -1105,6 +1210,10 @@ public: /// unit requirement. unsigned getMaxNumVGPRs(const Function &F) const; + unsigned getMaxNumAGPRs(const Function &F) const { + return getMaxNumVGPRs(F); + } + /// \returns Maximum number of VGPRs that meets number of waves per execution /// unit requirement for function \p MF, or number of VGPRs explicitly /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. @@ -1165,6 +1274,10 @@ public: void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep) const override; + + // \returns true if it's beneficial on this subtarget for the scheduler to + // cluster stores as well as loads. + bool shouldClusterStores() const { return getGeneration() >= GFX11; } }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td new file mode 100644 index 000000000000..1f65376890da --- /dev/null +++ b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td @@ -0,0 +1,116 @@ +//===-- LDSDIRInstructions.td - LDS Direct Instruction Definitions --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// LDSDIR encoding +//===----------------------------------------------------------------------===// + +class LDSDIRe<bits<2> op, bit is_direct> : Enc32 { + // encoding fields + bits<2> attrchan; + bits<6> attr; + bits<4> waitvdst; + bits<8> vdst; + + // encoding + let Inst{31-24} = 0xce; // encoding + let Inst{23-22} = 0x0; // reserved + let Inst{21-20} = op; + let Inst{19-16} = waitvdst; + let Inst{15-10} = !if(is_direct, ?, attr); + let Inst{9-8} = !if(is_direct, ?, attrchan); + let Inst{7-0} = vdst; +} + +//===----------------------------------------------------------------------===// +// LDSDIR Classes +//===----------------------------------------------------------------------===// + +class LDSDIR_getIns<bit direct> { + dag ret = !if(direct, + (ins wait_vdst:$waitvdst), + (ins Attr:$attr, AttrChan:$attrchan, wait_vdst:$waitvdst) + ); +} + +class LDSDIR_Common<string opName, string asm = "", bit direct> : InstSI< + (outs VGPR_32:$vdst), + LDSDIR_getIns<direct>.ret, + asm> { + let LDSDIR = 1; + let EXP_CNT = 1; + + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 0; + + string Mnemonic = opName; + let UseNamedOperandTable = 1; + + let Uses = [M0, EXEC]; + let DisableWQM = 0; + let SchedRW = [WriteLDS]; + + bit is_direct; + let is_direct = direct; +} + +class LDSDIR_Pseudo<string opName, bit direct> : + LDSDIR_Common<opName, "", direct>, + SIMCInstr<opName, SIEncodingFamily.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class LDSDIR_getAsm<bit direct> { + string ret = !if(direct, + " $vdst$waitvdst", + " $vdst, $attr$attrchan$waitvdst" + ); +} + +class LDSDIR_Real<bits<2> op, LDSDIR_Pseudo lds, int subtarget> : + LDSDIR_Common<lds.Mnemonic, + lds.Mnemonic # LDSDIR_getAsm<lds.is_direct>.ret, + lds.is_direct>, + SIMCInstr <lds.Mnemonic, subtarget>, + LDSDIRe<op, lds.is_direct> { + let isPseudo = 0; + let isCodeGenOnly = 0; +} + +//===----------------------------------------------------------------------===// +// LDS Direct Instructions +//===----------------------------------------------------------------------===// + +def LDS_DIRECT_LOAD : LDSDIR_Pseudo<"lds_direct_load", 1>; +def LDS_PARAM_LOAD : LDSDIR_Pseudo<"lds_param_load", 0>; + +def : GCNPat < + (f32 (int_amdgcn_lds_direct_load M0)), + (LDS_DIRECT_LOAD 0) +>; + +def : GCNPat < + (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)), + (LDS_PARAM_LOAD timm:$attr, timm:$attrchan, 0) +>; + +//===----------------------------------------------------------------------===// +// GFX11+ +//===----------------------------------------------------------------------===// + +multiclass LDSDIR_Real_gfx11<bits<2> op, LDSDIR_Pseudo lds = !cast<LDSDIR_Pseudo>(NAME)> { + def _gfx11 : LDSDIR_Real<op, lds, SIEncodingFamily.GFX11> { + let AssemblerPredicate = isGFX11Plus; + let DecoderNamespace = "GFX11"; + } +} + +defm LDS_PARAM_LOAD : LDSDIR_Real_gfx11<0x0>; +defm LDS_DIRECT_LOAD : LDSDIR_Real_gfx11<0x1>; diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index 912bcc792e4d..24c9cc2d7dd2 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -239,9 +239,9 @@ void AMDGPUCustomBehaviour::generateWaitCntInfo() { AMDGPU::IsaVersion IV = 
AMDGPU::getIsaVersion(STI.getCPU()); InstrWaitCntInfo.resize(SrcMgr.size()); - int Index = 0; - for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) { - const std::unique_ptr<Instruction> &Inst = *I; + for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) { + const std::unique_ptr<Instruction> &Inst = EN.value(); + unsigned Index = EN.index(); unsigned Opcode = Inst->getOpcode(); const MCInstrDesc &MCID = MCII.get(Opcode); if ((MCID.TSFlags & SIInstrFlags::DS) && diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h index 56650515bd0a..7a0d454c3578 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h @@ -31,7 +31,7 @@ public: AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} - ~AMDGPUInstrPostProcess() {} + ~AMDGPUInstrPostProcess() = default; void postProcessInstruction(std::unique_ptr<Instruction> &Inst, const MCInst &MCI) override; @@ -86,7 +86,7 @@ public: AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII); - ~AMDGPUCustomBehaviour() {} + ~AMDGPUCustomBehaviour() = default; /// This method is used to determine if an instruction /// should be allowed to be dispatched. The return value is /// how many cycles until the instruction can be dispatched. diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 50318a59225d..bda3c25e956b 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -10,13 +10,16 @@ #include "MCTargetDesc/AMDGPUFixupKinds.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/EndianStream.h" +#include "llvm/Support/TargetParser.h" using namespace llvm; using namespace llvm::AMDGPU; @@ -47,7 +50,10 @@ public: bool writeNopData(raw_ostream &OS, uint64_t Count, const MCSubtargetInfo *STI) const override; + Optional<MCFixupKind> getFixupKind(StringRef Name) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override; }; } //End anonymous namespace @@ -134,6 +140,9 @@ void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, MutableArrayRef<char> Data, uint64_t Value, bool IsResolved, const MCSubtargetInfo *STI) const { + if (Fixup.getKind() >= FirstLiteralRelocationKind) + return; + Value = adjustFixupValue(Fixup, Value, &Asm.getContext()); if (!Value) return; // Doesn't change encoding. 
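The early return added above, together with the getFixupKind and shouldForceRelocation overrides in the next hunk, follows MC's literal-relocation convention: a fixup kind numbered at or above FirstLiteralRelocationKind carries a raw ELF relocation type, so the assembler must not patch any instruction bytes for it and must always emit the relocation. A minimal sketch of that round trip, assuming only the real FirstLiteralRelocationKind constant from llvm/MC/MCFixup.h (the helper names are illustrative, not part of the patch):

#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCFixup.h"
#include <cassert>
using namespace llvm;

// Wrap a raw ELF relocation type as a literal fixup kind, mirroring what
// getFixupKind() does for each name in ELFRelocs/AMDGPU.def.
static MCFixupKind literalFixup(unsigned ElfRelocType) {
  return MCFixupKind(FirstLiteralRelocationKind + ElfRelocType);
}

// Recover the relocation type on the object-writer side, mirroring the
// companion change in AMDGPUELFObjectWriter::getRelocType.
static unsigned elfRelocType(MCFixupKind Kind) {
  assert(Kind >= FirstLiteralRelocationKind && "not a literal fixup");
  return Kind - FirstLiteralRelocationKind;
}

// Round trip: elfRelocType(literalFixup(ELF::R_AMDGPU_REL32)) yields
// ELF::R_AMDGPU_REL32 again.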
@@ -153,6 +162,15 @@ void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, Data[Offset + i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff); } +Optional<MCFixupKind> AMDGPUAsmBackend::getFixupKind(StringRef Name) const { + return StringSwitch<Optional<MCFixupKind>>(Name) +#define ELF_RELOC(Name, Value) \ + .Case(#Name, MCFixupKind(FirstLiteralRelocationKind + Value)) +#include "llvm/BinaryFormat/ELFRelocs/AMDGPU.def" +#undef ELF_RELOC + .Default(None); +} + const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( MCFixupKind Kind) const { const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { @@ -160,12 +178,21 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, }; + if (Kind >= FirstLiteralRelocationKind) + return MCAsmBackend::getFixupKindInfo(FK_NONE); + if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); return Infos[Kind - FirstTargetFixupKind]; } +bool AMDGPUAsmBackend::shouldForceRelocation(const MCAssembler &, + const MCFixup &Fixup, + const MCValue &) { + return Fixup.getKind() >= FirstLiteralRelocationKind; +} + unsigned AMDGPUAsmBackend::getMinimumNopSize() const { return 4; } @@ -236,5 +263,5 @@ MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, const MCTargetOptions &Options) { return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple(), - getHsaAbiVersion(&STI).getValueOr(0)); + getHsaAbiVersion(&STI).value_or(0)); } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index bb2c298c2850..066b36622a16 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -65,7 +65,10 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AMDGPU_REL64; } - switch (Fixup.getKind()) { + MCFixupKind Kind = Fixup.getKind(); + if (Kind >= FirstLiteralRelocationKind) + return Kind - FirstLiteralRelocationKind; + switch (Kind) { default: break; case FK_PCRel_4: return ELF::R_AMDGPU_REL32; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 76663b563150..bd938d829953 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -120,14 +120,6 @@ void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo, printNamedBit(MI, OpNo, O, "addr64"); } -void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " offset:"; - printU16ImmDecOperand(MI, OpNo, O); - } -} - void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -152,7 +144,7 @@ void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo, if (IsFlatSeg) { // Unsigned offset printU16ImmDecOperand(MI, OpNo, O); } else { // Signed offset - if (AMDGPU::isGFX10Plus(STI)) { + if (AMDGPU::isGFX10(STI)) { O << formatDec(SignExtend32<12>(MI->getOperand(OpNo).getImm())); } else { O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm())); @@ -191,6 +183,13 @@ void AMDGPUInstPrinter::printSMEMOffset(const MCInst *MI, unsigned OpNo, O << formatHex(MI->getOperand(OpNo).getImm()); } +void AMDGPUInstPrinter::printSMEMOffsetMod(const MCInst *MI, unsigned OpNo, + const 
MCSubtargetInfo &STI, + raw_ostream &O) { + O << " offset:"; + printSMEMOffset(MI, OpNo, STI, O); +} + void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -206,13 +205,15 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { auto Imm = MI->getOperand(OpNo).getImm(); if (Imm & CPol::GLC) - O << " glc"; + O << ((AMDGPU::isGFX940(STI) && + !(MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SMRD)) ? " sc0" + : " glc"); if (Imm & CPol::SLC) - O << " slc"; + O << (AMDGPU::isGFX940(STI) ? " nt" : " slc"); if ((Imm & CPol::DLC) && AMDGPU::isGFX10Plus(STI)) O << " dlc"; if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI)) - O << " scc"; + O << (AMDGPU::isGFX940(STI) ? " sc1" : " scc"); if (Imm & ~CPol::ALL) O << " /* unexpected cache policy bit */"; } @@ -309,8 +310,8 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI, if (AMDGPU::isGFX10Plus(STI)) { if (Val == UFMT_DEFAULT) return; - if (isValidUnifiedFormat(Val)) { - O << " format:[" << getUnifiedFormatName(Val) << ']'; + if (isValidUnifiedFormat(Val, STI)) { + O << " format:[" << getUnifiedFormatName(Val, STI) << ']'; } else { O << " format:" << Val; } @@ -362,27 +363,26 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, } void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { auto Opcode = MI->getOpcode(); auto Flags = MII.get(Opcode).TSFlags; - if (OpNo == 0) { - if (Flags & SIInstrFlags::VOP3) { + if (Flags & SIInstrFlags::VOP3 && Flags & SIInstrFlags::DPP) + O << "_e64_dpp"; + else if (Flags & SIInstrFlags::VOP3) { if (!getVOP3IsSingle(Opcode)) O << "_e64"; - } else if (Flags & SIInstrFlags::DPP) { + } else if (Flags & SIInstrFlags::DPP) O << "_dpp"; - } else if (Flags & SIInstrFlags::SDWA) { + else if (Flags & SIInstrFlags::SDWA) O << "_sdwa"; - } else if (((Flags & SIInstrFlags::VOP1) && !getVOP1IsSingle(Opcode)) || - ((Flags & SIInstrFlags::VOP2) && !getVOP2IsSingle(Opcode))) { + else if (((Flags & SIInstrFlags::VOP1) && !getVOP1IsSingle(Opcode)) || + ((Flags & SIInstrFlags::VOP2) && !getVOP2IsSingle(Opcode))) O << "_e32"; - } O << " "; } - printOperand(MI, OpNo, STI, O); + printRegularOperand(MI, OpNo, STI, O); // Print default vcc/vcc_lo operand. 
switch (Opcode) { @@ -400,7 +400,16 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10: case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10: case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10: - printDefaultVccOperand(1, STI, O); + case AMDGPU::V_ADD_CO_CI_U32_e32_gfx11: + case AMDGPU::V_SUB_CO_CI_U32_e32_gfx11: + case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx11: + case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx11: + case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx11: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx11: + case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx11: + case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx11: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx11: + printDefaultVccOperand(false, STI, O); break; } } @@ -412,7 +421,7 @@ void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo, else O << "_e32 "; - printOperand(MI, OpNo, STI, O); + printRegularOperand(MI, OpNo, STI, O); } void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm, @@ -533,7 +542,7 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) O << "0.15915494309189532"; else { - assert(isUInt<32>(Imm) || Imm == 0x3fc45f306dc9c882); + assert(isUInt<32>(Imm) || isInt<32>(Imm)); // In rare situations, we will have a 32-bit literal in a 64-bit // operand. This is technically allowed for the encoding of s_mov_b64. @@ -548,6 +557,18 @@ void AMDGPUInstPrinter::printBLGP(const MCInst *MI, unsigned OpNo, if (!Imm) return; + if (AMDGPU::isGFX940(STI)) { + switch (MI->getOpcode()) { + case AMDGPU::V_MFMA_F64_16X16X4F64_gfx940_acd: + case AMDGPU::V_MFMA_F64_16X16X4F64_gfx940_vcd: + case AMDGPU::V_MFMA_F64_4X4X4F64_gfx940_acd: + case AMDGPU::V_MFMA_F64_4X4X4F64_gfx940_vcd: + O << " neg:[" << (Imm & 1) << ',' << ((Imm >> 1) & 1) << ',' + << ((Imm >> 2) & 1) << ']'; + return; + } + } + O << " blgp:" << Imm; } @@ -571,26 +592,73 @@ void AMDGPUInstPrinter::printABID(const MCInst *MI, unsigned OpNo, O << " abid:" << Imm; } -void AMDGPUInstPrinter::printDefaultVccOperand(unsigned OpNo, +void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand, const MCSubtargetInfo &STI, raw_ostream &O) { - if (OpNo > 0) + if (!FirstOperand) O << ", "; - printRegOperand(STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ? - AMDGPU::VCC : AMDGPU::VCC_LO, O, MRI); - if (OpNo == 0) + printRegOperand(STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] + ? AMDGPU::VCC + : AMDGPU::VCC_LO, + O, MRI); + if (FirstOperand) O << ", "; } +void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint8_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << " wait_vdst:"; + printU4ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint8_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << " wait_exp:"; + printU4ImmDecOperand(MI, OpNo, O); + } +} + +bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc, + unsigned OpNo) const { + return OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP) && + (Desc.TSFlags & SIInstrFlags::VOPC) && + (Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) || + Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO)); +} + +// Print default vcc/vcc_lo operand of VOPC. void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - // Print default vcc/vcc_lo operand of VOPC. 
- const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - if (OpNo == 0 && (Desc.TSFlags & SIInstrFlags::VOPC) && + unsigned Opc = MI->getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + // 0, 1 and 2 are the first printed operands in different cases + // If there are printed modifiers, printOperandAndFPInputMods or + // printOperandAndIntInputMods will be called instead + if ((OpNo == 0 || + (OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP)) || + (OpNo == 2 && (Desc.TSFlags & SIInstrFlags::DPP) && ModIdx != -1)) && + (Desc.TSFlags & SIInstrFlags::VOPC) && (Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) || Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO))) - printDefaultVccOperand(OpNo, STI, O); + printDefaultVccOperand(true, STI, O); + + printRegularOperand(MI, OpNo, STI, O); +} + +// Print operands after vcc or modifier handling. +void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); if (OpNo >= MI->getNumOperands()) { O << "/*Missing OP" << OpNo << "*/"; @@ -710,12 +778,24 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10: case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10: case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10: + case AMDGPU::V_CNDMASK_B32_e32_gfx11: + case AMDGPU::V_ADD_CO_CI_U32_e32_gfx11: + case AMDGPU::V_SUB_CO_CI_U32_e32_gfx11: + case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx11: + case AMDGPU::V_CNDMASK_B32_dpp_gfx11: + case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx11: + case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx11: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx11: + case AMDGPU::V_CNDMASK_B32_dpp8_gfx11: + case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx11: + case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx11: + case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx11: case AMDGPU::V_CNDMASK_B32_e32_gfx6_gfx7: case AMDGPU::V_CNDMASK_B32_e32_vi: if ((int)OpNo == AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src1)) - printDefaultVccOperand(OpNo, STI, O); + printDefaultVccOperand(OpNo == 0, STI, O); break; } @@ -732,6 +812,10 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + if (needsImpliedVcc(Desc, OpNo)) + printDefaultVccOperand(true, STI, O); + unsigned InputModifiers = MI->getOperand(OpNo).getImm(); // Use 'neg(...)' instead of '-' to avoid ambiguity. 
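printDefaultVccOperand now takes an explicit FirstOperand flag instead of inferring the position from an operand index, and needsImpliedVcc identifies the VOPC DPP forms whose implied vcc/vcc_lo must be printed before src0. The comma rule the flag encodes is simple enough to state on its own; a stand-alone sketch under that reading (the helper and its parameters are illustrative, not part of the patch):

#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Print the implied vcc (wave64) or vcc_lo (wave32) operand with the
// separating comma on the correct side: trailing when it is the first
// printed operand, leading otherwise.
static void printImpliedVcc(raw_ostream &O, bool FirstOperand,
                            bool IsWave64) {
  if (!FirstOperand)
    O << ", ";
  O << (IsWave64 ? "vcc" : "vcc_lo");
  if (FirstOperand)
    O << ", ";
}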
@@ -754,7 +838,7 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI, if (InputModifiers & SISrcMods::ABS) O << '|'; - printOperand(MI, OpNo + 1, STI, O); + printRegularOperand(MI, OpNo + 1, STI, O); if (InputModifiers & SISrcMods::ABS) O << '|'; @@ -767,10 +851,14 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + if (needsImpliedVcc(Desc, OpNo)) + printDefaultVccOperand(true, STI, O); + unsigned InputModifiers = MI->getOperand(OpNo).getImm(); if (InputModifiers & SISrcMods::SEXT) O << "sext("; - printOperand(MI, OpNo + 1, STI, O); + printRegularOperand(MI, OpNo + 1, STI, O); if (InputModifiers & SISrcMods::SEXT) O << ')'; @@ -784,7 +872,7 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10: if ((int)OpNo + 1 == AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src1)) - printDefaultVccOperand(OpNo, STI, O); + printDefaultVccOperand(OpNo == 0, STI, O); break; } } @@ -1203,9 +1291,9 @@ void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - printOperand(MI, OpNo, STI, O); + printRegularOperand(MI, OpNo, STI, O); O << ", "; - printOperand(MI, OpNo + 1, STI, O); + printRegularOperand(MI, OpNo + 1, STI, O); } void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, @@ -1262,15 +1350,16 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, uint16_t MsgId; uint16_t OpId; uint16_t StreamId; - decodeMsg(Imm16, MsgId, OpId, StreamId); + decodeMsg(Imm16, MsgId, OpId, StreamId, STI); + + StringRef MsgName = getMsgName(MsgId, STI); - if (isValidMsgId(MsgId, STI) && - isValidMsgOp(MsgId, OpId, STI) && + if (!MsgName.empty() && isValidMsgOp(MsgId, OpId, STI) && isValidMsgStream(MsgId, OpId, StreamId, STI)) { - O << "sendmsg(" << getMsgName(MsgId); - if (msgRequiresOp(MsgId)) { - O << ", " << getMsgOpName(MsgId, OpId); - if (msgSupportsStream(MsgId, OpId)) { + O << "sendmsg(" << MsgName; + if (msgRequiresOp(MsgId, STI)) { + O << ", " << getMsgOpName(MsgId, OpId, STI); + if (msgSupportsStream(MsgId, OpId, STI)) { O << ", " << StreamId; } } @@ -1423,6 +1512,76 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + using namespace llvm::AMDGPU::DepCtr; + + uint64_t Imm16 = MI->getOperand(OpNo).getImm() & 0xffff; + + bool HasNonDefaultVal = false; + if (isSymbolicDepCtrEncoding(Imm16, HasNonDefaultVal, STI)) { + int Id = 0; + StringRef Name; + unsigned Val; + bool IsDefault; + bool NeedSpace = false; + while (decodeDepCtr(Imm16, Id, Name, Val, IsDefault, STI)) { + if (!IsDefault || !HasNonDefaultVal) { + if (NeedSpace) + O << ' '; + O << Name << '(' << Val << ')'; + NeedSpace = true; + } + } + } else { + O << formatHex(Imm16); + } +} + +void AMDGPUInstPrinter::printDelayFlag(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const char *BadInstId = "/* invalid instid value */"; + static const std::array<const char *, 12> InstIds = { + "NO_DEP", "VALU_DEP_1", "VALU_DEP_2", + "VALU_DEP_3", "VALU_DEP_4", "TRANS32_DEP_1", + "TRANS32_DEP_2", "TRANS32_DEP_3", "FMA_ACCUM_CYCLE_1", + "SALU_CYCLE_1", "SALU_CYCLE_2", "SALU_CYCLE_3"}; + + const 
char *BadInstSkip = "/* invalid instskip value */"; + static const std::array<const char *, 6> InstSkips = { + "SAME", "NEXT", "SKIP_1", "SKIP_2", "SKIP_3", "SKIP_4"}; + + unsigned SImm16 = MI->getOperand(OpNo).getImm(); + const char *Prefix = ""; + + unsigned Value = SImm16 & 0xF; + if (Value) { + const char *Name = Value < InstIds.size() ? InstIds[Value] : BadInstId; + O << Prefix << "instid0(" << Name << ')'; + Prefix = " | "; + } + + Value = (SImm16 >> 4) & 7; + if (Value) { + const char *Name = + Value < InstSkips.size() ? InstSkips[Value] : BadInstSkip; + O << Prefix << "instskip(" << Name << ')'; + Prefix = " | "; + } + + Value = (SImm16 >> 7) & 0xF; + if (Value) { + const char *Name = Value < InstIds.size() ? InstIds[Value] : BadInstId; + O << Prefix << "instid1(" << Name << ')'; + Prefix = " | "; + } + + if (!*Prefix) + O << "0"; +} + void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Id; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 71db0beba0b6..202edeee3cb3 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -15,6 +15,7 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { +class MCInstrDesc; class AMDGPUInstPrinter : public MCInstPrinter { public: @@ -50,7 +51,6 @@ private: void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printFlatOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -64,6 +64,8 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printSMEMOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printSMEMOffsetMod(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -116,6 +118,8 @@ private: raw_ostream &O); void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printRegularOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printOperand(const MCInst *MI, uint64_t /*Address*/, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { printOperand(MI, OpNum, STI, O); @@ -172,8 +176,13 @@ private: raw_ostream &O); void printABID(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printDefaultVccOperand(unsigned OpNo, const MCSubtargetInfo &STI, + bool needsImpliedVcc(const MCInstrDesc &Desc, unsigned OpNo) const; + void printDefaultVccOperand(bool FirstOperand, const MCSubtargetInfo &STI, raw_ostream &O); + void printWaitVDST(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printWaitEXP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printExpSrcN(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O, unsigned N); @@ -234,6 +243,10 @@ protected: raw_ostream &O); void printWaitFlag(const MCInst *MI, unsigned OpNo, const 
MCSubtargetInfo &STI, raw_ostream &O); + void printDepCtr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printDelayFlag(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printEndpgm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index 53c724f2211a..02c213f90f89 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -14,8 +14,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H +#include "llvm/ADT/APInt.h" #include "llvm/MC/MCCodeEmitter.h" -#include <cstdint> namespace llvm { @@ -34,46 +34,34 @@ protected: AMDGPUMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {} public: + void getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, + APInt &Inst, APInt &Scratch, + const MCSubtargetInfo &STI) const; - uint64_t getBinaryCodeForInstr(const MCInst &MI, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + virtual void getMachineOpValue(const MCInst &MI, const MCOperand &MO, + APInt &Op, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const = 0; - virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } + virtual void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const = 0; - virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + virtual void getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } + const MCSubtargetInfo &STI) const = 0; - virtual unsigned getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } + virtual void getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const = 0; - virtual unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + virtual void getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + APInt &Op, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } - - virtual unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } + const MCSubtargetInfo &STI) const = 0; - virtual unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } + virtual void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const = 0; protected: FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 1f917cd91b47..11fe3f9ef058 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ 
b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -19,6 +19,7 @@ #include "R600InstPrinter.h" #include "R600MCTargetDesc.h" #include "TargetInfo/AMDGPUTargetInfo.h" +#include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCELFStreamer.h" @@ -27,6 +28,7 @@ #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index e5cce6045c8c..060d4b660632 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -33,7 +33,6 @@ enum AMDGPUDwarfFlavour : unsigned { Wave64 = 0, Wave32 = 1 }; MCRegisterInfo *createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour); MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCAsmBackend *createAMDGPUAsmBackend(const Target &T, @@ -51,7 +50,6 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_OPERAND_ENUM -#define GET_INSTRINFO_SCHED_ENUM #include "AMDGPUGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 7aa5f1abf65b..078133469549 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -17,12 +17,16 @@ #include "Utils/AMDKernelCodeTUtils.h" #include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetParser.h" using namespace llvm; using namespace llvm::AMDGPU; @@ -102,6 +106,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: AK = GK_GFX90A; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: AK = GK_GFX90C; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: AK = GK_GFX940; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break; @@ -112,6 +117,11 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033: AK = GK_GFX1033; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1034: AK = GK_GFX1034; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035: AK = GK_GFX1035; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1036: AK = GK_GFX1036; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1100: AK = GK_GFX1100; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101: AK = GK_GFX1101; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102: AK = GK_GFX1102; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: AK = GK_GFX1103; break; case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; } @@ -165,6 
+175,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909; case GK_GFX90A: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A; case GK_GFX90C: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C; + case GK_GFX940: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX940; case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011; case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012; @@ -175,6 +186,11 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX1033: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033; case GK_GFX1034: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1034; case GK_GFX1035: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035; + case GK_GFX1036: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1036; + case GK_GFX1100: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1100; + case GK_GFX1101: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101; + case GK_GFX1102: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102; + case GK_GFX1103: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103; case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE; } @@ -285,7 +301,7 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { uint32_t Encoded_pad = Encoded_s_code_end; // Instruction cache line size in bytes. - const unsigned Log2CacheLineSize = 6; + const unsigned Log2CacheLineSize = AMDGPU::isGFX11Plus(STI) ? 7 : 6; const unsigned CacheLineSize = 1u << Log2CacheLineSize; // Extra padding amount in bytes to support prefetch mode 3. @@ -439,6 +455,8 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, ".amdhsa_forward_progress", KD, compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FWD_PROGRESS); + PRINT_FIELD(OS, ".amdhsa_shared_vgpr_count", KD, compute_pgm_rsrc3, + amdhsa::COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT); } PRINT_FIELD( OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, @@ -515,8 +533,8 @@ void AMDGPUTargetELFStreamer::EmitNote( if (STI.getTargetTriple().getOS() == Triple::AMDHSA) NoteFlags = ELF::SHF_ALLOC; - S.PushSection(); - S.SwitchSection( + S.pushSection(); + S.switchSection( Context.getELFSection(ElfNote::SectionName, ELF::SHT_NOTE, NoteFlags)); S.emitInt32(NameSZ); // namesz S.emitValue(DescSZ, 4); // descz @@ -525,7 +543,7 @@ void AMDGPUTargetELFStreamer::EmitNote( S.emitValueToAlignment(4, 0, 1, 0); // padding 0 EmitDesc(S); // desc S.emitValueToAlignment(4, 0, 1, 0); // padding 0 - S.PopSection(); + S.popSection(); } unsigned AMDGPUTargetELFStreamer::getEFlags() { @@ -691,7 +709,7 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major, OS.emitBytes(VendorName); OS.emitInt8(0); // NULL terminate VendorName OS.emitBytes(ArchName); - OS.emitInt8(0); // NULL terminte ArchName + OS.emitInt8(0); // NULL terminate ArchName }); } @@ -699,9 +717,9 @@ void AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { MCStreamer &OS = getStreamer(); - OS.PushSection(); + OS.pushSection(); OS.emitBytes(StringRef((const char*)&Header, sizeof(Header))); - OS.PopSection(); + OS.popSection(); } void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, @@ -806,7 +824,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { uint32_t Encoded_pad = Encoded_s_code_end; // Instruction cache line size in bytes. - const unsigned Log2CacheLineSize = 6; + const unsigned Log2CacheLineSize = AMDGPU::isGFX11Plus(STI) ? 7 : 6; const unsigned CacheLineSize = 1u << Log2CacheLineSize; // Extra padding amount in bytes to support prefetch mode 3. 
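Both the assembly and ELF streamer versions of EmitCodeEnd above now size their s_code_end padding from the generation: GFX11 doubles the instruction cache line from 64 to 128 bytes, so Log2CacheLineSize becomes 7 there. A sketch of just that size computation, with an illustrative helper name:

// Cache line granularity used when padding the end of the text section
// with the encoded s_code_end instruction: 64 bytes before GFX11,
// 128 bytes from GFX11 on.
static unsigned codeEndCacheLineBytes(bool IsGFX11Plus) {
  const unsigned Log2CacheLineSize = IsGFX11Plus ? 7 : 6;
  return 1u << Log2CacheLineSize; // 128 on GFX11+, 64 otherwise
}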
@@ -818,11 +836,11 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { } MCStreamer &OS = getStreamer(); - OS.PushSection(); + OS.pushSection(); OS.emitValueToAlignment(CacheLineSize, Encoded_pad, 4); for (unsigned I = 0; I < FillSize; I += 4) OS.emitInt32(Encoded_pad); - OS.PopSection(); + OS.popSection(); return true; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index 6fe192e95e72..78eb304fe84f 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -20,6 +20,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Support/EndianStream.h" @@ -84,9 +85,8 @@ enum FCInstr { }; MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { - return new R600MCCodeEmitter(MCII, MRI); + return new R600MCCodeEmitter(MCII, *Ctx.getRegisterInfo()); } void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h index fc52cb33824f..605ae851378d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h @@ -24,7 +24,6 @@ class MCInstrInfo; class MCRegisterInfo; MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); MCInstrInfo *createR600MCInstrInfo(); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 77f219aaa3ab..5e67fb5ec876 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -17,10 +17,15 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/APInt.h" +#include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/SubtargetFeature.h" +#include "llvm/Support/Casting.h" using namespace llvm; @@ -34,9 +39,8 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { const MCSubtargetInfo &STI) const; public: - SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, - MCContext &ctx) - : AMDGPUMCCodeEmitter(mcii), MRI(mri) {} + SIMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) + : AMDGPUMCCodeEmitter(mcii), MRI(*ctx.getRegisterInfo()) {} SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; SIMCCodeEmitter &operator=(const SIMCCodeEmitter &) = delete; @@ -46,42 +50,45 @@ public: const MCSubtargetInfo &STI) const override; /// \returns the encoding for an MCOperand. - uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + void getMachineOpValue(const MCInst &MI, const MCOperand &MO, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; /// Use a fixup to encode the simm16 field for SOPP branch /// instructions. 
- unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + void getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; - unsigned getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + void getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; - unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + void getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; - unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; - - unsigned getAVOperandEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; private: uint64_t getImplicitOpSelHiEncoding(int Opcode) const; + void getMachineOpValueCommon(const MCInst &MI, const MCOperand &MO, + unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; }; } // end anonymous namespace MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { - return new SIMCCodeEmitter(MCII, MRI, Ctx); + return new SIMCCodeEmitter(MCII, Ctx); } // Returns the encoding value to use if the given integer is an integer inline @@ -309,8 +316,9 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, computeAvailableFeatures(STI.getFeatureBits())); int Opcode = MI.getOpcode(); - uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI); - const MCInstrDesc &Desc = MCII.get(Opcode); + APInt Encoding, Scratch; + getBinaryCodeForInstr(MI, Fixups, Encoding, Scratch, STI); + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); unsigned bytes = Desc.getSize(); // Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions. @@ -322,7 +330,7 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, } for (unsigned i = 0; i < bytes; i++) { - OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); + OS.write((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i)); } // NSA encoding. 
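With getBinaryCodeForInstr now filling an APInt instead of returning a uint64_t, instruction words wider than 64 bits can flow through the same emission path (the extra Scratch APInt is threaded through so per-operand encoding need not reallocate), and the byte loop above becomes a bit extraction rather than a shift-and-mask. A self-contained sketch of that little-endian write, assuming Encoding is at least 8 * Bytes bits wide (the helper name is illustrative):

#include "llvm/ADT/APInt.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Write the low Bytes bytes of an encoding in little-endian order;
// extractBitsAsZExtValue(8, 8 * i) is the APInt equivalent of the old
// (Encoding >> (8 * i)) & 0xff on a uint64_t.
static void writeEncodingLE(raw_ostream &OS, const APInt &Encoding,
                            unsigned Bytes) {
  for (unsigned i = 0; i < Bytes; ++i)
    OS.write((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i));
}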
@@ -335,9 +343,11 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned NumExtraAddrs = srsrc - vaddr0 - 1; unsigned NumPadding = (-NumExtraAddrs) & 3; - for (unsigned i = 0; i < NumExtraAddrs; ++i) - OS.write((uint8_t)getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i), - Fixups, STI)); + for (unsigned i = 0; i < NumExtraAddrs; ++i) { + getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i), Encoding, Fixups, + STI); + OS.write((uint8_t)Encoding.getLimitedValue()); + } for (unsigned i = 0; i < NumPadding; ++i) OS.write(0); } @@ -385,34 +395,36 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, } } -unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +void SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpNo); if (MO.isExpr()) { const MCExpr *Expr = MO.getExpr(); MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br; Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); - return 0; + Op = APInt::getNullValue(96); + } else { + getMachineOpValue(MI, MO, Op, Fixups, STI); } - - return getMachineOpValue(MI, MO, Fixups, STI); } -unsigned SIMCCodeEmitter::getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +void SIMCCodeEmitter::getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, + APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { auto Offset = MI.getOperand(OpNo).getImm(); // VI only supports 20-bit unsigned offsets. assert(!AMDGPU::isVI(STI) || isUInt<20>(Offset)); - return Offset; + Op = Offset; } -unsigned -SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +void SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { using namespace AMDGPU::SDWA; uint64_t RegEnc = 0; @@ -426,23 +438,24 @@ SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) { RegEnc |= SDWA9EncValues::SRC_SGPR_MASK; } - return RegEnc; + Op = RegEnc; + return; } else { const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI); if (Enc != ~0U && Enc != 255) { - return Enc | SDWA9EncValues::SRC_SGPR_MASK; + Op = Enc | SDWA9EncValues::SRC_SGPR_MASK; + return; } } llvm_unreachable("Unsupported operand kind"); - return 0; } -unsigned -SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +void SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, + APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { using namespace AMDGPU::SDWA; uint64_t RegEnc = 0; @@ -455,13 +468,13 @@ SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK; RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK; } - return RegEnc; + Op = RegEnc; } -unsigned -SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +void 
SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo, + APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { unsigned Reg = MI.getOperand(OpNo).getReg(); uint64_t Enc = MRI.getEncodingValue(Reg); @@ -476,10 +489,11 @@ SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo, MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg)) Enc |= 512; - return Enc; + Op = Enc; } static bool needsPCRel(const MCExpr *Expr) { @@ -505,12 +519,21 @@ static bool needsPCRel(const MCExpr *Expr) { llvm_unreachable("invalid kind"); } -uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, - const MCOperand &MO, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - if (MO.isReg()) - return MRI.getEncodingValue(MO.getReg()); +void SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + if (MO.isReg()){ + Op = MRI.getEncodingValue(MO.getReg()); + return; + } + unsigned OpNo = &MO - MI.begin(); + getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI); +} + +void SIMCCodeEmitter::getMachineOpValueCommon( + const MCInst &MI, const MCOperand &MO, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) { // FIXME: If this is expression is PCRel or not should not depend on what @@ -533,28 +556,22 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, uint32_t Offset = Desc.getSize(); assert(Offset == 4 || Offset == 8); - Fixups.push_back( - MCFixup::create(Offset, MO.getExpr(), Kind, MI.getLoc())); - } - - // Figure out the operand number, needed for isSrcOperand check - unsigned OpNo = 0; - for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) { - if (&MO == &MI.getOperand(OpNo)) - break; + Fixups.push_back(MCFixup::create(Offset, MO.getExpr(), Kind, MI.getLoc())); } const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); if (AMDGPU::isSISrcOperand(Desc, OpNo)) { uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI); - if (Enc != ~0U) - return Enc; - - } else if (MO.isImm()) - return MO.getImm(); + if (Enc != ~0U) { + Op = Enc; + return; + } + } else if (MO.isImm()) { + Op = MO.getImm(); + return; + } llvm_unreachable("Encoding of this operand type is not supported yet."); - return 0; } #define ENABLE_INSTR_PREDICATE_VERIFIER diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index cf03fd682143..be1addf35012 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -14,6 +14,8 @@ // - MIMGEncGfx90a: encoding for gfx90a for atomics // - MIMGEncGfx10Default: gfx10 default (non-NSA) encoding // - MIMGEncGfx10NSA: gfx10 NSA encoding +// - MIMGEncGfx11Default: gfx11 default (non-NSA) encoding +// - MIMGEncGfx11NSA: gfx11 NSA encoding class MIMGEncoding; def MIMGEncGfx6 : MIMGEncoding; @@ -21,6 +23,8 @@ def MIMGEncGfx8 : MIMGEncoding; def MIMGEncGfx90a : MIMGEncoding; def MIMGEncGfx10Default : MIMGEncoding; def MIMGEncGfx10NSA : MIMGEncoding; +def MIMGEncGfx11Default : MIMGEncoding; +def MIMGEncGfx11NSA : MIMGEncoding; def MIMGEncoding : GenericEnum { let FilterClass = 
"MIMGEncoding"; @@ -90,11 +94,13 @@ def MIMG { int NOP = -1; } -class mimgopc <int base, int vi = base, int si = base> { - field bits<8> BASE = base; // Opcode for all but atomics +class mimgopc <int gfx11, int gfx10m, int vi = gfx10m, int si = gfx10m> { + field bits<8> GFX11 = gfx11; + field bits<8> GFX10M = gfx10m; // GFX10minus for all but atomics field bits<8> VI = vi; // VI is only used for atomic instructions field bits<8> SI = si; // SI is only used for atomic instructions - bit HAS_BASE = !ne(base, MIMG.NOP); + bit HAS_GFX11 = !ne(gfx11, MIMG.NOP); + bit HAS_GFX10M = !ne(gfx10m, MIMG.NOP); bit HAS_VI = !ne(vi, MIMG.NOP); bit HAS_SI = !ne(si, MIMG.NOP); } @@ -207,12 +213,16 @@ class MIMG <dag outs, string dns = ""> MIMGEncoding MIMGEncoding; bits<8> VDataDwords; bits<8> VAddrDwords; + + // If NSA is used this counts number of operands VAddrDwords is split into. + bits<8> VAddrOperands; } def MIMGInfoTable : GenericTable { let FilterClass = "MIMG"; let CppTypeName = "MIMGInfo"; - let Fields = ["Opcode", "BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"]; + let Fields = ["Opcode", "BaseOpcode", "MIMGEncoding", "VDataDwords", + "VAddrDwords", "VAddrOperands"]; string TypeOf_BaseOpcode = "MIMGBaseOpcode"; string TypeOf_MIMGEncoding = "MIMGEncoding"; @@ -227,11 +237,12 @@ def getMIMGInfo : SearchIndex { // This class used to use !foldl to memoize the AddrAsmNames list. // It turned out that that was much slower than using !filter. -class MIMGNSAHelper<int num_addrs> { +class MIMGNSAHelper<int num_addrs, + list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)> { list<string> AddrAsmNames = !foreach(i, !filter(i, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], !lt(i, num_addrs)), "vaddr" # i); - dag AddrIns = !dag(ins, !foreach(arg, AddrAsmNames, VGPR_32), AddrAsmNames); + dag AddrIns = !dag(ins, addr_types, AddrAsmNames); string AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]"; int NSA = !if(!le(num_addrs, 1), ?, @@ -247,6 +258,7 @@ class MIMG_gfx6789<bits<8> op, dag outs, string dns = ""> let AssemblerPredicate = isGFX6GFX7GFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx6; + let VAddrOperands = 1; let d16 = !if(BaseOpcode.HasD16, ?, 0); } @@ -257,6 +269,7 @@ class MIMG_gfx90a<bits<8> op, dag outs, string dns = ""> let AssemblerPredicate = isGFX90APlus; let MIMGEncoding = MIMGEncGfx90a; + let VAddrOperands = 1; let d16 = !if(BaseOpcode.HasD16, ?, 0); } @@ -264,10 +277,11 @@ class MIMG_gfx90a<bits<8> op, dag outs, string dns = ""> // Base class of all non-NSA gfx10 MIMG instructions. class MIMG_gfx10<int op, dag outs, string dns = ""> : MIMG<outs, dns>, MIMGe_gfx10<op> { - let SubtargetPredicate = isGFX10Plus; - let AssemblerPredicate = isGFX10Plus; + let SubtargetPredicate = isGFX10Only; + let AssemblerPredicate = isGFX10Only; let MIMGEncoding = MIMGEncGfx10Default; + let VAddrOperands = 1; let d16 = !if(BaseOpcode.HasD16, ?, 0); let nsa = 0; @@ -277,10 +291,11 @@ class MIMG_gfx10<int op, dag outs, string dns = ""> // Note that 1-dword addresses always use non-NSA variants. 
class MIMG_nsa_gfx10<int op, dag outs, int num_addrs, string dns=""> : MIMG<outs, dns>, MIMGe_gfx10<op> { - let SubtargetPredicate = isGFX10Plus; - let AssemblerPredicate = isGFX10Plus; + let SubtargetPredicate = isGFX10Only; + let AssemblerPredicate = isGFX10Only; let MIMGEncoding = MIMGEncGfx10NSA; + let VAddrOperands = num_addrs; MIMGNSAHelper nsah = MIMGNSAHelper<num_addrs>; dag AddrIns = nsah.AddrIns; @@ -290,11 +305,45 @@ class MIMG_nsa_gfx10<int op, dag outs, int num_addrs, string dns=""> let nsa = nsah.NSA; } +// Base class of all non-NSA gfx11 MIMG instructions. +class MIMG_gfx11<int op, dag outs, string dns = ""> + : MIMG<outs, dns>, MIMGe_gfx11<op> { + let SubtargetPredicate = isGFX11Plus; + let AssemblerPredicate = isGFX11Plus; + + let MIMGEncoding = MIMGEncGfx11Default; + let VAddrOperands = 1; + + let d16 = !if(BaseOpcode.HasD16, ?, 0); + let nsa = 0; +} + +// Base class for all NSA MIMG instructions. +// Note that 1-dword addresses always use non-NSA variants. +class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="", + list<RegisterClass> addr_types=[]> + : MIMG<outs, dns>, MIMGe_gfx11<op> { + let SubtargetPredicate = isGFX11Plus; + let AssemblerPredicate = isGFX11Plus; + + let MIMGEncoding = MIMGEncGfx11NSA; + let VAddrOperands = num_addrs; + + MIMGNSAHelper nsah = !if(!empty(addr_types), + MIMGNSAHelper<num_addrs>, + MIMGNSAHelper<num_addrs, addr_types>); + dag AddrIns = nsah.AddrIns; + string AddrAsm = nsah.AddrAsm; + + let d16 = !if(BaseOpcode.HasD16, ?, 0); + let nsa = nsah.NSA; +} + class MIMG_NoSampler_Helper <mimgopc op, string asm, RegisterClass dst_rc, RegisterClass addr_rc, string dns=""> - : MIMG_gfx6789 <op.BASE, (outs dst_rc:$vdata), dns> { + : MIMG_gfx6789 <op.GFX10M, (outs dst_rc:$vdata), dns> { let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), @@ -307,7 +356,7 @@ class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm, RegisterClass dst_rc, RegisterClass addr_rc, string dns=""> - : MIMG_gfx90a <op.BASE, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> { + : MIMG_gfx90a <op.GFX10M, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> { let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), @@ -319,7 +368,7 @@ class MIMG_NoSampler_Helper_gfx90a <mimgopc op, string asm, class MIMG_NoSampler_gfx10<mimgopc op, string opcode, RegisterClass DataRC, RegisterClass AddrRC, string dns=""> - : MIMG_gfx10<op.BASE, (outs DataRC:$vdata), dns> { + : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdata), dns> { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), @@ -331,7 +380,32 @@ class MIMG_NoSampler_gfx10<mimgopc op, string opcode, class MIMG_NoSampler_nsa_gfx10<mimgopc op, string opcode, RegisterClass DataRC, int num_addrs, string dns=""> - : MIMG_nsa_gfx10<op.BASE, (outs DataRC:$vdata), num_addrs, dns> { + : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdata), num_addrs, dns> { + let InOperandList = !con(AddrIns, + (ins SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_NoSampler_gfx11<mimgopc op, string 
opcode, + RegisterClass DataRC, RegisterClass AddrRC, + string dns=""> + : MIMG_gfx11<op.GFX11, (outs DataRC:$vdata), dns> { + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_NoSampler_nsa_gfx11<mimgopc op, string opcode, + RegisterClass DataRC, int num_addrs, + string dns=""> + : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> { let InOperandList = !con(AddrIns, (ins SReg_256:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, @@ -347,7 +421,7 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm, bit ExtendedImageInst = 1> { let ssamp = 0 in { let VAddrDwords = 1 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32, !if(enableDisasm, "AMDGPU", "")>; if !not(ExtendedImageInst) then @@ -356,30 +430,42 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm, def _V1_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VGPR_32, !if(enableDisasm, "AMDGPU", "")>; } + if op.HAS_GFX11 then { + def _V1_gfx11 : MIMG_NoSampler_gfx11<op, asm, dst_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + } } let VAddrDwords = 2 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>; if !not(ExtendedImageInst) then def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_64>; def _V2_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_64>; def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 2>; } + if op.HAS_GFX11 then { + def _V2_gfx11 : MIMG_NoSampler_gfx11<op, asm, dst_rc, VReg_64>; + def _V2_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11<op, asm, dst_rc, 2>; + } } let VAddrDwords = 3 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>; if !not(ExtendedImageInst) then def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_96>; def _V3_gfx10 : MIMG_NoSampler_gfx10<op, asm, dst_rc, VReg_96>; def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 3>; } + if op.HAS_GFX11 then { + def _V3_gfx11 : MIMG_NoSampler_gfx11<op, asm, dst_rc, VReg_96>; + def _V3_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11<op, asm, dst_rc, 3>; + } } let VAddrDwords = 4 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>; if !not(ExtendedImageInst) then def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a <op, asm, dst_rc, VReg_128>; @@ -387,6 +473,11 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm, def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10<op, asm, dst_rc, 4, !if(enableDisasm, "AMDGPU", "")>; } + if op.HAS_GFX11 then { + def _V4_gfx11 : MIMG_NoSampler_gfx11<op, asm, dst_rc, VReg_128>; + def _V4_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11<op, asm, dst_rc, 4, + !if(enableDisasm, "AMDGPU", "")>; + } } } } @@ -420,7 +511,7 @@ class MIMG_Store_Helper <mimgopc op, string asm, RegisterClass data_rc, RegisterClass addr_rc, string dns = ""> - : MIMG_gfx6789<op.BASE, (outs), dns> { + : MIMG_gfx6789<op.GFX10M, (outs), dns> { let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), @@ -433,7 +524,7 @@ class MIMG_Store_Helper_gfx90a 
<mimgopc op, string asm, RegisterClass data_rc, RegisterClass addr_rc, string dns = ""> - : MIMG_gfx90a<op.BASE, (outs), dns> { + : MIMG_gfx90a<op.GFX10M, (outs), dns> { let InOperandList = !con((ins getLdStRegisterOperand<data_rc>.ret:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, @@ -446,7 +537,7 @@ class MIMG_Store_Helper_gfx90a <mimgopc op, string asm, class MIMG_Store_gfx10<mimgopc op, string opcode, RegisterClass DataRC, RegisterClass AddrRC, string dns=""> - : MIMG_gfx10<op.BASE, (outs), dns> { + : MIMG_gfx10<op.GFX10M, (outs), dns> { let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), @@ -458,7 +549,33 @@ class MIMG_Store_gfx10<mimgopc op, string opcode, class MIMG_Store_nsa_gfx10<mimgopc op, string opcode, RegisterClass DataRC, int num_addrs, string dns=""> - : MIMG_nsa_gfx10<op.BASE, (outs), num_addrs, dns> { + : MIMG_nsa_gfx10<op.GFX10M, (outs), num_addrs, dns> { + let InOperandList = !con((ins DataRC:$vdata), + AddrIns, + (ins SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Store_gfx11<mimgopc op, string opcode, + RegisterClass DataRC, RegisterClass AddrRC, + string dns=""> + : MIMG_gfx11<op.GFX11, (outs), dns> { + let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Store_nsa_gfx11<mimgopc op, string opcode, + RegisterClass DataRC, int num_addrs, + string dns=""> + : MIMG_nsa_gfx11<op.GFX11, (outs), num_addrs, dns> { let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256:$srsrc, DMask:$dmask, @@ -475,39 +592,57 @@ multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm, let mayLoad = 0, mayStore = 1, hasSideEffects = 0, hasPostISelHook = 0, DisableWQM = 1, ssamp = 0 in { let VAddrDwords = 1 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32, !if(enableDisasm, "AMDGPU", "")>; + let hasPostISelHook = 1 in def _V1_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VGPR_32, !if(enableDisasm, "GFX90A", "")>; def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32, !if(enableDisasm, "AMDGPU", "")>; } + if op.HAS_GFX11 then { + def _V1_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + } } let VAddrDwords = 2 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>; def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64>; def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>; def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>; } + if op.HAS_GFX11 then { + def _V2_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_64>; + def _V2_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 2>; + } } let VAddrDwords = 3 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>; def _V3_gfx90a : 
MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96>; def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>; def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>; } + if op.HAS_GFX11 then { + def _V3_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_96>; + def _V3_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 3>; + } } let VAddrDwords = 4 in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>; def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128>; def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>; def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4, !if(enableDisasm, "AMDGPU", "")>; } + if op.HAS_GFX11 then { + def _V4_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_128>; + def _V4_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 4, + !if(enableDisasm, "AMDGPU", "")>; + } } } } @@ -582,7 +717,7 @@ class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterClass data_rc, class MIMG_Atomic_gfx10<mimgopc op, string opcode, RegisterClass DataRC, RegisterClass AddrRC, bit enableDisasm = 0> - : MIMG_gfx10<!cast<int>(op.BASE), (outs DataRC:$vdst), + : MIMG_gfx10<!cast<int>(op.GFX10M), (outs DataRC:$vdst), !if(enableDisasm, "AMDGPU", "")> { let Constraints = "$vdst = $vdata"; let AsmMatchConverter = "cvtMIMGAtomic"; @@ -596,7 +731,37 @@ class MIMG_Atomic_gfx10<mimgopc op, string opcode, class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, RegisterClass DataRC, int num_addrs, bit enableDisasm = 0> - : MIMG_nsa_gfx10<!cast<int>(op.BASE), (outs DataRC:$vdst), num_addrs, + : MIMG_nsa_gfx10<!cast<int>(op.GFX10M), (outs DataRC:$vdst), num_addrs, + !if(enableDisasm, "AMDGPU", "")> { + let Constraints = "$vdst = $vdata"; + let AsmMatchConverter = "cvtMIMGAtomic"; + + let InOperandList = !con((ins DataRC:$vdata), + AddrIns, + (ins SReg_256:$srsrc, DMask:$dmask, + Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe)); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; +} + +class MIMG_Atomic_gfx11<mimgopc op, string opcode, + RegisterClass DataRC, RegisterClass AddrRC, + bit enableDisasm = 0> + : MIMG_gfx11<!cast<int>(op.GFX11), (outs DataRC:$vdst), + !if(enableDisasm, "AMDGPU", "")> { + let Constraints = "$vdst = $vdata"; + let AsmMatchConverter = "cvtMIMGAtomic"; + + let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, + DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe); + let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; +} + +class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, + RegisterClass DataRC, int num_addrs, + bit enableDisasm = 0> + : MIMG_nsa_gfx11<!cast<int>(op.GFX11), (outs DataRC:$vdst), num_addrs, !if(enableDisasm, "AMDGPU", "")> { let Constraints = "$vdst = $vdata"; let AsmMatchConverter = "cvtMIMGAtomic"; @@ -622,11 +787,15 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, } if op.HAS_VI then { def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>; + let hasPostISelHook = 1 in def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>; } - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>; } + if op.HAS_GFX11 then { + def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>; + } } let VAddrDwords = 2 in { if op.HAS_SI then { @@ -636,10 +805,14 @@ 
multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>; def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64, 0>; } - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>; def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>; } + if op.HAS_GFX11 then { + def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>; + def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>; + } } let VAddrDwords = 3 in { if op.HAS_SI then { @@ -649,10 +822,14 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>; def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96, 0>; } - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>; def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>; } + if op.HAS_GFX11 then { + def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>; + def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>; + } } let VAddrDwords = 4 in { if op.HAS_SI then { @@ -662,10 +839,14 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>; def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128, 0>; } - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>; def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>; } + if op.HAS_GFX11 then { + def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>; + def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>; + } } } } @@ -691,7 +872,7 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0> class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc, RegisterClass src_rc, string dns=""> - : MIMG_gfx6789 <op.BASE, (outs dst_rc:$vdata), dns> { + : MIMG_gfx6789 <op.GFX10M, (outs dst_rc:$vdata), dns> { let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), @@ -702,7 +883,7 @@ class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc, class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc, RegisterClass src_rc, string dns=""> - : MIMG_gfx90a<op.BASE, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> { + : MIMG_gfx90a<op.GFX10M, (outs getLdStRegisterOperand<dst_rc>.ret:$vdata), dns> { let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da), @@ -714,7 +895,7 @@ class MIMG_Sampler_gfx90a<mimgopc op, string asm, RegisterClass dst_rc, class MIMG_Sampler_gfx10<mimgopc op, string opcode, RegisterClass DataRC, RegisterClass AddrRC, string dns=""> - : MIMG_gfx10<op.BASE, (outs DataRC:$vdata), dns> { + : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdata), dns> { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), @@ -727,7 +908,34 @@ class MIMG_Sampler_gfx10<mimgopc op, string opcode, class MIMG_Sampler_nsa_gfx10<mimgopc op, string opcode, RegisterClass DataRC, int num_addrs, string dns=""> - : MIMG_nsa_gfx10<op.BASE, (outs DataRC:$vdata), num_addrs, dns> { + : 
MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdata), num_addrs, dns> { + let InOperandList = !con(AddrIns, + (ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, + Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm" + #"$cpol$r128$a16$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Sampler_gfx11<mimgopc op, string opcode, + RegisterClass DataRC, RegisterClass AddrRC, + string dns=""> + : MIMG_gfx11<op.GFX11, (outs DataRC:$vdata), dns> { + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp, + DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, + R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm" + #"$cpol$r128$a16$tfe$lwe" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Sampler_nsa_gfx11<mimgopc op, string opcode, + RegisterClass DataRC, int num_addrs, + string dns=""> + : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> { let InOperandList = !con(AddrIns, (ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, @@ -823,7 +1031,7 @@ multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm, bit ExtendedImageInst = 1> { foreach addr = MIMG_Sampler_AddrSizes<sample>.MachineInstrs in { let VAddrDwords = addr.NumWords in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V # addr.NumWords : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass, !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; @@ -835,16 +1043,26 @@ multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm, : MIMG_Sampler_gfx10 <op, asm, dst_rc, addr.RegClass, !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; } + if op.HAS_GFX11 then { + def _V # addr.NumWords # _gfx11 + : MIMG_Sampler_gfx11 <op, asm, dst_rc, addr.RegClass, + !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; + } } } foreach addr = MIMG_Sampler_AddrSizes<sample>.NSAInstrs in { let VAddrDwords = addr.NumWords in { - if op.HAS_BASE then { + if op.HAS_GFX10M then { def _V # addr.NumWords # _nsa_gfx10 : MIMG_Sampler_nsa_gfx10<op, asm, dst_rc, addr.NumWords, !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; } + if !and(op.HAS_GFX11, !le(addr.NumWords, 5)) then { + def _V # addr.NumWords # _nsa_gfx11 + : MIMG_Sampler_nsa_gfx11<op, asm, dst_rc, addr.NumWords, + !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; + } } } } @@ -911,10 +1129,17 @@ class MIMG_IntersectRay_Helper<bit Is64, bit A16> { // when we only need 9, 11 or 12 depending on A16 field and ptr size. 
RegisterClass RegClass = MIMGAddrSize<num_addrs, 0>.RegClass; int VAddrDwords = !srl(RegClass.Size, 5); + + int gfx11_nsa_addrs = !if(A16, 4, 5); + RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32); + list<RegisterClass> gfx11_addr_types = + !if(A16, + [node_ptr_type, VGPR_32, VReg_96, VReg_96], + [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]); } class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC, bit A16> - : MIMG_gfx10<op.BASE, (outs VReg_128:$vdata), "AMDGPU"> { + : MIMG_gfx10<op.GFX10M, (outs VReg_128:$vdata), "AMDGPU"> { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc), !if(A16, (ins GFX10A16:$a16), (ins))); @@ -924,7 +1149,27 @@ class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC, b } class MIMG_IntersectRay_nsa_gfx10<mimgopc op, string opcode, int num_addrs, bit A16> - : MIMG_nsa_gfx10<op.BASE, (outs VReg_128:$vdata), num_addrs, "AMDGPU"> { + : MIMG_nsa_gfx10<op.GFX10M, (outs VReg_128:$vdata), num_addrs, "AMDGPU"> { + let InOperandList = !con(nsah.AddrIns, + (ins SReg_128:$srsrc), + !if(A16, (ins GFX10A16:$a16), (ins))); + let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc"#!if(A16, "$a16", ""); +} + +class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterClass AddrRC, bit A16> + : MIMG_gfx11<op.GFX11, (outs VReg_128:$vdata), "AMDGPU"> { + + let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc), + !if(A16, (ins GFX10A16:$a16), (ins))); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc"#!if(A16, "$a16", ""); + + let nsa = 0; +} + +class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs, + bit A16, list<RegisterClass> addr_types> + : MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "AMDGPU", + addr_types> { let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc), !if(A16, (ins GFX10A16:$a16), (ins))); @@ -936,9 +1181,7 @@ multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit A16> { def "" : MIMGBaseOpcode { let BVH = 1; } - let SubtargetPredicate = HasGFX10_AEncoding, - AssemblerPredicate = HasGFX10_AEncoding, - AsmMatchConverter = !if(A16, "cvtIntersectRay", ""), + let AsmMatchConverter = !if(A16, "cvtIntersectRay", ""), dmask = 0xf, unorm = 1, d16 = 0, @@ -955,142 +1198,183 @@ multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit A16> { def _sa_gfx10 : MIMG_IntersectRay_gfx10<op, opcode, info.RegClass, A16> { let VAddrDwords = info.VAddrDwords; } + def _sa_gfx11 : MIMG_IntersectRay_gfx11<op, opcode, info.RegClass, A16> { + let VAddrDwords = info.VAddrDwords; + } def _nsa_gfx10 : MIMG_IntersectRay_nsa_gfx10<op, opcode, info.num_addrs, A16> { let VAddrDwords = info.num_addrs; } + def _nsa_gfx11 : MIMG_IntersectRay_nsa_gfx11<op, opcode, + info.gfx11_nsa_addrs, A16, + info.gfx11_addr_types> { + let VAddrDwords = info.num_addrs; + } + } +} + +multiclass MIMG_MSAA_Load <mimgopc op, string asm> { + def "" : MIMGBaseOpcode { + let HasD16 = 1; + let Gather4 = 1; /* for appropriate dmask handling */ + let MSAA = 1; + } + + let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), + Gather4 = 1, hasPostISelHook = 0, mayLoad = 1 in { + let VDataDwords = 2 in + defm _V2 : MIMG_NoSampler_Src_Helper<op, asm, VReg_64, 0>; /* packed D16 */ + let VDataDwords = 3 in + defm _V3 : MIMG_NoSampler_Src_Helper<op, asm, VReg_96, 0>; /* packed D16 + tfe */ + let VDataDwords = 4 in + defm _V4 : MIMG_NoSampler_Src_Helper<op, asm, VReg_128, 1>; + let VDataDwords = 5 in + defm _V5 : MIMG_NoSampler_Src_Helper<op, asm, VReg_160, 0>; } } 
//===----------------------------------------------------------------------===// // MIMG Instructions //===----------------------------------------------------------------------===// -defm IMAGE_LOAD : MIMG_NoSampler <mimgopc<0x00>, "image_load", 1>; -defm IMAGE_LOAD_MIP : MIMG_NoSampler <mimgopc<0x01>, "image_load_mip", 1, 1>; -defm IMAGE_LOAD_PCK : MIMG_NoSampler <mimgopc<0x02>, "image_load_pck", 0>; -defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <mimgopc<0x03>, "image_load_pck_sgn", 0>; -defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <mimgopc<0x04>, "image_load_mip_pck", 0, 1>; -defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <mimgopc<0x05>, "image_load_mip_pck_sgn", 0, 1>; -defm IMAGE_STORE : MIMG_Store <mimgopc<0x08>, "image_store", 1>; -defm IMAGE_STORE_MIP : MIMG_Store <mimgopc<0x09>, "image_store_mip", 1, 1>; -defm IMAGE_STORE_PCK : MIMG_Store <mimgopc<0x0a>, "image_store_pck", 0>; -defm IMAGE_STORE_MIP_PCK : MIMG_Store <mimgopc<0x0b>, "image_store_mip_pck", 0, 1>; +let OtherPredicates = [HasImageInsts] in { -defm IMAGE_GET_RESINFO : MIMG_NoSampler <mimgopc<0x0e>, "image_get_resinfo", 0, 1, 1>; +defm IMAGE_LOAD : MIMG_NoSampler <mimgopc<0x00, 0x00>, "image_load", 1>; +defm IMAGE_LOAD_MIP : MIMG_NoSampler <mimgopc<0x01, 0x01>, "image_load_mip", 1, 1>; +defm IMAGE_LOAD_PCK : MIMG_NoSampler <mimgopc<0x02, 0x02>, "image_load_pck", 0>; +defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <mimgopc<0x03, 0x03>, "image_load_pck_sgn", 0>; +defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <mimgopc<0x04, 0x04>, "image_load_mip_pck", 0, 1>; +defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <mimgopc<0x05, 0x05>, "image_load_mip_pck_sgn", 0, 1>; +defm IMAGE_STORE : MIMG_Store <mimgopc<0x06, 0x08>, "image_store", 1>; +defm IMAGE_STORE_MIP : MIMG_Store <mimgopc<0x07, 0x09>, "image_store_mip", 1, 1>; +defm IMAGE_STORE_PCK : MIMG_Store <mimgopc<0x08, 0x0a>, "image_store_pck", 0>; +defm IMAGE_STORE_MIP_PCK : MIMG_Store <mimgopc<0x09, 0x0b>, "image_store_mip_pck", 0, 1>; -defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimgopc<0x0f, 0x10, 0x0f>, "image_atomic_swap">; -defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimgopc<0x10, 0x11, 0x10>, "image_atomic_cmpswap", 1>; -defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimgopc<0x11, 0x12, 0x11>, "image_atomic_add">; -defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimgopc<0x12, 0x13, 0x12>, "image_atomic_sub">; -defm IMAGE_ATOMIC_RSUB : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x13>, "image_atomic_rsub">; -defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimgopc<0x14>, "image_atomic_smin">; -defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimgopc<0x15>, "image_atomic_umin">; -defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimgopc<0x16>, "image_atomic_smax">; -defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimgopc<0x17>, "image_atomic_umax">; -defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimgopc<0x18>, "image_atomic_and">; -defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimgopc<0x19>, "image_atomic_or">; -defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimgopc<0x1a>, "image_atomic_xor">; -defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimgopc<0x1b>, "image_atomic_inc">; -defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimgopc<0x1c>, "image_atomic_dec">; -defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 1, 1>; -defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>; -defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>; +defm IMAGE_GET_RESINFO : MIMG_NoSampler <mimgopc<0x17, 0x0e>, "image_get_resinfo", 0, 1, 1>; -defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x20>, AMDGPUSample>; +defm IMAGE_ATOMIC_SWAP : 
MIMG_Atomic <mimgopc<0x0a, 0x0f, 0x10, 0x0f>, "image_atomic_swap">; +defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimgopc<0x0b, 0x10, 0x11, 0x10>, "image_atomic_cmpswap", 1>; +defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimgopc<0x0c, 0x11, 0x12, 0x11>, "image_atomic_add">; +defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimgopc<0x0d, 0x12, 0x13, 0x12>, "image_atomic_sub">; +defm IMAGE_ATOMIC_RSUB : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, MIMG.NOP, 0x13>, "image_atomic_rsub">; +defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimgopc<0x0e, 0x14>, "image_atomic_smin">; +defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimgopc<0x0f, 0x15>, "image_atomic_umin">; +defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimgopc<0x10, 0x16>, "image_atomic_smax">; +defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimgopc<0x11, 0x17>, "image_atomic_umax">; +defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimgopc<0x12, 0x18>, "image_atomic_and">; +defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimgopc<0x13, 0x19>, "image_atomic_or">; +defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimgopc<0x14, 0x1a>, "image_atomic_xor">; +defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimgopc<0x15, 0x1b>, "image_atomic_inc">; +defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimgopc<0x16, 0x1c>, "image_atomic_dec">; +defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<MIMG.NOP, 0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 1, 1>; +defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<MIMG.NOP, 0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>; +defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<MIMG.NOP, 0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>; + +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x20>, AMDGPUSample>; let OtherPredicates = [HasExtendedImageInsts] in { -defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <mimgopc<0x21>, AMDGPUSample_cl>; -defm IMAGE_SAMPLE_D : MIMG_Sampler <mimgopc<0x22>, AMDGPUSample_d>; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <mimgopc<0x23>, AMDGPUSample_d_cl>; -defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0xa2>, AMDGPUSample_d, 0, 1>; -defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0xa3>, AMDGPUSample_d_cl, 0, 1>; -defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x24>, AMDGPUSample_l>; -defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <mimgopc<0x25>, AMDGPUSample_b>; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <mimgopc<0x26>, AMDGPUSample_b_cl>; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler <mimgopc<0x27>, AMDGPUSample_lz>; -defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <mimgopc<0x28>, AMDGPUSample_c>; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <mimgopc<0x29>, AMDGPUSample_c_cl>; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <mimgopc<0x2a>, AMDGPUSample_c_d>; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <mimgopc<0x2b>, AMDGPUSample_c_d_cl>; -defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0xaa>, AMDGPUSample_c_d, 0, 1>; -defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0xab>, AMDGPUSample_c_d_cl, 0, 1>; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <mimgopc<0x2c>, AMDGPUSample_c_l>; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <mimgopc<0x2d>, AMDGPUSample_c_b>; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <mimgopc<0x2e>, AMDGPUSample_c_b_cl>; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <mimgopc<0x2f>, AMDGPUSample_c_lz>; -defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <mimgopc<0x30>, AMDGPUSample_o>; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <mimgopc<0x31>, AMDGPUSample_cl_o>; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler <mimgopc<0x32>, AMDGPUSample_d_o>; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <mimgopc<0x33>, AMDGPUSample_d_cl_o>; -defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0xb2>, AMDGPUSample_d_o, 0, 1>; -defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0xb3>, 
AMDGPUSample_d_cl_o, 0, 1>; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler <mimgopc<0x34>, AMDGPUSample_l_o>; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <mimgopc<0x35>, AMDGPUSample_b_o>; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x36>, AMDGPUSample_b_cl_o>; -defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <mimgopc<0x37>, AMDGPUSample_lz_o>; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <mimgopc<0x38>, AMDGPUSample_c_o>; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <mimgopc<0x39>, AMDGPUSample_c_cl_o>; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <mimgopc<0x3a>, AMDGPUSample_c_d_o>; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <mimgopc<0x3b>, AMDGPUSample_c_d_cl_o>; -defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0xba>, AMDGPUSample_c_d_o, 0, 1>; -defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0xbb>, AMDGPUSample_c_d_cl_o, 0, 1>; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <mimgopc<0x3c>, AMDGPUSample_c_l_o>; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x3e>, AMDGPUSample_c_b_cl_o>; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <mimgopc<0x3d>, AMDGPUSample_c_b_o>; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <mimgopc<0x3f>, AMDGPUSample_c_lz_o>; -defm IMAGE_GATHER4 : MIMG_Gather_WQM <mimgopc<0x40>, AMDGPUSample>; -defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <mimgopc<0x41>, AMDGPUSample_cl>; -defm IMAGE_GATHER4_L : MIMG_Gather <mimgopc<0x44>, AMDGPUSample_l>; -defm IMAGE_GATHER4_B : MIMG_Gather_WQM <mimgopc<0x45>, AMDGPUSample_b>; -defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <mimgopc<0x46>, AMDGPUSample_b_cl>; -defm IMAGE_GATHER4_LZ : MIMG_Gather <mimgopc<0x47>, AMDGPUSample_lz>; -defm IMAGE_GATHER4_C : MIMG_Gather_WQM <mimgopc<0x48>, AMDGPUSample_c>; -defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <mimgopc<0x49>, AMDGPUSample_c_cl>; -defm IMAGE_GATHER4_C_L : MIMG_Gather <mimgopc<0x4c>, AMDGPUSample_c_l>; -defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <mimgopc<0x4d>, AMDGPUSample_c_b>; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <mimgopc<0x4e>, AMDGPUSample_c_b_cl>; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <mimgopc<0x4f>, AMDGPUSample_c_lz>; -defm IMAGE_GATHER4_O : MIMG_Gather_WQM <mimgopc<0x50>, AMDGPUSample_o>; -defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <mimgopc<0x51>, AMDGPUSample_cl_o>; -defm IMAGE_GATHER4_L_O : MIMG_Gather <mimgopc<0x54>, AMDGPUSample_l_o>; -defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <mimgopc<0x55>, AMDGPUSample_b_o>; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <mimgopc<0x56>, AMDGPUSample_b_cl_o>; -defm IMAGE_GATHER4_LZ_O : MIMG_Gather <mimgopc<0x57>, AMDGPUSample_lz_o>; -defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <mimgopc<0x58>, AMDGPUSample_c_o>; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <mimgopc<0x59>, AMDGPUSample_c_cl_o>; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <mimgopc<0x5c>, AMDGPUSample_c_l_o>; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <mimgopc<0x5d>, AMDGPUSample_c_b_o>; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <mimgopc<0x5e>, AMDGPUSample_c_b_cl_o>; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <mimgopc<0x5f>, AMDGPUSample_c_lz_o>; -//defm IMAGE_GATHER4H : MIMG_Gather_WQM <mimgopc<0x61>, ?>; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <mimgopc<0x40, 0x21>, AMDGPUSample_cl>; +defm IMAGE_SAMPLE_D : MIMG_Sampler <mimgopc<0x1c, 0x22>, AMDGPUSample_d>; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <mimgopc<0x41, 0x23>, AMDGPUSample_d_cl>; +defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x1d, 0x24>, AMDGPUSample_l>; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <mimgopc<0x1e, 0x25>, AMDGPUSample_b>; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <mimgopc<0x42, 0x26>, AMDGPUSample_b_cl>; +defm IMAGE_SAMPLE_LZ 
: MIMG_Sampler <mimgopc<0x1f, 0x27>, AMDGPUSample_lz>; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <mimgopc<0x20, 0x28>, AMDGPUSample_c>; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <mimgopc<0x43, 0x29>, AMDGPUSample_c_cl>; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <mimgopc<0x21, 0x2a>, AMDGPUSample_c_d>; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <mimgopc<0x44, 0x2b>, AMDGPUSample_c_d_cl>; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <mimgopc<0x22, 0x2c>, AMDGPUSample_c_l>; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <mimgopc<0x23, 0x2d>, AMDGPUSample_c_b>; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <mimgopc<0x45, 0x2e>, AMDGPUSample_c_b_cl>; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <mimgopc<0x24, 0x2f>, AMDGPUSample_c_lz>; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <mimgopc<0x25, 0x30>, AMDGPUSample_o>; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <mimgopc<0x46, 0x31>, AMDGPUSample_cl_o>; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <mimgopc<0x26, 0x32>, AMDGPUSample_d_o>; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <mimgopc<0x47, 0x33>, AMDGPUSample_d_cl_o>; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <mimgopc<0x27, 0x34>, AMDGPUSample_l_o>; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <mimgopc<0x28, 0x35>, AMDGPUSample_b_o>; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x48, 0x36>, AMDGPUSample_b_cl_o>; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <mimgopc<0x29, 0x37>, AMDGPUSample_lz_o>; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <mimgopc<0x2a, 0x38>, AMDGPUSample_c_o>; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <mimgopc<0x49, 0x39>, AMDGPUSample_c_cl_o>; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <mimgopc<0x2b, 0x3a>, AMDGPUSample_c_d_o>; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <mimgopc<0x4a, 0x3b>, AMDGPUSample_c_d_cl_o>; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <mimgopc<0x2c, 0x3c>, AMDGPUSample_c_l_o>; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x4b, 0x3e>, AMDGPUSample_c_b_cl_o>; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <mimgopc<0x2d, 0x3d>, AMDGPUSample_c_b_o>; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <mimgopc<0x2e, 0x3f>, AMDGPUSample_c_lz_o>; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <mimgopc<0x2f, 0x40>, AMDGPUSample>; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <mimgopc<0x60, 0x41>, AMDGPUSample_cl>; +defm IMAGE_GATHER4_L : MIMG_Gather <mimgopc<0x30, 0x44>, AMDGPUSample_l>; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <mimgopc<0x31, 0x45>, AMDGPUSample_b>; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <mimgopc<0x61, 0x46>, AMDGPUSample_b_cl>; +defm IMAGE_GATHER4_LZ : MIMG_Gather <mimgopc<0x32, 0x47>, AMDGPUSample_lz>; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <mimgopc<0x33, 0x48>, AMDGPUSample_c>; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <mimgopc<0x62, 0x49>, AMDGPUSample_c_cl>; +defm IMAGE_GATHER4_C_L : MIMG_Gather <mimgopc<0x63, 0x4c>, AMDGPUSample_c_l>; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <mimgopc<0x64, 0x4d>, AMDGPUSample_c_b>; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <mimgopc<0x65, 0x4e>, AMDGPUSample_c_b_cl>; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <mimgopc<0x34, 0x4f>, AMDGPUSample_c_lz>; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <mimgopc<0x35, 0x50>, AMDGPUSample_o>; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x51>, AMDGPUSample_cl_o>; +defm IMAGE_GATHER4_L_O : MIMG_Gather <mimgopc<MIMG.NOP, 0x54>, AMDGPUSample_l_o>; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x55>, AMDGPUSample_b_o>; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <mimgopc<MIMG.NOP, 0x56>, AMDGPUSample_b_cl_o>; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <mimgopc<0x36, 0x57>, AMDGPUSample_lz_o>; +defm 
IMAGE_GATHER4_C_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x58>, AMDGPUSample_c_o>; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x59>, AMDGPUSample_c_cl_o>; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <mimgopc<MIMG.NOP, 0x5c>, AMDGPUSample_c_l_o>; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x5d>, AMDGPUSample_c_b_o>; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x5e>, AMDGPUSample_c_b_cl_o>; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <mimgopc<0x37, 0x5f>, AMDGPUSample_c_lz_o>; +//defm IMAGE_GATHER4H : MIMG_Gather_WQM <mimgopc<0x90, 0x61>, ?>; -defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x60>, AMDGPUSample, 1, 0, 1, "image_get_lod">; +defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x38, 0x60>, AMDGPUSample, 1, 0, 1, "image_get_lod">; -defm IMAGE_SAMPLE_CD : MIMG_Sampler <mimgopc<0x68>, AMDGPUSample_cd>; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <mimgopc<0x69>, AMDGPUSample_cd_cl>; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <mimgopc<0x6a>, AMDGPUSample_c_cd>; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <mimgopc<0x6b>, AMDGPUSample_c_cd_cl>; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <mimgopc<0x6c>, AMDGPUSample_cd_o>; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <mimgopc<0x6d>, AMDGPUSample_cd_cl_o>; -defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <mimgopc<0x6e>, AMDGPUSample_c_cd_o>; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <mimgopc<0x6f>, AMDGPUSample_c_cd_cl_o>; -defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<0xe8>, AMDGPUSample_cd, 0, 1>; -defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<0xe9>, AMDGPUSample_cd_cl, 0, 1>; -defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<0xea>, AMDGPUSample_c_cd, 0, 1>; -defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<0xeb>, AMDGPUSample_c_cd_cl, 0, 1>; -defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<0xec>, AMDGPUSample_cd_o, 0, 1>; -defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<0xed>, AMDGPUSample_cd_cl_o, 0, 1>; -defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<0xee>, AMDGPUSample_c_cd_o, 0, 1>; -defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>; +defm IMAGE_SAMPLE_CD : MIMG_Sampler <mimgopc<MIMG.NOP, 0x68>, AMDGPUSample_cd>; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <mimgopc<MIMG.NOP, 0x69>, AMDGPUSample_cd_cl>; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6a>, AMDGPUSample_c_cd>; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6b>, AMDGPUSample_c_cd_cl>; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6c>, AMDGPUSample_cd_o>; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6d>, AMDGPUSample_cd_cl_o>; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6e>, AMDGPUSample_c_cd_o>; +defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6f>, AMDGPUSample_c_cd_cl_o>; } // End OtherPredicates = [HasExtendedImageInsts] -//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; -//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; -let SubtargetPredicate = HasGFX10_AEncoding in -defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler <mimgopc<0x80>, "image_msaa_load", 1, 0, 0, 1>; +let OtherPredicates = [HasExtendedImageInsts,HasG16] in { +defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0x39, 0xa2>, AMDGPUSample_d, 0, 1>; +defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0x5f, 0xa3>, AMDGPUSample_d_cl, 0, 1>; +defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0x3a, 0xaa>, AMDGPUSample_c_d, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0x54, 0xab>, 
AMDGPUSample_c_d_cl, 0, 1>; +defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0x3b, 0xb2>, AMDGPUSample_d_o, 0, 1>; +defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x55, 0xb3>, AMDGPUSample_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0x3c, 0xba>, AMDGPUSample_c_d_o, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x56, 0xbb>, AMDGPUSample_c_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xe8>, AMDGPUSample_cd, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xe9>, AMDGPUSample_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xea>, AMDGPUSample_c_cd, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xeb>, AMDGPUSample_c_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xec>, AMDGPUSample_cd_o, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xed>, AMDGPUSample_cd_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xee>, AMDGPUSample_c_cd_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>; +} // End OtherPredicates = [HasExtendedImageInsts,HasG16] + +//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", mimgopc<0x7e>>; +//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", mimgopc<0x7f>>; + +let SubtargetPredicate = isGFX10Only, OtherPredicates = [HasGFX10_AEncoding] in +defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler <mimgopc<MIMG.NOP, 0x80>, "image_msaa_load", 1, 0, 0, 1>; + +let OtherPredicates = [HasGFX10_AEncoding] in +defm IMAGE_MSAA_LOAD : MIMG_MSAA_Load <mimgopc<0x18, MIMG.NOP>, "image_msaa_load">; + +let OtherPredicates = [HasGFX10_AEncoding] in { +defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x19, 0xe6>, "image_bvh_intersect_ray", 0, 0>; +defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x19, 0xe6>, "image_bvh_intersect_ray", 0, 1>; +defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 0>; +defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 1>; +} // End OtherPredicates = [HasGFX10_AEncoding] -defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 0, 0>; -defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0xe6>, "image_bvh_intersect_ray", 0, 1>; -defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0xe7>, "image_bvh64_intersect_ray", 1, 0>; -defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0xe7>, "image_bvh64_intersect_ray", 1, 1>; +} // End let OtherPredicates = [HasImageInsts] /********** ========================================= **********/ /********** Table of dimension-aware image intrinsics **********/ diff --git a/llvm/lib/Target/AMDGPU/R600.h b/llvm/lib/Target/AMDGPU/R600.h index 2b483ae63da9..5dfbf8f1ef95 100644 --- a/llvm/lib/Target/AMDGPU/R600.h +++ b/llvm/lib/Target/AMDGPU/R600.h @@ -26,7 +26,7 @@ FunctionPass *createR600EmitClauseMarkers(); FunctionPass *createR600ClauseMergePass(); FunctionPass *createR600Packetizer(); FunctionPass *createR600ControlFlowFinalizer(); -FunctionPass *createAMDGPUCFGStructurizerPass(); +FunctionPass *createR600MachineCFGStructurizerPass(); FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel); ModulePass *createR600OpenCLImageTypeLoweringPass(); diff --git a/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp 
b/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp index c19e3c41485e..afcb6b4d65f8 100644 --- a/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp @@ -111,7 +111,7 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) { MCContext &Context = getObjFileLowering().getContext(); MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(ConfigSection); + OutStreamer->switchSection(ConfigSection); EmitProgramInfoR600(MF); @@ -120,7 +120,7 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (isVerbose()) { MCSectionELF *CommentSection = Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(CommentSection); + OutStreamer->switchSection(CommentSection); R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); OutStreamer->emitRawComment( diff --git a/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp index 715fd69fc7ae..2b85df8ac6cf 100644 --- a/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ b/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative maneer. +/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative manner. /// This pass is merging consecutive CFAlus where applicable. /// It needs to be called after IfCvt for best results. //===----------------------------------------------------------------------===// @@ -15,6 +15,7 @@ #include "MCTargetDesc/R600MCTargetDesc.h" #include "R600.h" #include "R600Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 8a48a67b829c..4bf38a3c6ceb 100644 --- a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -16,6 +16,7 @@ #include "R600.h" #include "R600MachineFunctionInfo.h" #include "R600Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include <set> using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index b9ca7f928d56..ef67e5c937dc 100644 --- a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -17,6 +17,7 @@ #include "R600.h" #include "R600Defines.h" #include "R600Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; @@ -327,9 +328,9 @@ char R600EmitClauseMarkers::ID = 0; } // end anonymous namespace INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers", - "R600 Emit Clause Markters", false, false) + "R600 Emit Clause Markers", false, false) INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers", - "R600 Emit Clause Markters", false, false) + "R600 Emit Clause Markers", false, false) FunctionPass *llvm::createR600EmitClauseMarkers() { return new R600EmitClauseMarkers(); diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index 194879fef53c..ef2d049f9175 100644 --- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -17,6 +17,8 @@ #include "R600.h" #include "R600Defines.h" #include "R600Subtarget.h" +#include 
"llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp b/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp index abd4086db62c..fd8cecab90da 100644 --- a/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp @@ -8,6 +8,7 @@ #include "R600FrameLowering.h" #include "R600Subtarget.h" +#include "llvm/CodeGen/MachineFrameInfo.h" using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index bd757e9e3d70..bf52f7830ad7 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -42,39 +42,26 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, computeRegisterProperties(Subtarget->getRegisterInfo()); // Legalize loads and stores to the private address space. - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v2i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom); // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address // spaces, so it is custom lowered to handle those where it isn't. - for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); - } + for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(Op, VT, MVT::i1, Promote); + setLoadExtAction(Op, VT, MVT::i8, Custom); + setLoadExtAction(Op, VT, MVT::i16, Custom); + } // Workaround for LegalizeDAG asserting on expansion of i1 vector loads. 
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i32, + MVT::v2i1, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v4i32, + MVT::v4i1, Expand); - setOperationAction(ISD::STORE, MVT::i8, Custom); - setOperationAction(ISD::STORE, MVT::i32, Custom); - setOperationAction(ISD::STORE, MVT::v2i32, Custom); - setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setOperationAction(ISD::STORE, {MVT::i8, MVT::i32, MVT::v2i32, MVT::v4i32}, + Custom); setTruncStoreAction(MVT::i32, MVT::i8, Custom); setTruncStoreAction(MVT::i32, MVT::i16, Custom); @@ -96,55 +83,34 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand); // Set condition code actions - setCondCodeAction(ISD::SETO, MVT::f32, Expand); - setCondCodeAction(ISD::SETUO, MVT::f32, Expand); - setCondCodeAction(ISD::SETLT, MVT::f32, Expand); - setCondCodeAction(ISD::SETLE, MVT::f32, Expand); - setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); - setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); - setCondCodeAction(ISD::SETONE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); - setCondCodeAction(ISD::SETULT, MVT::f32, Expand); - setCondCodeAction(ISD::SETULE, MVT::f32, Expand); + setCondCodeAction({ISD::SETO, ISD::SETUO, ISD::SETLT, ISD::SETLE, ISD::SETOLT, + ISD::SETOLE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGE, + ISD::SETUGT, ISD::SETULT, ISD::SETULE}, + MVT::f32, Expand); - setCondCodeAction(ISD::SETLE, MVT::i32, Expand); - setCondCodeAction(ISD::SETLT, MVT::i32, Expand); - setCondCodeAction(ISD::SETULE, MVT::i32, Expand); - setCondCodeAction(ISD::SETULT, MVT::i32, Expand); + setCondCodeAction({ISD::SETLE, ISD::SETLT, ISD::SETULE, ISD::SETULT}, + MVT::i32, Expand); - setOperationAction(ISD::FCOS, MVT::f32, Custom); - setOperationAction(ISD::FSIN, MVT::f32, Custom); + setOperationAction({ISD::FCOS, ISD::FSIN}, MVT::f32, Custom); - setOperationAction(ISD::SETCC, MVT::v4i32, Expand); - setOperationAction(ISD::SETCC, MVT::v2i32, Expand); + setOperationAction(ISD::SETCC, {MVT::v4i32, MVT::v2i32}, Expand); - setOperationAction(ISD::BR_CC, MVT::i32, Expand); - setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::BR_CC, {MVT::i32, MVT::f32}, Expand); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::FSUB, MVT::f32, Expand); - setOperationAction(ISD::FCEIL, MVT::f64, Custom); - setOperationAction(ISD::FTRUNC, MVT::f64, Custom); - setOperationAction(ISD::FRINT, MVT::f64, Custom); - setOperationAction(ISD::FFLOOR, MVT::f64, Custom); + setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR}, + MVT::f64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, {MVT::f32, MVT::i32}, Custom); - setOperationAction(ISD::SETCC, MVT::i32, Expand); - setOperationAction(ISD::SETCC, MVT::f32, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); - 
setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::SETCC, {MVT::i32, MVT::f32}, Expand); + setOperationAction({ISD::FP_TO_UINT, ISD::FP_TO_SINT}, {MVT::i1, MVT::i64}, + Custom); - setOperationAction(ISD::SELECT, MVT::i32, Expand); - setOperationAction(ISD::SELECT, MVT::f32, Expand); - setOperationAction(ISD::SELECT, MVT::v2i32, Expand); - setOperationAction(ISD::SELECT, MVT::v4i32, Expand); + setOperationAction(ISD::SELECT, {MVT::i32, MVT::f32, MVT::v2i32, MVT::v4i32}, + Expand); // ADD, SUB overflow. // TODO: turn these into Legal? @@ -158,56 +124,43 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, if (!Subtarget->hasBFE()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i1, MVT::v4i1}, Expand); if (!Subtarget->hasBFE()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i8, MVT::v4i8}, Expand); if (!Subtarget->hasBFE()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v4i16}, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i32, MVT::v4i32}, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, + {MVT::v2i32, MVT::v2f32, MVT::v4i32, MVT::v4f32}, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, + {MVT::v2i32, MVT::v2f32, MVT::v4i32, MVT::v4f32}, Custom); // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 // to be Legal/Custom in order to avoid library calls. 
- setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); - setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); - setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + setOperationAction({ISD::SHL_PARTS, ISD::SRL_PARTS, ISD::SRA_PARTS}, MVT::i32, + Custom); - if (!Subtarget->hasFMA()) { - setOperationAction(ISD::FMA, MVT::f32, Expand); - setOperationAction(ISD::FMA, MVT::f64, Expand); - } + if (!Subtarget->hasFMA()) + setOperationAction(ISD::FMA, {MVT::f32, MVT::f64}, Expand); // FIXME: May need no denormals check setOperationAction(ISD::FMAD, MVT::f32, Legal); - if (!Subtarget->hasBFI()) { + if (!Subtarget->hasBFI()) // fcopysign can be done in a single instruction with BFI. - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - } + setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand); if (!Subtarget->hasBCNT(32)) setOperationAction(ISD::CTPOP, MVT::i32, Expand); @@ -229,30 +182,22 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; - for (MVT VT : ScalarIntVTs) { - setOperationAction(ISD::ADDC, VT, Expand); - setOperationAction(ISD::SUBC, VT, Expand); - setOperationAction(ISD::ADDE, VT, Expand); - setOperationAction(ISD::SUBE, VT, Expand); - } + for (MVT VT : ScalarIntVTs) + setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, + Expand); // LLVM will expand these to atomic_cmp_swap(0) // and atomic_swap, respectively. - setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand); + setOperationAction({ISD::ATOMIC_LOAD, ISD::ATOMIC_STORE}, MVT::i32, Expand); // We need to custom lower some of the intrinsics - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction({ISD::INTRINSIC_VOID, ISD::INTRINSIC_WO_CHAIN}, MVT::Other, + Custom); setSchedulingPreference(Sched::Source); - setTargetDAGCombine(ISD::FP_ROUND); - setTargetDAGCombine(ISD::FP_TO_SINT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine({ISD::FP_ROUND, ISD::FP_TO_SINT, ISD::EXTRACT_VECTOR_ELT, + ISD::SELECT_CC, ISD::INSERT_VECTOR_ELT, ISD::LOAD}); } static inline bool isEOP(MachineBasicBlock::iterator I) { @@ -995,7 +940,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const /// LLVM generates byte-addressed pointers. For indirect addressing, we need to /// convert these pointers to a register index. Each register holds /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the -/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used +/// \p StackWidth, which tells us how many of the 4 sub-registers will be used /// for indirect addressing. 
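NOTE: The R600ISelLowering constructor hunks above are a mechanical de-duplication: runs of per-opcode, per-type calls collapse onto the list-taking overloads of setOperationAction, setLoadExtAction, setCondCodeAction and setTargetDAGCombine. A minimal sketch of the pattern, assuming the ArrayRef-accepting overloads TargetLoweringBase gained around this time (a braced list binds to ArrayRef):

    // Before: one call per opcode/type pair.
    setOperationAction(ISD::FCOS, MVT::f32, Custom);
    setOperationAction(ISD::FSIN, MVT::f32, Custom);

    // After: a single call covering the whole set; the registered
    // legalization actions are unchanged.
    setOperationAction({ISD::FCOS, ISD::FSIN}, MVT::f32, Custom);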
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, @@ -1100,7 +1045,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, DAG.getConstant(3, DL, MVT::i32)); - // TODO: Contrary to the name of the functiom, + // TODO: Contrary to the name of the function, // it also handles sub i32 non-truncating stores (like i1) SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Store->getValue()); @@ -1163,9 +1108,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); // TODO: can the chain be replaced without creating a new store? SDValue NewStore = DAG.getTruncStore( - NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), - MemVT, StoreNode->getAlignment(), - StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo()); + NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT, + StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(), + StoreNode->getAAInfo()); StoreNode = cast<StoreSDNode>(NewStore); } @@ -1417,7 +1362,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); SDValue NewLoad = DAG.getExtLoad( ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT, - LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags()); + LoadNode->getAlign(), LoadNode->getMemOperand()->getFlags()); SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, DAG.getValueType(MemVT)); @@ -1610,7 +1555,7 @@ static SDValue CompactSwizzlableVector( if (NewBldVec[i].isUndef()) // We mask write here to teach later passes that the ith element of this // vector is undef. Thus we can use it to reduce 128 bits reg usage, - // break false dependencies and additionnaly make assembly easier to read. + // break false dependencies and additionally make assembly easier to read. 
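NOTE: The LowerSTORE and LowerLOAD hunks above also migrate off the deprecated MemSDNode::getAlignment(), which returned a raw byte count, onto getAlign(), which returns the typed llvm::Align wrapper (stored as a log2 value, so always a power of two). Comparisons then go through Align's own operators, as in the constBufferLoad hunk just below:

    // Old: untyped byte count.
    if (LoadNode->getAlignment() < 4)
      return SDValue();

    // New: typed comparison; Align(4) asserts power-of-two-ness and
    // cannot be confused with an ordinary integer.
    if (LoadNode->getAlign() < Align(4))
      return SDValue();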
RemapSwizzle[i] = 7; // SEL_MASK_WRITE if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) { if (C->isZero()) { @@ -1714,7 +1659,7 @@ SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block, if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 || !ISD::isNON_EXTLoad(LoadNode)) return SDValue(); - if (LoadNode->getAlignment() < 4) + if (LoadNode->getAlign() < Align(4)) return SDValue(); int ConstantBlock = ConstantAddressBlock(Block); diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index aec8b1ae4837..d04ec6490aae 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -18,6 +18,7 @@ #include "R600Defines.h" #include "R600Subtarget.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineFrameInfo.h" using namespace llvm; @@ -1469,21 +1470,3 @@ void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand, FlagOp.setImm(InstFlags); } } - -unsigned R600InstrInfo::getAddressSpaceForPseudoSourceKind( - unsigned Kind) const { - switch (Kind) { - case PseudoSourceValue::Stack: - case PseudoSourceValue::FixedStack: - return AMDGPUAS::PRIVATE_ADDRESS; - case PseudoSourceValue::ConstantPool: - case PseudoSourceValue::GOT: - case PseudoSourceValue::JumpTable: - case PseudoSourceValue::GlobalValueCallEntry: - case PseudoSourceValue::ExternalSymbolCallEntry: - case PseudoSourceValue::TargetCustom: - return AMDGPUAS::CONSTANT_ADDRESS; - } - - llvm_unreachable("Invalid pseudo source kind"); -} diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h index bc8a4786df77..f720e4656348 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -320,9 +320,6 @@ public: bool isRegisterLoad(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600InstrFlags::REGISTER_LOAD; } - - unsigned getAddressSpaceForPseudoSourceKind( - unsigned Kind) const override; }; namespace R600 { diff --git a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp index 1736c078eb83..0a96c643d9bd 100644 --- a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp @@ -1,4 +1,4 @@ -//===- AMDILCFGStructurizer.cpp - CFG Structurizer ------------------------===// +//===- R600MachineCFGStructurizer.cpp - CFG Structurizer ------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
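NOTE: From the diff --git line above onward, AMDILCFGStructurizer.cpp is renamed to R600MachineCFGStructurizer.cpp and the pass class is renamed to match; every remaining hunk in this file is the same identifier substitution (AMDGPUCFGStructurizer becomes R600MachineCFGStructurizer). The registration boilerplate keeps its shape, only the names change:

    class R600MachineCFGStructurizer : public MachineFunctionPass {
    public:
      static char ID;
      R600MachineCFGStructurizer() : MachineFunctionPass(ID) {
        initializeR600MachineCFGStructurizerPass(
            *PassRegistry::getPassRegistry());
      }
      // ... pass body unchanged from the old AMDGPUCFGStructurizer ...
    };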
@@ -10,6 +10,7 @@ #include "R600.h" #include "R600RegisterInfo.h" #include "R600Subtarget.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunction.h" @@ -42,7 +43,7 @@ STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); namespace llvm { -void initializeAMDGPUCFGStructurizerPass(PassRegistry &); +void initializeR600MachineCFGStructurizerPass(PassRegistry &); } // end namespace llvm @@ -89,7 +90,7 @@ public: // //===----------------------------------------------------------------------===// -class AMDGPUCFGStructurizer : public MachineFunctionPass { +class R600MachineCFGStructurizer : public MachineFunctionPass { public: using MBBVector = SmallVector<MachineBasicBlock *, 32>; using MBBInfoMap = std::map<MachineBasicBlock *, BlockInformation *>; @@ -103,8 +104,8 @@ public: static char ID; - AMDGPUCFGStructurizer() : MachineFunctionPass(ID) { - initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); + R600MachineCFGStructurizer() : MachineFunctionPass(ID) { + initializeR600MachineCFGStructurizerPass(*PassRegistry::getPassRegistry()); } StringRef getPassName() const override { @@ -317,16 +318,16 @@ private: } // end anonymous namespace -char AMDGPUCFGStructurizer::ID = 0; +char R600MachineCFGStructurizer::ID = 0; -int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { +int R600MachineCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); if (It == BlockInfoMap.end()) return INVALIDSCCNUM; return (*It).second->SccNum; } -MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) +MachineBasicBlock *R600MachineCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) const { LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep); if (It == LLInfoMap.end()) @@ -334,7 +335,7 @@ MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) return (*It).second; } -bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { +bool R600MachineCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { MachineLoop *LoopRep = MLI->getLoopFor(MBB); if (!LoopRep) return false; @@ -342,14 +343,14 @@ bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { return MBB->isSuccessor(LoopHeader); } -bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { +bool R600MachineCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); if (It == BlockInfoMap.end()) return false; return (*It).second->IsRetired; } -bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { +bool R600MachineCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { MachineLoop *LoopRep = MLI->getLoopFor(MBB); while (LoopRep && LoopRep->getHeader() == MBB) { MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep); @@ -362,7 +363,7 @@ bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { return false; } -AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo( +R600MachineCFGStructurizer::PathToKind R600MachineCFGStructurizer::singlePathTo( MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, bool AllowSideEntry) const { assert(DstMBB); @@ -380,7 +381,7 @@ AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo( return Not_SinglePath; } -int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, +int 
R600MachineCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, MBBVector::const_iterator E) const { int Count = 0; while (It != E) { @@ -391,7 +392,7 @@ int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, return Count; } -bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { +bool R600MachineCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { unsigned BlockSizeThreshold = 30; unsigned CloneInstrThreshold = 100; bool MultiplePreds = MBB && (MBB->pred_size() > 1); @@ -403,7 +404,7 @@ bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold)); } -void AMDGPUCFGStructurizer::reversePredicateSetter( +void R600MachineCFGStructurizer::reversePredicateSetter( MachineBasicBlock::iterator I, MachineBasicBlock &MBB) { assert(I.isValid() && "Expected valid iterator"); for (;; --I) { @@ -430,7 +431,7 @@ void AMDGPUCFGStructurizer::reversePredicateSetter( } } -void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, +void R600MachineCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode, const DebugLoc &DL) { MachineInstr *MI = MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); @@ -439,7 +440,7 @@ void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, SHOWNEWINSTR(MI); } -MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, +MachineInstr *R600MachineCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode, const DebugLoc &DL) { MachineInstr *MI = @@ -452,7 +453,7 @@ MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, return MI; } -MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore( +MachineInstr *R600MachineCFGStructurizer::insertInstrBefore( MachineBasicBlock::iterator I, int NewOpcode) { MachineInstr *OldMI = &(*I); MachineBasicBlock *MBB = OldMI->getParent(); @@ -464,7 +465,7 @@ MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore( return NewMBB; } -void AMDGPUCFGStructurizer::insertCondBranchBefore( +void R600MachineCFGStructurizer::insertCondBranchBefore( MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) { MachineInstr *OldMI = &(*I); MachineBasicBlock *MBB = OldMI->getParent(); @@ -477,7 +478,7 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore( //erase later oldInstr->eraseFromParent(); } -void AMDGPUCFGStructurizer::insertCondBranchBefore( +void R600MachineCFGStructurizer::insertCondBranchBefore( MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode, int RegNum, const DebugLoc &DL) { MachineFunction *MF = blk->getParent(); @@ -488,7 +489,7 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore( SHOWNEWINSTR(NewInstr); } -int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { +int R600MachineCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { switch(OldOpcode) { case R600::JUMP_COND: case R600::JUMP: return R600::IF_PREDICATE_SET; @@ -499,7 +500,7 @@ int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { return -1; } -int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { +int R600MachineCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { switch(OldOpcode) { case R600::JUMP_COND: case R600::JUMP: return R600::IF_PREDICATE_SET; @@ -510,7 +511,7 @@ int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { return -1; } -int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { +int R600MachineCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { 
switch(OldOpcode) { case R600::JUMP_COND: case R600::JUMP: return R600::CONTINUE_LOGICALNZ_i32; @@ -519,7 +520,7 @@ int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { return -1; } -int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { +int R600MachineCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { switch(OldOpcode) { case R600::JUMP_COND: case R600::JUMP: return R600::CONTINUE_LOGICALZ_i32; @@ -528,17 +529,17 @@ int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { return -1; } -MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) { +MachineBasicBlock *R600MachineCFGStructurizer::getTrueBranch(MachineInstr *MI) { return MI->getOperand(0).getMBB(); } -void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI, +void R600MachineCFGStructurizer::setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB) { MI->getOperand(0).setMBB(MBB); } MachineBasicBlock * -AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, +R600MachineCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, MachineInstr *MI) { assert(MBB->succ_size() == 2); MachineBasicBlock *TrueBranch = getTrueBranch(MI); @@ -548,7 +549,7 @@ AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, return (*It == TrueBranch) ? *Next : *It; } -bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { +bool R600MachineCFGStructurizer::isCondBranch(MachineInstr *MI) { switch (MI->getOpcode()) { case R600::JUMP_COND: case R600::BRANCH_COND_i32: @@ -559,7 +560,7 @@ bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { return false; } -bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { +bool R600MachineCFGStructurizer::isUncondBranch(MachineInstr *MI) { switch (MI->getOpcode()) { case R600::JUMP: case R600::BRANCH: @@ -570,7 +571,7 @@ bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { return false; } -DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { +DebugLoc R600MachineCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { //get DebugLoc from the first MachineBasicBlock instruction with debug info DebugLoc DL; for (MachineInstr &MI : *MBB) @@ -579,7 +580,7 @@ DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { return DL; } -MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( +MachineInstr *R600MachineCFGStructurizer::getNormalBlockBranchInstr( MachineBasicBlock *MBB) { MachineBasicBlock::reverse_iterator It = MBB->rbegin(); MachineInstr *MI = &*It; @@ -588,7 +589,7 @@ MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( return nullptr; } -MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( +MachineInstr *R600MachineCFGStructurizer::getLoopendBlockBranchInstr( MachineBasicBlock *MBB) { for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend(); It != E; ++It) { @@ -604,7 +605,7 @@ MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( return nullptr; } -MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { +MachineInstr *R600MachineCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { MachineBasicBlock::reverse_iterator It = MBB->rbegin(); if (It != MBB->rend()) { MachineInstr *instr = &(*It); @@ -614,7 +615,7 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { return nullptr; } -bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { +bool R600MachineCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { MachineInstr *MI = 
getReturnInstr(MBB); bool IsReturn = MBB->succ_empty(); if (MI) @@ -625,13 +626,13 @@ bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { return IsReturn; } -void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB, +void R600MachineCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB, MachineBasicBlock *SrcMBB) { for (MachineBasicBlock *Succ : SrcMBB->successors()) DstMBB->addSuccessor(Succ); // *iter's predecessor is also taken care of } -MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { +MachineBasicBlock *R600MachineCFGStructurizer::clone(MachineBasicBlock *MBB) { MachineFunction *Func = MBB->getParent(); MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock(); Func->push_back(NewMBB); //insert to function @@ -640,7 +641,7 @@ MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { return NewMBB; } -void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith( +void R600MachineCFGStructurizer::replaceInstrUseOfBlockWith( MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk) { MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB); @@ -649,7 +650,7 @@ void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith( setTrueBranch(BranchMI, NewBlk); } -void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { +void R600MachineCFGStructurizer::wrapup(MachineBasicBlock *MBB) { assert((!MBB->getParent()->getJumpTableInfo() || MBB->getParent()->getJumpTableInfo()->isEmpty()) && "found a jump table"); @@ -677,12 +678,12 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { // blocks in the jump table with the entryBlk //} } -bool AMDGPUCFGStructurizer::prepare() { +bool R600MachineCFGStructurizer::prepare() { bool Changed = false; //FIXME: if not reducible flow graph, make it so ??? - LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); + LLVM_DEBUG(dbgs() << "R600MachineCFGStructurizer::prepare\n";); orderBlocks(FuncRep); @@ -719,9 +720,9 @@ bool AMDGPUCFGStructurizer::prepare() { return Changed; } -bool AMDGPUCFGStructurizer::run() { +bool R600MachineCFGStructurizer::run() { //Assume reducible CFG... - LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); + LLVM_DEBUG(dbgs() << "R600MachineCFGStructurizer::run\n"); #ifdef STRESSTEST //Use the worse block ordering to test the algorithm. 
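NOTE: The next two hunks add (void) casts for NumIter and SccNumIter. Both counters are only read inside LLVM_DEBUG, which expands to nothing in NDEBUG builds, so without the casts release builds trip -Wunused-but-set-variable. The idiom, in isolation:

    unsigned NumIter = 0;
    ++NumIter;
    LLVM_DEBUG(dbgs() << "numIter = " << NumIter << "\n");
    (void)NumIter; // counts as a use even when LLVM_DEBUG compiles away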
@@ -740,6 +741,7 @@ bool AMDGPUCFGStructurizer::run() { ++NumIter; LLVM_DEBUG(dbgs() << "numIter = " << NumIter << ", numRemaintedBlk = " << NumRemainedBlk << "\n";); + (void)NumIter; SmallVectorImpl<MachineBasicBlock *>::const_iterator It = OrderedBlks.begin(); @@ -780,6 +782,7 @@ bool AMDGPUCFGStructurizer::run() { LLVM_DEBUG(dbgs() << "Can't reduce SCC " << getSCCNum(MBB) << ", sccNumIter = " << SccNumIter; dbgs() << "doesn't make any progress\n";); + (void)SccNumIter; ContNextScc = true; } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) { SccNumBlk = sccRemainedNumBlk; @@ -842,7 +845,7 @@ bool AMDGPUCFGStructurizer::run() { return true; } -void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { +void R600MachineCFGStructurizer::orderBlocks(MachineFunction *MF) { int SccNum = 0; for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd(); ++It, ++SccNum) { @@ -861,7 +864,7 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { } } -int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { +int R600MachineCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { int NumMatch = 0; int CurMatch; @@ -876,7 +879,7 @@ int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { return NumMatch; } -int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { +int R600MachineCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { int NumMatch = 0; NumMatch += loopendPatternMatch(); NumMatch += serialPatternMatch(MBB); @@ -884,7 +887,7 @@ int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { return NumMatch; } -int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { +int R600MachineCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { if (MBB->succ_size() != 1) return 0; @@ -897,7 +900,7 @@ int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { return 1; } -int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { +int R600MachineCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { //two edges if (MBB->succ_size() != 2) return 0; @@ -975,7 +978,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { return 1 + Cloned + NumMatch; } -int AMDGPUCFGStructurizer::loopendPatternMatch() { +int R600MachineCFGStructurizer::loopendPatternMatch() { std::deque<MachineLoop *> NestedLoops; for (auto &It: *MLI) for (MachineLoop *ML : depth_first(It)) @@ -1000,7 +1003,7 @@ int AMDGPUCFGStructurizer::loopendPatternMatch() { return Num; } -int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { +int R600MachineCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { MachineBasicBlock *LoopHeader = LoopRep->getHeader(); MBBVector ExitingMBBs; LoopRep->getExitingBlocks(ExitingMBBs); @@ -1041,7 +1044,7 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { return 1; } -bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( +bool R600MachineCFGStructurizer::isSameloopDetachedContbreak( MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) { if (Src1MBB->succ_empty()) { MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB); @@ -1058,7 +1061,7 @@ bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( return false; } -int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, +int R600MachineCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB); if (Num == 0) { @@ -1069,7 +1072,7 @@ int 
AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, return Num; } -int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, +int R600MachineCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { int Num = 0; MachineBasicBlock *DownBlk; @@ -1107,7 +1110,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, } #ifndef NDEBUG -void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( +void R600MachineCFGStructurizer::showImproveSimpleJumpintoIf( MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) { dbgs() << "head = BB" << HeadMBB->getNumber() @@ -1150,7 +1153,7 @@ void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( } #endif -int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, +int R600MachineCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock **LandMBBPtr) { bool MigrateTrue = false; @@ -1322,7 +1325,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, return NumNewBlk; } -void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, +void R600MachineCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, MachineBasicBlock *SrcMBB) { LLVM_DEBUG(dbgs() << "serialPattern BB" << DstMBB->getNumber() << " <= BB" << SrcMBB->getNumber() << "\n";); @@ -1336,7 +1339,7 @@ void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, retireBlock(SrcMBB); } -void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, +void R600MachineCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) { assert (TrueMBB); @@ -1392,7 +1395,7 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, MBB->addSuccessor(LandMBB); } -void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, +void R600MachineCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, MachineBasicBlock *LandMBB) { LLVM_DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber() << " land = BB" << LandMBB->getNumber() << "\n";); @@ -1402,7 +1405,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, DstBlk->replaceSuccessor(DstBlk, LandMBB); } -void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, +void R600MachineCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, MachineBasicBlock *LandMBB) { LLVM_DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber() << " land = BB" @@ -1423,7 +1426,7 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, ExitingMBB->removeSuccessor(LandMBB, true); } -void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, +void R600MachineCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, MachineBasicBlock *ContMBB) { LLVM_DEBUG(dbgs() << "settleLoopcontBlock conting = BB" << ContingMBB->getNumber() << ", cont = BB" @@ -1466,7 +1469,7 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, } } -int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, +int R600MachineCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) { int 
Cloned = 0; assert(PreMBB->isSuccessor(SrcMBB)); @@ -1485,10 +1488,9 @@ int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, } MachineBasicBlock * -AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, +R600MachineCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, MachineBasicBlock *PredMBB) { - assert(PredMBB->isSuccessor(MBB) && - "succBlk is not a prececessor of curBlk"); + assert(PredMBB->isSuccessor(MBB) && "succBlk is not a predecessor of curBlk"); MachineBasicBlock *CloneMBB = clone(MBB); //clone instructions replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); @@ -1510,7 +1512,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, return CloneMBB; } -void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, +void R600MachineCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) { MachineBasicBlock::iterator SpliceEnd; //look for the input branchinstr, not the AMDGPU branchinstr @@ -1535,7 +1537,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, } MachineBasicBlock * -AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { +R600MachineCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { MachineBasicBlock *LoopHeader = LoopRep->getHeader(); MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch(); @@ -1555,7 +1557,7 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { return nullptr; } -void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { +void R600MachineCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { MachineInstr *BranchMI; // I saw two unconditional branch in one basic block in example @@ -1567,7 +1569,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { } } -void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( +void R600MachineCFGStructurizer::removeRedundantConditionalBranch( MachineBasicBlock *MBB) { if (MBB->succ_size() != 2) return; @@ -1584,7 +1586,7 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( MBB->removeSuccessor(MBB1, true); } -void AMDGPUCFGStructurizer::addDummyExitBlock( +void R600MachineCFGStructurizer::addDummyExitBlock( SmallVectorImpl<MachineBasicBlock*> &RetMBB) { MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); FuncRep->push_back(DummyExitBlk); //insert to function @@ -1600,12 +1602,12 @@ void AMDGPUCFGStructurizer::addDummyExitBlock( SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: "); } -void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) { +void R600MachineCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) { while (MBB->succ_size()) MBB->removeSuccessor(*MBB->succ_begin()); } -void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, +void R600MachineCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, int SccNum) { BlockInformation *&srcBlkInfo = BlockInfoMap[MBB]; if (!srcBlkInfo) @@ -1613,7 +1615,7 @@ void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, srcBlkInfo->SccNum = SccNum; } -void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { +void R600MachineCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Retiring BB" << MBB->getNumber() << "\n";); BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB]; @@ -1625,14 +1627,14 @@ void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { assert(MBB->succ_empty() && 
MBB->pred_empty() && "can't retire block yet"); } -INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer", +INITIALIZE_PASS_BEGIN(R600MachineCFGStructurizer, "amdgpustructurizer", "AMDGPU CFG Structurizer", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer", +INITIALIZE_PASS_END(R600MachineCFGStructurizer, "amdgpustructurizer", "AMDGPU CFG Structurizer", false, false) -FunctionPass *llvm::createAMDGPUCFGStructurizerPass() { - return new AMDGPUCFGStructurizer(); +FunctionPass *llvm::createR600MachineCFGStructurizerPass() { + return new R600MachineCFGStructurizer(); } diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index fbe2a1cd9fba..59e274787590 100644 --- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -207,7 +207,7 @@ public: return !ARDef || !ARUse; } - // isLegalToPruneDependencies - Is it legal to prune dependece between SUI + // isLegalToPruneDependencies - Is it legal to prune dependency between SUI // and SUJ. bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { return false; diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.cpp b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp index 20c1ce7266dd..d8f061054904 100644 --- a/llvm/lib/Target/AMDGPU/R600Subtarget.cpp +++ b/llvm/lib/Target/AMDGPU/R600Subtarget.cpp @@ -27,8 +27,6 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, : R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), AMDGPUSubtarget(TT), InstrInfo(*this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), - FMA(false), CaymanISA(false), CFALUBug(false), HasVertexCache(false), - R600ALUInst(false), FP64(false), TexVTXClauseSize(0), Gen(R600), TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)), InstrItins(getInstrItineraryForCPU(GPU)) {} diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.h b/llvm/lib/Target/AMDGPU/R600Subtarget.h index 92d559b1f8e6..c3d002f29272 100644 --- a/llvm/lib/Target/AMDGPU/R600Subtarget.h +++ b/llvm/lib/Target/AMDGPU/R600Subtarget.h @@ -31,14 +31,14 @@ class R600Subtarget final : public R600GenSubtargetInfo, private: R600InstrInfo InstrInfo; R600FrameLowering FrameLowering; - bool FMA; - bool CaymanISA; - bool CFALUBug; - bool HasVertexCache; - bool R600ALUInst; - bool FP64; - short TexVTXClauseSize; - Generation Gen; + bool FMA = false; + bool CaymanISA = false; + bool CFALUBug = false; + bool HasVertexCache = false; + bool R600ALUInst = false; + bool FP64 = false; + short TexVTXClauseSize = 0; + Generation Gen = R600; R600TargetLowering TLInfo; InstrItineraryData InstrItins; SelectionDAGTargetInfo TSInfo; diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp index 39dad45425fc..76bb0f65ef69 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp @@ -83,7 +83,7 @@ R600TargetMachine::getSubtargetImpl(const Function &F) const { } TargetTransformInfo -R600TargetMachine::getTargetTransformInfo(const Function &F) { +R600TargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(R600TTIImpl(this, F)); } @@ -131,7 +131,7 @@ void R600PassConfig::addPreSched2() { } void R600PassConfig::addPreEmitPass() { - addPass(createAMDGPUCFGStructurizerPass()); + 
addPass(createR600MachineCFGStructurizerPass()); addPass(createR600ExpandSpecialInstrsPass()); addPass(&FinalizeMachineBundlesID); addPass(createR600Packetizer()); diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.h b/llvm/lib/Target/AMDGPU/R600TargetMachine.h index 0ccbca3c68b1..8d20841292b9 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetMachine.h +++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.h @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// The AMDGPU TargetMachine interface definition for hw codgen targets. +/// The AMDGPU TargetMachine interface definition for hw codegen targets. // //===----------------------------------------------------------------------===// @@ -38,7 +38,7 @@ public: const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override; - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; bool isMachineVerifierClean() const override { return false; } }; diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index b81fac36fc95..afd2a38b11ec 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -73,19 +73,19 @@ class SIAnnotateControlFlow : public FunctionPass { bool hasKill(const BasicBlock *BB); - void eraseIfUnused(PHINode *Phi); + bool eraseIfUnused(PHINode *Phi); - void openIf(BranchInst *Term); + bool openIf(BranchInst *Term); - void insertElse(BranchInst *Term); + bool insertElse(BranchInst *Term); Value * handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term); - void handleLoop(BranchInst *Term); + bool handleLoop(BranchInst *Term); - void closeControlFlow(BasicBlock *BB); + bool closeControlFlow(BasicBlock *BB); public: static char ID; @@ -193,31 +193,34 @@ bool SIAnnotateControlFlow::hasKill(const BasicBlock *BB) { return false; } -// Erase "Phi" if it is not used any more -void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { - if (RecursivelyDeleteDeadPHINode(Phi)) { +// Erase "Phi" if it is not used any more. Return true if any change was made. 
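NOTE: The SIAnnotateControlFlow hunks that follow change the annotation helpers from void to bool so runOnFunction can report whether the IR was actually modified, instead of unconditionally returning true. Each helper returns false on its early-exit paths (uniform branches, nothing to erase) and true once it rewrites something; the caller just accumulates. In outline:

    bool Changed = false;
    // ... depth-first walk over the function's blocks ...
    Changed |= openIf(Term);   // false when the branch is uniform
    // ...
    return Changed;            // accurate change reporting to the pass manager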
+bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { + bool Changed = RecursivelyDeleteDeadPHINode(Phi); + if (Changed) LLVM_DEBUG(dbgs() << "Erased unused condition phi\n"); - } + return Changed; } /// Open a new "If" block -void SIAnnotateControlFlow::openIf(BranchInst *Term) { +bool SIAnnotateControlFlow::openIf(BranchInst *Term) { if (isUniform(Term)) - return; + return false; Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); + return true; } /// Close the last "If" block and open a new "Else" block -void SIAnnotateControlFlow::insertElse(BranchInst *Term) { +bool SIAnnotateControlFlow::insertElse(BranchInst *Term) { if (isUniform(Term)) { - return; + return false; } Value *Ret = CallInst::Create(Else, popSaved(), "", Term); Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); + return true; } /// Recursively handle the condition leading to a loop @@ -255,14 +258,14 @@ Value *SIAnnotateControlFlow::handleLoopCondition( } /// Handle a back edge (loop) -void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { +bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) { if (isUniform(Term)) - return; + return false; BasicBlock *BB = Term->getParent(); llvm::Loop *L = LI->getLoopFor(BB); if (!L) - return; + return false; BasicBlock *Target = Term->getSuccessor(1); PHINode *Broken = PHINode::Create(IntMask, 0, "phi.broken", &Target->front()); @@ -286,10 +289,12 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); push(Term->getSuccessor(0), Arg); + + return true; } /// Close the last opened control flow -void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { +bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { llvm::Loop *L = LI->getLoopFor(BB); assert(Stack.back().first == BB); @@ -322,6 +327,8 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { } CallInst::Create(EndCf, Exec, "", FirstInsertionPt); } + + return true; } /// Annotate the control flow with intrinsics so the backend can @@ -333,6 +340,7 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); const TargetMachine &TM = TPC.getTM<TargetMachine>(); + bool Changed = false; initialize(*F.getParent(), TM.getSubtarget<GCNSubtarget>(F)); for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()), E = df_end(&F.getEntryBlock()); I != E; ++I) { @@ -341,32 +349,32 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { if (!Term || Term->isUnconditional()) { if (isTopOfStack(BB)) - closeControlFlow(BB); + Changed |= closeControlFlow(BB); continue; } if (I.nodeVisited(Term->getSuccessor(1))) { if (isTopOfStack(BB)) - closeControlFlow(BB); + Changed |= closeControlFlow(BB); if (DT->dominates(Term->getSuccessor(1), BB)) - handleLoop(Term); + Changed |= handleLoop(Term); continue; } if (isTopOfStack(BB)) { PHINode *Phi = dyn_cast<PHINode>(Term->getCondition()); if (Phi && Phi->getParent() == BB && isElse(Phi) && !hasKill(BB)) { - insertElse(Term); - eraseIfUnused(Phi); + Changed |= insertElse(Term); + Changed |= eraseIfUnused(Phi); continue; } - closeControlFlow(BB); + Changed |= closeControlFlow(BB); } - openIf(Term); + Changed |= openIf(Term); } if (!Stack.empty()) { @@ -374,7 +382,7 @@ bool 
SIAnnotateControlFlow::runOnFunction(Function &F) { report_fatal_error("failed to annotate CFG"); } - return true; + return Changed; } /// Create the annotation pass diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 107ee5ed5532..85930312352b 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -63,6 +63,12 @@ enum : uint64_t { VGPRSpill = 1 << 24, SGPRSpill = 1 << 25, + // LDSDIR instruction format. + LDSDIR = 1 << 26, + + // VINTERP instruction format. + VINTERP = 1 << 27, + // High bits - other information. VM_CNT = UINT64_C(1) << 32, EXP_CNT = UINT64_C(1) << 33, @@ -120,7 +126,10 @@ enum : uint64_t { IsAtomicNoRet = UINT64_C(1) << 57, // Atomic with return. - IsAtomicRet = UINT64_C(1) << 58 + IsAtomicRet = UINT64_C(1) << 58, + + // Is a WMMA instruction. + IsWMMA = UINT64_C(1) << 59, }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. @@ -258,9 +267,10 @@ namespace AMDGPUAsmVariants { VOP3 = 1, SDWA = 2, SDWA9 = 3, - DPP = 4 + DPP = 4, + VOP3_DPP = 5 }; -} +} // namespace AMDGPUAsmVariants namespace AMDGPU { namespace EncValues { // Encoding values of enum9/8/7 operands @@ -280,7 +290,8 @@ enum : unsigned { INLINE_FLOATING_C_MAX = 248, LITERAL_CONST = 255, VGPR_MIN = 256, - VGPR_MAX = 511 + VGPR_MAX = 511, + IS_VGPR = 256 // Indicates VGPR or AGPR }; } // namespace EncValues @@ -294,6 +305,9 @@ enum CPol { SLC = 2, DLC = 4, SCC = 16, + SC0 = GLC, + SC1 = SCC, + NT = SLC, ALL = GLC | SLC | DLC | SCC }; @@ -302,24 +316,33 @@ enum CPol { namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns. enum Id { // Message ID, width(4) [3:0]. - ID_UNKNOWN_ = -1, ID_INTERRUPT = 1, - ID_GS = 2, - ID_GS_DONE = 3, - ID_SAVEWAVE = 4, // added in GFX8 + + ID_GS_PreGFX11 = 2, // replaced in GFX11 + ID_GS_DONE_PreGFX11 = 3, // replaced in GFX11 + + ID_HS_TESSFACTOR_GFX11Plus = 2, // reused in GFX11 + ID_DEALLOC_VGPRS_GFX11Plus = 3, // reused in GFX11 + + ID_SAVEWAVE = 4, // added in GFX8, removed in GFX11 ID_STALL_WAVE_GEN = 5, // added in GFX9 ID_HALT_WAVES = 6, // added in GFX9 ID_ORDERED_PS_DONE = 7, // added in GFX9 ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10 ID_GS_ALLOC_REQ = 9, // added in GFX9 - ID_GET_DOORBELL = 10, // added in GFX9 - ID_GET_DDID = 11, // added in GFX10 + ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11 + ID_GET_DDID = 11, // added in GFX10, removed in GFX11 ID_SYSMSG = 15, - ID_GAPS_LAST_, // Indicate that sequence has gaps. - ID_GAPS_FIRST_ = ID_INTERRUPT, - ID_SHIFT_ = 0, - ID_WIDTH_ = 4, - ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) + + ID_RTN_GET_DOORBELL = 128, + ID_RTN_GET_DDID = 129, + ID_RTN_GET_TMA = 130, + ID_RTN_GET_REALTIME = 131, + ID_RTN_SAVE_WAVE = 132, + ID_RTN_GET_TBA = 133, + + ID_MASK_PreGFX11_ = 0xF, + ID_MASK_GFX11Plus_ = 0xFF }; enum Op { // Both GS and SYS operation IDs. @@ -360,8 +383,6 @@ enum StreamId : unsigned { // Stream ID, (2) [9:8]. namespace Hwreg { // Encoding of SIMM16 used in s_setreg/getreg* insns. enum Id { // HwRegCode, (6) [5:0] - ID_UNKNOWN_ = -1, - ID_SYMBOLIC_FIRST_ = 1, // There are corresponding symbolic names defined. 
ID_MODE = 1, ID_STATUS = 2, ID_TRAPSTS = 3, @@ -370,12 +391,15 @@ enum Id { // HwRegCode, (6) [5:0] ID_LDS_ALLOC = 6, ID_IB_STS = 7, ID_MEM_BASES = 15, - ID_SYMBOLIC_FIRST_GFX9_ = ID_MEM_BASES, ID_TBA_LO = 16, - ID_SYMBOLIC_FIRST_GFX10_ = ID_TBA_LO, ID_TBA_HI = 17, ID_TMA_LO = 18, ID_TMA_HI = 19, + ID_XCC_ID = 20, + ID_SQ_PERF_SNAPSHOT_DATA = 21, + ID_SQ_PERF_SNAPSHOT_DATA1 = 22, + ID_SQ_PERF_SNAPSHOT_PC_LO = 23, + ID_SQ_PERF_SNAPSHOT_PC_HI = 24, ID_FLAT_SCR_LO = 20, ID_FLAT_SCR_HI = 21, ID_XNACK_MASK = 22, @@ -383,8 +407,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_HW_ID2 = 24, ID_POPS_PACKER = 25, ID_SHADER_CYCLES = 29, - ID_SYMBOLIC_FIRST_GFX1030_ = ID_SHADER_CYCLES, - ID_SYMBOLIC_LAST_ = 30, + ID_SHIFT_ = 0, ID_WIDTH_ = 6, ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) @@ -503,6 +526,15 @@ enum MergedFormat : int64_t { DFMT_NFMT_MAX = DFMT_NFMT_MASK }; +enum UnifiedFormatCommon : int64_t { + UFMT_MAX = 127, + UFMT_UNDEF = -1, + UFMT_DEFAULT = 1 +}; + +} // namespace MTBUFFormat + +namespace UfmtGFX10 { enum UnifiedFormat : int64_t { UFMT_INVALID = 0, @@ -598,14 +630,95 @@ enum UnifiedFormat : int64_t { UFMT_FIRST = UFMT_INVALID, UFMT_LAST = UFMT_32_32_32_32_FLOAT, +}; - UFMT_MAX = 127, +} // namespace UfmtGFX10 - UFMT_UNDEF = -1, - UFMT_DEFAULT = UFMT_8_UNORM +namespace UfmtGFX11 { +enum UnifiedFormat : int64_t { + UFMT_INVALID = 0, + + UFMT_8_UNORM, + UFMT_8_SNORM, + UFMT_8_USCALED, + UFMT_8_SSCALED, + UFMT_8_UINT, + UFMT_8_SINT, + + UFMT_16_UNORM, + UFMT_16_SNORM, + UFMT_16_USCALED, + UFMT_16_SSCALED, + UFMT_16_UINT, + UFMT_16_SINT, + UFMT_16_FLOAT, + + UFMT_8_8_UNORM, + UFMT_8_8_SNORM, + UFMT_8_8_USCALED, + UFMT_8_8_SSCALED, + UFMT_8_8_UINT, + UFMT_8_8_SINT, + + UFMT_32_UINT, + UFMT_32_SINT, + UFMT_32_FLOAT, + + UFMT_16_16_UNORM, + UFMT_16_16_SNORM, + UFMT_16_16_USCALED, + UFMT_16_16_SSCALED, + UFMT_16_16_UINT, + UFMT_16_16_SINT, + UFMT_16_16_FLOAT, + + UFMT_10_11_11_FLOAT, + + UFMT_11_11_10_FLOAT, + + UFMT_10_10_10_2_UNORM, + UFMT_10_10_10_2_SNORM, + UFMT_10_10_10_2_UINT, + UFMT_10_10_10_2_SINT, + + UFMT_2_10_10_10_UNORM, + UFMT_2_10_10_10_SNORM, + UFMT_2_10_10_10_USCALED, + UFMT_2_10_10_10_SSCALED, + UFMT_2_10_10_10_UINT, + UFMT_2_10_10_10_SINT, + + UFMT_8_8_8_8_UNORM, + UFMT_8_8_8_8_SNORM, + UFMT_8_8_8_8_USCALED, + UFMT_8_8_8_8_SSCALED, + UFMT_8_8_8_8_UINT, + UFMT_8_8_8_8_SINT, + + UFMT_32_32_UINT, + UFMT_32_32_SINT, + UFMT_32_32_FLOAT, + + UFMT_16_16_16_16_UNORM, + UFMT_16_16_16_16_SNORM, + UFMT_16_16_16_16_USCALED, + UFMT_16_16_16_16_SSCALED, + UFMT_16_16_16_16_UINT, + UFMT_16_16_16_16_SINT, + UFMT_16_16_16_16_FLOAT, + + UFMT_32_32_32_UINT, + UFMT_32_32_32_SINT, + UFMT_32_32_32_FLOAT, + UFMT_32_32_32_32_UINT, + UFMT_32_32_32_32_SINT, + UFMT_32_32_32_32_FLOAT, + + UFMT_FIRST = UFMT_INVALID, + UFMT_LAST = UFMT_32_32_32_32_FLOAT, }; -} // namespace MTBUFFormat +} // namespace UfmtGFX11 namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32. 
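NOTE: The unified-format table that previously lived inside MTBUFFormat is split above into UfmtGFX10 and UfmtGFX11 namespaces, leaving only the generation-independent constants (UFMT_MAX, UFMT_UNDEF, UFMT_DEFAULT) in the shared enum. Callers must now pick the table matching the subtarget; a hypothetical lookup (the helper name is illustrative, not part of this patch):

    // Illustrative only: the two generations no longer share one encoding.
    int64_t invalidUfmt(bool IsGFX11Plus) {
      return IsGFX11Plus ? AMDGPU::UfmtGFX11::UFMT_INVALID
                         : AMDGPU::UfmtGFX10::UFMT_INVALID;
    }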
@@ -746,20 +859,23 @@ enum Target : unsigned { ET_MRT0 = 0, ET_MRT7 = 7, ET_MRTZ = 8, - ET_NULL = 9, + ET_NULL = 9, // Pre-GFX11 ET_POS0 = 12, ET_POS3 = 15, - ET_POS4 = 16, // GFX10+ - ET_POS_LAST = ET_POS4, // Highest pos used on any subtarget - ET_PRIM = 20, // GFX10+ - ET_PARAM0 = 32, - ET_PARAM31 = 63, + ET_POS4 = 16, // GFX10+ + ET_POS_LAST = ET_POS4, // Highest pos used on any subtarget + ET_PRIM = 20, // GFX10+ + ET_DUAL_SRC_BLEND0 = 21, // GFX11+ + ET_DUAL_SRC_BLEND1 = 22, // GFX11+ + ET_PARAM0 = 32, // Pre-GFX11 + ET_PARAM31 = 63, // Pre-GFX11 ET_NULL_MAX_IDX = 0, ET_MRTZ_MAX_IDX = 0, ET_PRIM_MAX_IDX = 0, ET_MRT_MAX_IDX = 7, ET_POS_MAX_IDX = 4, + ET_DUAL_SRC_BLEND_MAX_IDX = 1, ET_PARAM_MAX_IDX = 31, ET_INVALID = 255, @@ -777,6 +893,18 @@ enum OpSel : uint64_t { } // namespace VOP3PEncoding +namespace ImplicitArg { +// Implicit kernel argument offset for code object version 5. +enum Offset_COV5 : unsigned { + HOSTCALL_PTR_OFFSET = 80, + MULTIGRID_SYNC_ARG_OFFSET = 88, + HEAP_PTR_OFFSET = 96, + PRIVATE_BASE_OFFSET = 192, + SHARED_BASE_OFFSET = 196, + QUEUE_PTR_OFFSET = 200, +}; + +} // namespace ImplicitArg } // namespace AMDGPU #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 @@ -911,10 +1039,12 @@ enum OpSel : uint64_t { #define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) #define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 -#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) +#define S_00B860_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12) +#define S_00B860_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12) #define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 -#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) +#define S_0286E8_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12) +#define S_0286E8_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12) #define R_028B54_VGT_SHADER_STAGES_EN 0x028B54 #define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 33954e11d6c6..99aa8a60b04f 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -92,7 +92,7 @@ public: bool tryFoldCndMask(MachineInstr &MI) const; bool tryFoldZeroHighBits(MachineInstr &MI) const; - void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; + bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; const MachineOperand *isClamp(const MachineInstr &MI) const; bool tryFoldClamp(MachineInstr &MI); @@ -146,30 +146,6 @@ static unsigned macToMad(unsigned Opc) { return AMDGPU::INSTRUCTION_LIST_END; } -// Wrapper around isInlineConstant that understands special cases when -// instruction types are replaced during operand folding. -static bool isInlineConstantIfFolded(const SIInstrInfo *TII, - const MachineInstr &UseMI, - unsigned OpNo, - const MachineOperand &OpToFold) { - if (TII->isInlineConstant(UseMI, OpNo, OpToFold)) - return true; - - unsigned Opc = UseMI.getOpcode(); - unsigned NewOpc = macToMad(Opc); - if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) { - // Special case for mac. Since this is replaced with mad when folded into - // src2, we need to check the legality for the final instruction. 
- int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); - if (static_cast<int>(OpNo) == Src2Idx) { - const MCInstrDesc &MadDesc = TII->get(NewOpc); - return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); - } - } - - return false; -} - // TODO: Add heuristic that the frame index might not fit in the addressing mode // immediate offset to avoid materializing in loops. static bool frameIndexMayFold(const SIInstrInfo *TII, @@ -210,6 +186,8 @@ static bool updateOperand(FoldCandidate &Fold, if (Fold.isImm()) { if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked && !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) && + (!ST.hasDOTOpSelHazard() || + !(MI->getDesc().TSFlags & SIInstrFlags::IsDOT)) && AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST.hasInv2PiInlineImm())) { // Set op_sel/op_sel_hi on this operand or bail out if op_sel is @@ -289,7 +267,7 @@ static bool updateOperand(FoldCandidate &Fold, // when looking at a use. Dst0.setReg(NewReg0); for (unsigned I = MI->getNumOperands() - 1; I > 0; --I) - MI->RemoveOperand(I); + MI->removeOperand(I); MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF)); if (Fold.isCommuted()) @@ -490,6 +468,8 @@ static bool isUseSafeToFold(const SIInstrInfo *TII, case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::V_MOV_B64_e32: + case AMDGPU::V_MOV_B64_e64: // Do not fold into an indirect mov. return !MI.hasRegisterImplicitUseOperand(AMDGPU::M0); } @@ -675,7 +655,9 @@ void SIFoldOperands::foldOperand( if (TII->isFLATScratch(*UseMI) && AMDGPU::getNamedOperandIdx(UseMI->getOpcode(), - AMDGPU::OpName::vaddr) != -1) { + AMDGPU::OpName::vaddr) != -1 && + AMDGPU::getNamedOperandIdx(UseMI->getOpcode(), + AMDGPU::OpName::saddr) == -1) { unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(UseMI->getOpcode()); UseMI->setDesc(TII->get(NewOpc)); } @@ -739,7 +721,7 @@ void SIFoldOperands::foldOperand( while (ImpOpI != ImpOpE) { MachineInstr::mop_iterator Tmp = ImpOpI; ImpOpI++; - UseMI->RemoveOperand(UseMI->getOperandNo(Tmp)); + UseMI->removeOperand(UseMI->getOperandNo(Tmp)); } CopiesToReplace.push_back(UseMI); } else { @@ -768,7 +750,7 @@ void SIFoldOperands::foldOperand( UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE)); for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I) - UseMI->RemoveOperand(I); + UseMI->removeOperand(I); MachineInstrBuilder B(*MBB.getParent(), UseMI); DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies; @@ -871,7 +853,7 @@ void SIFoldOperands::foldOperand( UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm()); else UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex()); - UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) + UseMI->removeOperand(2); // Remove exec read (or src1 for readlane) return; } @@ -890,7 +872,7 @@ void SIFoldOperands::foldOperand( UseMI->getOperand(1).setReg(OpToFold.getReg()); UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); UseMI->getOperand(1).setIsKill(false); - UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) + UseMI->removeOperand(2); // Remove exec read (or src1 for readlane) return; } } @@ -906,6 +888,22 @@ void SIFoldOperands::foldOperand( } if (!FoldingImmLike) { + if (OpToFold.isReg() && ST->needsAlignedVGPRs()) { + // Don't fold if OpToFold doesn't hold an aligned register. 
+ const TargetRegisterClass *RC = + TRI->getRegClassForReg(*MRI, OpToFold.getReg()); + if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) { + unsigned SubReg = OpToFold.getSubReg(); + const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg); + RC = TRI->getCompatibleSubRegClass(RC, SubRC, SubReg); + if (RC) + RC = SubRC; + } + + if (!RC || !TRI->isProperlyAlignedRC(*RC)) + return; + } + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII); // FIXME: We could try to change the instruction from 64-bit to 32-bit @@ -1025,7 +1023,7 @@ static void stripExtraCopyOperands(MachineInstr &MI) { Desc.getNumImplicitDefs(); for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I) - MI.RemoveOperand(I); + MI.removeOperand(I); } static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) { @@ -1093,7 +1091,7 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII, // Be careful to change the right operand, src0 may belong to a different // instruction. MI->getOperand(Src0Idx).ChangeToImmediate(NewImm); - MI->RemoveOperand(Src1Idx); + MI->removeOperand(Src1Idx); mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR))); return true; } @@ -1112,11 +1110,11 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII, Opc == AMDGPU::S_OR_B32) { if (Src1Val == 0) { // y = or x, 0 => y = copy x - MI->RemoveOperand(Src1Idx); + MI->removeOperand(Src1Idx); mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); } else if (Src1Val == -1) { // y = or x, -1 => y = v_mov_b32 -1 - MI->RemoveOperand(Src1Idx); + MI->removeOperand(Src1Idx); mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32))); } else return false; @@ -1129,11 +1127,11 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII, MI->getOpcode() == AMDGPU::S_AND_B32) { if (Src1Val == 0) { // y = and x, 0 => y = v_mov_b32 0 - MI->RemoveOperand(Src0Idx); + MI->removeOperand(Src0Idx); mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32))); } else if (Src1Val == -1) { // y = and x, -1 => y = copy x - MI->RemoveOperand(Src1Idx); + MI->removeOperand(Src1Idx); mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); stripExtraCopyOperands(*MI); } else @@ -1147,7 +1145,7 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII, MI->getOpcode() == AMDGPU::S_XOR_B32) { if (Src1Val == 0) { // y = xor x, 0 => y = copy x - MI->RemoveOperand(Src1Idx); + MI->removeOperand(Src1Idx); mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); return true; } @@ -1185,12 +1183,12 @@ bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const { TII->get(Src0->isReg() ? 
(unsigned)AMDGPU::COPY : getMovOpc(false)); int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (Src2Idx != -1) - MI.RemoveOperand(Src2Idx); - MI.RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1)); + MI.removeOperand(Src2Idx); + MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1)); if (Src1ModIdx != -1) - MI.RemoveOperand(Src1ModIdx); + MI.removeOperand(Src1ModIdx); if (Src0ModIdx != -1) - MI.RemoveOperand(Src0ModIdx); + MI.removeOperand(Src0ModIdx); mutateCopyOp(MI, NewDesc); LLVM_DEBUG(dbgs() << MI); return true; @@ -1217,7 +1215,7 @@ bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const { return false; } -void SIFoldOperands::foldInstOperand(MachineInstr &MI, +bool SIFoldOperands::foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const { // We need mutate the operands of new mov instructions to add implicit // uses of EXEC, but adding them invalidates the use_iterator, so defer @@ -1225,6 +1223,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, SmallVector<MachineInstr *, 4> CopiesToReplace; SmallVector<FoldCandidate, 4> FoldList; MachineOperand &Dst = MI.getOperand(0); + bool Changed = false; if (OpToFold.isImm()) { for (auto &UseMI : @@ -1237,66 +1236,25 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // We may also encounter cases where one or both operands are // immediates materialized into a register, which would ordinarily not // be folded due to multiple uses or operand constraints. - if (tryConstantFoldOp(*MRI, TII, &UseMI)) + if (tryConstantFoldOp(*MRI, TII, &UseMI)) { LLVM_DEBUG(dbgs() << "Constant folded " << UseMI); - } - } - - bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); - if (FoldingImm) { - unsigned NumLiteralUses = 0; - MachineOperand *NonInlineUse = nullptr; - int NonInlineUseOpNo = -1; - - for (auto &Use : - make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) { - MachineInstr *UseMI = Use.getParent(); - unsigned OpNo = UseMI->getOperandNo(&Use); - - // Try to fold any inline immediate uses, and then only fold other - // constants if they have one use. - // - // The legality of the inline immediate must be checked based on the use - // operand, not the defining instruction, because 32-bit instructions - // with 32-bit inline immediate sources may be used to materialize - // constants used in 16-bit operands. - // - // e.g. it is unsafe to fold: - // s_mov_b32 s0, 1.0 // materializes 0x3f800000 - // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 - - // Folding immediates with more than one use will increase program size. - // FIXME: This will also reduce register usage, which may be better - // in some cases. A better heuristic is needed. - if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); - } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); - } else { - if (++NumLiteralUses == 1) { - NonInlineUse = &Use; - NonInlineUseOpNo = OpNo; - } + Changed = true; } } + } - if (NumLiteralUses == 1) { - MachineInstr *UseMI = NonInlineUse->getParent(); - foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace); - } - } else { - // Folding register. 
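Alongside the operand work, this hunk starts the pass's change-reporting cleanup: foldInstOperand goes from void to bool, a local Changed flag is introduced, and (at the end of the file) runOnMachineFunction stops unconditionally returning true. A minimal sketch of the pattern, where tryFoldSomething is a hypothetical stand-in for the per-instruction helpers (tryFoldCndMask, tryFoldZeroHighBits, ...) that now return bool:

    // Accumulate a Changed bit so the pass only reports modification, and
    // the pass manager only invalidates analyses, when work was done.
    bool Changed = false;
    for (MachineBasicBlock &MBB : MF)
      for (MachineInstr &MI : llvm::make_early_inc_range(MBB))
        Changed |= tryFoldSomething(MI); // hypothetical stand-in
    return Changed;

The removed register-only branch continues below; its replacement walks one unified use list for both immediate and register folds.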
- SmallVector <MachineOperand *, 4> UsesToProcess; - for (auto &Use : MRI->use_nodbg_operands(Dst.getReg())) - UsesToProcess.push_back(&Use); - for (auto U : UsesToProcess) { - MachineInstr *UseMI = U->getParent(); - - foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), - FoldList, CopiesToReplace); - } + SmallVector<MachineOperand *, 4> UsesToProcess; + for (auto &Use : MRI->use_nodbg_operands(Dst.getReg())) + UsesToProcess.push_back(&Use); + for (auto U : UsesToProcess) { + MachineInstr *UseMI = U->getParent(); + foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList, + CopiesToReplace); } + if (CopiesToReplace.empty() && FoldList.empty()) + return Changed; + MachineFunction *MF = MI.getParent()->getParent(); // Make sure we add EXEC uses to any new v_mov instructions created. for (MachineInstr *Copy : CopiesToReplace) @@ -1328,6 +1286,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, TII->commuteInstruction(*Fold.UseMI, false); } } + return true; } // Clamp patterns are canonically selected to v_max_* instructions, so only @@ -1593,8 +1552,9 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) { unsigned OpIdx = Op - &UseMI->getOperand(0); const MCInstrDesc &InstDesc = UseMI->getDesc(); - if (!TRI->isVectorSuperClass( - TRI->getRegClass(InstDesc.OpInfo[OpIdx].RegClass))) + const TargetRegisterClass *OpRC = + TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF()); + if (!OpRC || !TRI->isVectorSuperClass(OpRC)) return false; const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)); @@ -1751,22 +1711,31 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { bool IsIEEEMode = MFI->getMode().IEEE; bool HasNSZ = MFI->hasNoSignedZerosFPMath(); + bool Changed = false; for (MachineBasicBlock *MBB : depth_first(&MF)) { MachineOperand *CurrentKnownM0Val = nullptr; for (auto &MI : make_early_inc_range(*MBB)) { - tryFoldCndMask(MI); + Changed |= tryFoldCndMask(MI); - if (tryFoldZeroHighBits(MI)) + if (tryFoldZeroHighBits(MI)) { + Changed = true; continue; + } - if (MI.isRegSequence() && tryFoldRegSequence(MI)) + if (MI.isRegSequence() && tryFoldRegSequence(MI)) { + Changed = true; continue; + } - if (MI.isPHI() && tryFoldLCSSAPhi(MI)) + if (MI.isPHI() && tryFoldLCSSAPhi(MI)) { + Changed = true; continue; + } - if (MI.mayLoad() && tryFoldLoad(MI)) + if (MI.mayLoad() && tryFoldLoad(MI)) { + Changed = true; continue; + } if (!TII->isFoldableCopy(MI)) { // Saw an unknown clobber of m0, so we no longer know what it is. @@ -1777,7 +1746,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { // instruction, and not the omod multiply. if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) || !tryFoldOMod(MI)) - tryFoldClamp(MI); + Changed |= tryFoldClamp(MI); continue; } @@ -1788,6 +1757,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { MachineOperand &NewM0Val = MI.getOperand(1); if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) { MI.eraseFromParent(); + Changed = true; continue; } @@ -1817,7 +1787,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (!MI.getOperand(0).getReg().isVirtual()) continue; - foldInstOperand(MI, OpToFold); + Changed |= foldInstOperand(MI, OpToFold); // If we managed to fold all uses of this copy then we might as well // delete it now. @@ -1829,6 +1799,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { auto &SrcOp = InstToErase->getOperand(1); auto SrcReg = SrcOp.isReg() ? 
SrcOp.getReg() : Register(); InstToErase->eraseFromParent(); + Changed = true; InstToErase = nullptr; if (!SrcReg || SrcReg.isPhysical()) break; @@ -1837,9 +1808,11 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { break; } if (InstToErase && InstToErase->isRegSequence() && - MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) + MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) { InstToErase->eraseFromParent(); + Changed = true; + } } } - return true; + return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index 80ee7a00252a..d7ca7f36284b 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -241,7 +241,7 @@ void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI, } // Check register def/use conflicts, occupancy limits and collect def/use maps. -// Return true if instruction can be bundled with previous. It it cannot +// Return true if instruction can be bundled with previous. If it cannot // def/use maps are not updated. bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses, diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 6078f4a0577a..a57e81eb4e4a 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -749,7 +749,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, return; } - const MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -789,19 +789,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, *Reg.FI); } - // VGPRs used for Whole Wave Mode - for (const auto &Reg : FuncInfo->WWMReservedRegs) { - auto VGPR = Reg.first; - auto FI = Reg.second; - if (!FI) - continue; - + for (auto ReservedWWM : FuncInfo->wwmAllocation()) { if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ true); - buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, VGPR, - *FI); + buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, + std::get<0>(ReservedWWM), std::get<1>(ReservedWWM)); } if (ScratchExecCopy) { @@ -813,9 +807,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, LiveRegs.addReg(ScratchExecCopy); } - if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) { - const int FramePtrFI = *FPSaveIndex; - assert(!MFI.isDeadObjectIndex(FramePtrFI)); + auto SaveSGPRToMemory = [&](Register Reg, const int FI) { + assert(!MFI.isDeadObjectIndex(FI)); initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); @@ -825,62 +818,31 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, report_fatal_error("failed to find free scratch register"); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) - .addReg(FramePtrReg); + .addReg(Reg); buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR, - FramePtrFI); - } - - if (BPSaveIndex && spilledToMemory(MF, *BPSaveIndex)) { - const int BasePtrFI = *BPSaveIndex; - assert(!MFI.isDeadObjectIndex(BasePtrFI)); - - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); + FI); + }; - MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - if (!TmpVGPR) - 
report_fatal_error("failed to find free scratch register"); - - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) - .addReg(BasePtrReg); + auto SaveSGPRToVGPRLane = [&](Register Reg, const int FI) { + assert(!MFI.isDeadObjectIndex(FI)); - buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR, - BasePtrFI); - } - - // In this case, spill the FP to a reserved VGPR. - if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) { - const int FramePtrFI = *FPSaveIndex; - assert(!MFI.isDeadObjectIndex(FramePtrFI)); - - assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill); - ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = - FuncInfo->getSGPRToVGPRSpills(FramePtrFI); + assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + ArrayRef<SIRegisterInfo::SpilledReg> Spill = + FuncInfo->getSGPRToVGPRSpills(FI); assert(Spill.size() == 1); - // Save FP before setting it up. BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) - .addReg(FramePtrReg) + .addReg(Reg) .addImm(Spill[0].Lane) .addReg(Spill[0].VGPR, RegState::Undef); - } + }; - // In this case, spill the BP to a reserved VGPR. - if (BPSaveIndex && !spilledToMemory(MF, *BPSaveIndex)) { - const int BasePtrFI = *BPSaveIndex; - assert(!MFI.isDeadObjectIndex(BasePtrFI)); - - assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); - ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = - FuncInfo->getSGPRToVGPRSpills(BasePtrFI); - assert(Spill.size() == 1); - - // Save BP before setting it up. - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) - .addReg(BasePtrReg) - .addImm(Spill[0].Lane) - .addReg(Spill[0].VGPR, RegState::Undef); + if (FPSaveIndex) { + if (spilledToMemory(MF, *FPSaveIndex)) + SaveSGPRToMemory(FramePtrReg, *FPSaveIndex); + else + SaveSGPRToVGPRLane(FramePtrReg, *FPSaveIndex); } // Emit the copy if we need an FP, and are using a free SGPR to save it. @@ -891,6 +853,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } + if (BPSaveIndex) { + if (spilledToMemory(MF, *BPSaveIndex)) + SaveSGPRToMemory(BasePtrReg, *BPSaveIndex); + else + SaveSGPRToVGPRLane(BasePtrReg, *BPSaveIndex); + } + // Emit the copy if we need a BP, and are using a free SGPR to save it. 
if (FuncInfo->SGPRForBPSaveRestoreCopy) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), @@ -1034,56 +1003,44 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameDestroy); } + auto RestoreSGPRFromMemory = [&](Register Reg, const int FI) { + initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); + MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + if (!TmpVGPR) + report_fatal_error("failed to find free scratch register"); + buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, TmpVGPR, + FI); + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .addReg(TmpVGPR, RegState::Kill); + }; + + auto RestoreSGPRFromVGPRLane = [&](Register Reg, const int FI) { + assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + ArrayRef<SIRegisterInfo::SpilledReg> Spill = + FuncInfo->getSGPRToVGPRSpills(FI); + assert(Spill.size() == 1); + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), Reg) + .addReg(Spill[0].VGPR) + .addImm(Spill[0].Lane); + }; + if (FPSaveIndex) { const int FramePtrFI = *FPSaveIndex; assert(!MFI.isDeadObjectIndex(FramePtrFI)); - if (spilledToMemory(MF, FramePtrFI)) { - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); - - MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - if (!TmpVGPR) - report_fatal_error("failed to find free scratch register"); - buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, - TmpVGPR, FramePtrFI); - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg) - .addReg(TmpVGPR, RegState::Kill); - } else { - // Reload from VGPR spill. - assert(MFI.getStackID(FramePtrFI) == TargetStackID::SGPRSpill); - ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = - FuncInfo->getSGPRToVGPRSpills(FramePtrFI); - assert(Spill.size() == 1); - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), FramePtrReg) - .addReg(Spill[0].VGPR) - .addImm(Spill[0].Lane); - } + if (spilledToMemory(MF, FramePtrFI)) + RestoreSGPRFromMemory(FramePtrReg, FramePtrFI); + else + RestoreSGPRFromVGPRLane(FramePtrReg, FramePtrFI); } if (BPSaveIndex) { const int BasePtrFI = *BPSaveIndex; assert(!MFI.isDeadObjectIndex(BasePtrFI)); - if (spilledToMemory(MF, BasePtrFI)) { - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); - - MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - if (!TmpVGPR) - report_fatal_error("failed to find free scratch register"); - buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, - TmpVGPR, BasePtrFI); - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg) - .addReg(TmpVGPR, RegState::Kill); - } else { - // Reload from VGPR spill. 
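The epilogue gets the mirror-image treatment: RestoreSGPRFromMemory reloads the slot into a scratch VGPR and moves it back with v_readfirstlane (valid because the saved value is wave-uniform), while RestoreSGPRFromVGPRLane reads the saved lane back with v_readlane. A sketch of the two restore endings, with names taken from this hunk; the removed lines continuing below are the duplicated base-pointer copy of exactly this logic:

    // From memory: after the scratch reload into TmpVGPR, VGPR -> SGPR.
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
        .addReg(TmpVGPR, RegState::Kill);
    // From a lane spill: read the saved lane straight back into the SGPR.
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), BasePtrReg)
        .addReg(Spill[0].VGPR)
        .addImm(Spill[0].Lane);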
- assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); - ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = - FuncInfo->getSGPRToVGPRSpills(BasePtrFI); - assert(Spill.size() == 1); - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READLANE_B32), BasePtrReg) - .addReg(Spill[0].VGPR) - .addImm(Spill[0].Lane); - } + if (spilledToMemory(MF, BasePtrFI)) + RestoreSGPRFromMemory(BasePtrReg, BasePtrFI); + else + RestoreSGPRFromVGPRLane(BasePtrReg, BasePtrFI); } Register ScratchExecCopy; @@ -1100,18 +1057,13 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameDestroy); } - for (const auto &Reg : FuncInfo->WWMReservedRegs) { - auto VGPR = Reg.first; - auto FI = Reg.second; - if (!FI) - continue; - + for (auto ReservedWWM : FuncInfo->wwmAllocation()) { if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, /*IsProlog*/ false); - buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, VGPR, - *FI); + buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, + std::get<0>(ReservedWWM), std::get<1>(ReservedWWM)); } if (ScratchExecCopy) { @@ -1161,6 +1113,11 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + if (!FuncInfo->isEntryFunction()) { + // Spill VGPRs used for Whole Wave Mode + FuncInfo->allocateWWMReservedSpillSlots(MFI, *TRI); + } + const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs() && EnableSpillVGPRToAGPR; @@ -1200,7 +1157,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( } } - // Stack slot coloring may assign different objets to the same stack slot. + // Stack slot coloring may assign different objects to the same stack slot. // If not, then the VGPR to AGPR spill slot is dead. for (unsigned FI : SpillFIs.set_bits()) if (!NonVGPRSpillFIs.test(FI)) @@ -1229,7 +1186,11 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( } } - FuncInfo->removeDeadFrameIndices(MFI); + // At this point we've already allocated all spilled SGPRs to VGPRs if we + // can. Any remaining SGPR spills will go to memory, so move them back to the + // default stack. + bool HaveSGPRToVMemSpill = + FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true); assert(allSGPRSpillsAreDead(MF) && "SGPR spill should have been removed in SILowerSGPRSpills"); @@ -1241,6 +1202,39 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( // Add an emergency spill slot RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI)); + + // If we are spilling SGPRs to memory with a large frame, we may need a + // second VGPR emergency frame index. + if (HaveSGPRToVMemSpill && + allocateScavengingFrameIndexesNearIncomingSP(MF)) { + RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false)); + } + } +} + +void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced( + MachineFunction &MF, RegScavenger *RS) const { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + + if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) { + // On gfx908, we had initially reserved the highest available VGPR for AGPR + // copy. Now since we are done with RA, check if there exists an unused VGPR + // which is lower than the earlier reserved VGPR before RA.
If one exists, + use it for AGPR copy instead of one reserved before RA. + Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy(); + Register UnusedLowVGPR = + TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) < + TRI->getHWRegIndex(VGPRForAGPRCopy))) { + // Call to setVGPRForAGPRCopy() should happen first before calling + // freezeReservedRegs() so that getReservedRegs() can reserve this newly + // identified VGPR (for AGPR copy). + FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR); + MRI.freezeReservedRegs(MF); + } } } @@ -1333,6 +1327,20 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, // FP will be specially managed like SP. if (WillHaveFP || hasFP(MF)) SavedRegs.reset(MFI->getFrameOffsetReg()); + + // Return address use with return instruction is hidden through the SI_RETURN + // pseudo. Given that, and since IPRA computes actual register usage and + // does not use the CSR list, the clobbering of the return address by function calls + // (D117243) or otherwise (D120922) is ignored/not seen by the IPRA's register + // usage collection. This will ensure save/restore of the return address happens + // in those scenarios. + const MachineRegisterInfo &MRI = MF.getRegInfo(); + Register RetAddrReg = TRI->getReturnAddressReg(MF); + if (!MFI->isEntryFunction() && + (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) { + SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0)); + SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1)); + } } bool SIFrameLowering::assignCalleeSavedSpillSlots( diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index 7949dcfa6632..79154d494e91 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -47,6 +47,9 @@ public: MachineFunction &MF, RegScavenger *RS = nullptr) const override; + void processFunctionBeforeFrameIndicesReplaced( + MachineFunction &MF, RegScavenger *RS = nullptr) const override; + MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e2f4a0896bc3..094d5cd58673 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17,6 +17,7 @@ #include "AMDGPUTargetMachine.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -25,6 +26,7 @@ #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/IR/DiagnosticInfo.h" @@ -136,6 +138,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass); + addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass); + addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass); } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); @@ -151,27 +155,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setBooleanVectorContents(ZeroOrOneBooleanContent); // We need to custom lower vector stores from local memory - setOperationAction(ISD::LOAD, MVT::v2i32, Custom); - setOperationAction(ISD::LOAD, MVT::v3i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); - setOperationAction(ISD::LOAD, MVT::v5i32, Custom); - setOperationAction(ISD::LOAD, MVT::v6i32, Custom); - setOperationAction(ISD::LOAD, MVT::v7i32, Custom); - setOperationAction(ISD::LOAD, MVT::v8i32, Custom); - setOperationAction(ISD::LOAD, MVT::v16i32, Custom); - setOperationAction(ISD::LOAD, MVT::i1, Custom); - setOperationAction(ISD::LOAD, MVT::v32i32, Custom); + setOperationAction(ISD::LOAD, + {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, + MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1, + MVT::v32i32}, + Custom); - setOperationAction(ISD::STORE, MVT::v2i32, Custom); - setOperationAction(ISD::STORE, MVT::v3i32, Custom); - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - setOperationAction(ISD::STORE, MVT::v5i32, Custom); - setOperationAction(ISD::STORE, MVT::v6i32, Custom); - setOperationAction(ISD::STORE, MVT::v7i32, Custom); - setOperationAction(ISD::STORE, MVT::v8i32, Custom); - setOperationAction(ISD::STORE, MVT::v16i32, Custom); - setOperationAction(ISD::STORE, MVT::i1, Custom); - setOperationAction(ISD::STORE, MVT::v32i32, Custom); + setOperationAction(ISD::STORE, + {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, + MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1, + MVT::v32i32}, + Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand); @@ -198,81 +192,57 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand); setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand); - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom); setOperationAction(ISD::SELECT, MVT::i1, Promote); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); - setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT_CC, + {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand); setOperationAction(ISD::SETCC, MVT::i1, Promote); - setOperationAction(ISD::SETCC, MVT::v2i1, Expand); - setOperationAction(ISD::SETCC, MVT::v4i1, Expand); + setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand); AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32); - setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v7f32, Expand); - 
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand); - setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v16f32, Expand); + setOperationAction(ISD::TRUNCATE, + {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, + MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32}, + Expand); + setOperationAction(ISD::FP_ROUND, + {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, + MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32}, + Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, + {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16, + MVT::v3i16, MVT::v4i16, MVT::Other}, + Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); - setOperationAction(ISD::BR_CC, MVT::i1, Expand); - setOperationAction(ISD::BR_CC, MVT::i32, Expand); - setOperationAction(ISD::BR_CC, MVT::i64, Expand); - setOperationAction(ISD::BR_CC, MVT::f32, Expand); - setOperationAction(ISD::BR_CC, MVT::f64, Expand); + setOperationAction(ISD::BR_CC, + {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand); - setOperationAction(ISD::UADDO, MVT::i32, Legal); - setOperationAction(ISD::USUBO, MVT::i32, Legal); + setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal); - setOperationAction(ISD::ADDCARRY, MVT::i32, Legal); - setOperationAction(ISD::SUBCARRY, MVT::i32, Legal); + setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i32, Legal); - setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand); - setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand); - setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand); + setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64, + Expand); #if 0 - setOperationAction(ISD::ADDCARRY, MVT::i64, Legal); - setOperationAction(ISD::SUBCARRY, MVT::i64, Legal); + setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i64, Legal); #endif // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. 
- for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, - MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, - MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, - MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, - MVT::v8i16, MVT::v8f16, MVT::v16i64, MVT::v16f64, - MVT::v32i32, MVT::v32f32 }) { + for (MVT VT : + {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, + MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v3i64, MVT::v3f64, + MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, + MVT::v8f64, MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, + MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -372,94 +342,63 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32); } - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, + {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}, + Expand); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16}, Custom); // Avoid stack access for these. // TODO: Generalize to more vector types. - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom); + setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}, + {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, + MVT::v4i16, MVT::v4f16, MVT::v16i16, MVT::v16f16}, + Custom); // Deal with vec3 vector operations when widened to vec4. - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, + {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom); // Deal with vec5/6/7 vector operations when widened to vec8. 
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, + {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32, + MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32}, + Custom); // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, // and output demarshalling - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom); // We can't return success/failure, only the old value, // let LLVM add the comparison - setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64}, + Expand); - if (Subtarget->hasFlatAddressSpace()) { - setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); - setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); - } + if (Subtarget->hasFlatAddressSpace()) + setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom); - setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); - setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); + setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal); // FIXME: This should be narrowed to i32, but that only happens if i64 is // illegal. // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32. - setOperationAction(ISD::BSWAP, MVT::i64, Legal); - setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal); // On SI this is s_memtime and s_memrealtime on VI. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - setOperationAction(ISD::TRAP, MVT::Other, Custom); - setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom); + setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom); if (Subtarget->has16BitInsts()) { - setOperationAction(ISD::FPOW, MVT::f16, Promote); - setOperationAction(ISD::FPOWI, MVT::f16, Promote); - setOperationAction(ISD::FLOG, MVT::f16, Custom); - setOperationAction(ISD::FEXP, MVT::f16, Custom); - setOperationAction(ISD::FLOG10, MVT::f16, Custom); + setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote); + setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom); } if (Subtarget->hasMadMacF32Insts()) setOperationAction(ISD::FMAD, MVT::f32, Legal); - if (!Subtarget->hasBFI()) { + if (!Subtarget->hasBFI()) // fcopysign can be done in a single instruction with BFI. 
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - } + setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand); if (!Subtarget->hasBCNT(32)) setOperationAction(ISD::CTPOP, MVT::i32, Expand); @@ -467,15 +406,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (!Subtarget->hasBCNT(64)) setOperationAction(ISD::CTPOP, MVT::i64, Expand); - if (Subtarget->hasFFBH()) { - setOperationAction(ISD::CTLZ, MVT::i32, Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); - } + if (Subtarget->hasFFBH()) + setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom); - if (Subtarget->hasFFBL()) { - setOperationAction(ISD::CTTZ, MVT::i32, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); - } + if (Subtarget->hasFFBL()) + setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); // We only really have 32-bit BFE instructions (and 16-bit on VI). // @@ -489,84 +424,48 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setHasExtractBitsInsn(true); // Clamp modifier on add/sub - if (Subtarget->hasIntClamp()) { - setOperationAction(ISD::UADDSAT, MVT::i32, Legal); - setOperationAction(ISD::USUBSAT, MVT::i32, Legal); - } - - if (Subtarget->hasAddNoCarry()) { - setOperationAction(ISD::SADDSAT, MVT::i16, Legal); - setOperationAction(ISD::SSUBSAT, MVT::i16, Legal); - setOperationAction(ISD::SADDSAT, MVT::i32, Legal); - setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); - } + if (Subtarget->hasIntClamp()) + setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal); - setOperationAction(ISD::FMINNUM, MVT::f32, Custom); - setOperationAction(ISD::FMAXNUM, MVT::f32, Custom); - setOperationAction(ISD::FMINNUM, MVT::f64, Custom); - setOperationAction(ISD::FMAXNUM, MVT::f64, Custom); + if (Subtarget->hasAddNoCarry()) + setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32}, + Legal); + setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64}, + Custom); // These are really only legal for ieee_mode functions. We should be avoiding // them for functions that don't have ieee_mode enabled, so just say they are // legal. 
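Most of the churn in this constructor is mechanical: setOperationAction gained overloads taking ArrayRefs of opcodes and of MVTs, so a run of single calls collapses into one call over the cross product. The grouped form just above,

    setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
                       Custom);

covers what previously took four single-opcode, single-type lines, and the same overloads also accept one opcode with a type list or one type with an opcode list. The FMINNUM_IEEE/FMAXNUM_IEEE lines that follow are the same transformation applied again.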
- setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal); - setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal); - setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal); - setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal); + setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, + {MVT::f32, MVT::f64}, Legal); - - if (Subtarget->haveRoundOpsF64()) { - setOperationAction(ISD::FTRUNC, MVT::f64, Legal); - setOperationAction(ISD::FCEIL, MVT::f64, Legal); - setOperationAction(ISD::FRINT, MVT::f64, Legal); - } else { - setOperationAction(ISD::FCEIL, MVT::f64, Custom); - setOperationAction(ISD::FTRUNC, MVT::f64, Custom); - setOperationAction(ISD::FRINT, MVT::f64, Custom); - setOperationAction(ISD::FFLOOR, MVT::f64, Custom); - } + if (Subtarget->haveRoundOpsF64()) + setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FRINT}, MVT::f64, Legal); + else + setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR}, + MVT::f64, Custom); setOperationAction(ISD::FFLOOR, MVT::f64, Legal); - setOperationAction(ISD::FSIN, MVT::f32, Custom); - setOperationAction(ISD::FCOS, MVT::f32, Custom); - setOperationAction(ISD::FDIV, MVT::f32, Custom); + setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); if (Subtarget->has16BitInsts()) { - setOperationAction(ISD::Constant, MVT::i16, Legal); - - setOperationAction(ISD::SMIN, MVT::i16, Legal); - setOperationAction(ISD::SMAX, MVT::i16, Legal); - - setOperationAction(ISD::UMIN, MVT::i16, Legal); - setOperationAction(ISD::UMAX, MVT::i16, Legal); + setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN, + ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT}, + MVT::i16, Legal); - setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote); AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); - setOperationAction(ISD::ROTR, MVT::i16, Expand); - setOperationAction(ISD::ROTL, MVT::i16, Expand); - - setOperationAction(ISD::SDIV, MVT::i16, Promote); - setOperationAction(ISD::UDIV, MVT::i16, Promote); - setOperationAction(ISD::SREM, MVT::i16, Promote); - setOperationAction(ISD::UREM, MVT::i16, Promote); - setOperationAction(ISD::UADDSAT, MVT::i16, Legal); - setOperationAction(ISD::USUBSAT, MVT::i16, Legal); - - setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); - - setOperationAction(ISD::CTTZ, MVT::i16, Promote); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote); - setOperationAction(ISD::CTLZ, MVT::i16, Promote); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote); - setOperationAction(ISD::CTPOP, MVT::i16, Promote); + setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC}, + MVT::i16, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); - - setOperationAction(ISD::BR_CC, MVT::i16, Expand); + setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM, + ISD::UREM, ISD::BITREVERSE, ISD::CTTZ, + ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF, + ISD::CTPOP}, + MVT::i16, Promote); setOperationAction(ISD::LOAD, MVT::i16, Custom); @@ -577,8 +476,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); - setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom); + setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom); // F16 - Constant Actions. 
setOperationAction(ISD::ConstantFP, MVT::f16, Legal); @@ -590,22 +488,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); // F16 - VOP1 Actions. - setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); - setOperationAction(ISD::FCOS, MVT::f16, Custom); - setOperationAction(ISD::FSIN, MVT::f16, Custom); + setOperationAction( + {ISD::FP_ROUND, ISD::FCOS, ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND}, + MVT::f16, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom); + setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote); - setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote); - setOperationAction(ISD::FROUND, MVT::f16, Custom); + setOperationAction( + {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP}, + MVT::f16, Promote); // F16 - VOP2 Actions. - setOperationAction(ISD::BR_CC, MVT::f16, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); + setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand); setOperationAction(ISD::FDIV, MVT::f16, Custom); @@ -615,7 +509,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMAD, MVT::f16, Legal); for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, - MVT::v8f16}) { + MVT::v8f16, MVT::v16i16, MVT::v16f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -639,16 +533,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } // v_perm_b32 can handle either of these. - setOperationAction(ISD::BSWAP, MVT::i16, Legal); - setOperationAction(ISD::BSWAP, MVT::v2i16, Legal); + setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal); setOperationAction(ISD::BSWAP, MVT::v4i16, Custom); // XXX - Do these do anything? Vector constants turn into build_vector. 
- setOperationAction(ISD::Constant, MVT::v2i16, Legal); - setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal); + setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal); - setOperationAction(ISD::UNDEF, MVT::v2i16, Legal); - setOperationAction(ISD::UNDEF, MVT::v2f16, Legal); + setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16}, Legal); setOperationAction(ISD::STORE, MVT::v2i16, Promote); AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32); @@ -692,140 +583,98 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v8f16, Promote); AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32); - setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::LOAD, MVT::v16i16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32); + setOperationAction(ISD::LOAD, MVT::v16f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32); + + setOperationAction(ISD::STORE, MVT::v16i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32); + setOperationAction(ISD::STORE, MVT::v16f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32); + + setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, + MVT::v2i32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand); + setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, + MVT::v4i32, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Expand); + setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, + MVT::v8i32, Expand); - if (!Subtarget->hasVOP3PInsts()) { - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); - } + if (!Subtarget->hasVOP3PInsts()) + setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16}, Custom); setOperationAction(ISD::FNEG, MVT::v2f16, Legal); // This isn't really legal, but this avoids the legalizer unrolling it (and // allows matching fneg (fabs x) patterns) setOperationAction(ISD::FABS, MVT::v2f16, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f16, Custom); - setOperationAction(ISD::FMINNUM, MVT::f16, Custom); - setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal); - setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal); + setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); + setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal); - setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom); - setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom); - setOperationAction(ISD::FMINNUM_IEEE, MVT::v8f16, Custom); - setOperationAction(ISD::FMAXNUM_IEEE, MVT::v8f16, Custom); + setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, + {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom); - setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand); - setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand); - setOperationAction(ISD::FMINNUM, MVT::v8f16, Expand); - setOperationAction(ISD::FMAXNUM, MVT::v8f16, Expand); + setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, + {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand); - for 
(MVT Vec16 : { MVT::v8i16, MVT::v8f16 }) { - setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom); + for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) { + setOperationAction( + {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR}, + Vec16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Expand); } } if (Subtarget->hasVOP3PInsts()) { - setOperationAction(ISD::ADD, MVT::v2i16, Legal); - setOperationAction(ISD::SUB, MVT::v2i16, Legal); - setOperationAction(ISD::MUL, MVT::v2i16, Legal); - setOperationAction(ISD::SHL, MVT::v2i16, Legal); - setOperationAction(ISD::SRL, MVT::v2i16, Legal); - setOperationAction(ISD::SRA, MVT::v2i16, Legal); - setOperationAction(ISD::SMIN, MVT::v2i16, Legal); - setOperationAction(ISD::UMIN, MVT::v2i16, Legal); - setOperationAction(ISD::SMAX, MVT::v2i16, Legal); - setOperationAction(ISD::UMAX, MVT::v2i16, Legal); - - setOperationAction(ISD::UADDSAT, MVT::v2i16, Legal); - setOperationAction(ISD::USUBSAT, MVT::v2i16, Legal); - setOperationAction(ISD::SADDSAT, MVT::v2i16, Legal); - setOperationAction(ISD::SSUBSAT, MVT::v2i16, Legal); - - setOperationAction(ISD::FADD, MVT::v2f16, Legal); - setOperationAction(ISD::FMUL, MVT::v2f16, Legal); - setOperationAction(ISD::FMA, MVT::v2f16, Legal); + setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL, + ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX, + ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT}, + MVT::v2i16, Legal); - setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal); - setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal); + setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE, + ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE}, + MVT::v2f16, Legal); - setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16}, + Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, + {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16, + MVT::v16f16, MVT::v16i16}, + Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom); - - for (MVT VT : { MVT::v4i16, MVT::v8i16 }) { + for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16}) // Split vector operations. 
- setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::ADD, VT, Custom); - setOperationAction(ISD::SUB, VT, Custom); - setOperationAction(ISD::MUL, VT, Custom); - - setOperationAction(ISD::SMIN, VT, Custom); - setOperationAction(ISD::SMAX, VT, Custom); - setOperationAction(ISD::UMIN, VT, Custom); - setOperationAction(ISD::UMAX, VT, Custom); - - setOperationAction(ISD::UADDSAT, VT, Custom); - setOperationAction(ISD::SADDSAT, VT, Custom); - setOperationAction(ISD::USUBSAT, VT, Custom); - setOperationAction(ISD::SSUBSAT, VT, Custom); - } + setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB, + ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, + ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT, + ISD::SSUBSAT}, + VT, Custom); - for (MVT VT : { MVT::v4f16, MVT::v8f16 }) { + for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16}) // Split vector operations. - setOperationAction(ISD::FADD, VT, Custom); - setOperationAction(ISD::FMUL, VT, Custom); - setOperationAction(ISD::FMA, VT, Custom); - setOperationAction(ISD::FCANONICALIZE, VT, Custom); - } - - setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom); - setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom); + setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE}, + VT, Custom); - setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom); - setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom); + setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16}, + Custom); setOperationAction(ISD::FEXP, MVT::v2f16, Custom); - setOperationAction(ISD::SELECT, MVT::v4i16, Custom); - setOperationAction(ISD::SELECT, MVT::v4f16, Custom); + setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16}, Custom); if (Subtarget->hasPackedFP32Ops()) { - setOperationAction(ISD::FADD, MVT::v2f32, Legal); - setOperationAction(ISD::FMUL, MVT::v2f32, Legal); - setOperationAction(ISD::FMA, MVT::v2f32, Legal); - setOperationAction(ISD::FNEG, MVT::v2f32, Legal); - - for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) { - setOperationAction(ISD::FADD, VT, Custom); - setOperationAction(ISD::FMUL, VT, Custom); - setOperationAction(ISD::FMA, VT, Custom); - } + setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG}, + MVT::v2f32, Legal); + setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA}, + {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32}, + Custom); } } - setOperationAction(ISD::FNEG, MVT::v4f16, Custom); - setOperationAction(ISD::FABS, MVT::v4f16, Custom); + setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom); if (Subtarget->has16BitInsts()) { setOperationAction(ISD::SELECT, MVT::v2i16, Promote); @@ -834,107 +683,88 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); } else { // Legalization hack. 
- setOperationAction(ISD::SELECT, MVT::v2i16, Custom); - setOperationAction(ISD::SELECT, MVT::v2f16, Custom); + setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom); - setOperationAction(ISD::FNEG, MVT::v2f16, Custom); - setOperationAction(ISD::FABS, MVT::v2f16, Custom); + setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom); } - for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, - MVT::v8i16, MVT::v8f16 }) { - setOperationAction(ISD::SELECT, VT, Custom); - } + setOperationAction(ISD::SELECT, + {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, + MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}, + Custom); - setOperationAction(ISD::SMULO, MVT::i64, Custom); - setOperationAction(ISD::UMULO, MVT::i64, Custom); + setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); - if (Subtarget->hasMad64_32()) { - setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom); - setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom); - } + if (Subtarget->hasMad64_32()) + setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, + {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, + MVT::v2i16, MVT::v2f16}, + Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3i16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, + {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16, + MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16, + MVT::i16, MVT::i8}, + Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v3i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v3f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); + setOperationAction(ISD::INTRINSIC_VOID, + {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16, + MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16, + MVT::i8}, + Custom); - setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::ADDCARRY); - setTargetDAGCombine(ISD::SUB); - setTargetDAGCombine(ISD::SUBCARRY); - 
setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::FSUB); - setTargetDAGCombine(ISD::FMINNUM); - setTargetDAGCombine(ISD::FMAXNUM); - setTargetDAGCombine(ISD::FMINNUM_IEEE); - setTargetDAGCombine(ISD::FMAXNUM_IEEE); - setTargetDAGCombine(ISD::FMA); - setTargetDAGCombine(ISD::SMIN); - setTargetDAGCombine(ISD::SMAX); - setTargetDAGCombine(ISD::UMIN); - setTargetDAGCombine(ISD::UMAX); - setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::OR); - setTargetDAGCombine(ISD::XOR); - setTargetDAGCombine(ISD::SINT_TO_FP); - setTargetDAGCombine(ISD::UINT_TO_FP); - setTargetDAGCombine(ISD::FCANONICALIZE); - setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine({ISD::ADD, + ISD::ADDCARRY, + ISD::SUB, + ISD::SUBCARRY, + ISD::FADD, + ISD::FSUB, + ISD::FMINNUM, + ISD::FMAXNUM, + ISD::FMINNUM_IEEE, + ISD::FMAXNUM_IEEE, + ISD::FMA, + ISD::SMIN, + ISD::SMAX, + ISD::UMIN, + ISD::UMAX, + ISD::SETCC, + ISD::AND, + ISD::OR, + ISD::XOR, + ISD::SINT_TO_FP, + ISD::UINT_TO_FP, + ISD::FCANONICALIZE, + ISD::SCALAR_TO_VECTOR, + ISD::ZERO_EXTEND, + ISD::SIGN_EXTEND_INREG, + ISD::EXTRACT_VECTOR_ELT, + ISD::INSERT_VECTOR_ELT}); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. - setTargetDAGCombine(ISD::LOAD); - setTargetDAGCombine(ISD::STORE); - setTargetDAGCombine(ISD::ATOMIC_LOAD); - setTargetDAGCombine(ISD::ATOMIC_STORE); - setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); - setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); - setTargetDAGCombine(ISD::ATOMIC_SWAP); - setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); - setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); - setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); - setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); - setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); - setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); - setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); - setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); - setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); - setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); - setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD); - setTargetDAGCombine(ISD::INTRINSIC_VOID); - setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine({ISD::LOAD, + ISD::STORE, + ISD::ATOMIC_LOAD, + ISD::ATOMIC_STORE, + ISD::ATOMIC_CMP_SWAP, + ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, + ISD::ATOMIC_SWAP, + ISD::ATOMIC_LOAD_ADD, + ISD::ATOMIC_LOAD_SUB, + ISD::ATOMIC_LOAD_AND, + ISD::ATOMIC_LOAD_OR, + ISD::ATOMIC_LOAD_XOR, + ISD::ATOMIC_LOAD_NAND, + ISD::ATOMIC_LOAD_MIN, + ISD::ATOMIC_LOAD_MAX, + ISD::ATOMIC_LOAD_UMIN, + ISD::ATOMIC_LOAD_UMAX, + ISD::ATOMIC_LOAD_FADD, + ISD::INTRINSIC_VOID, + ISD::INTRINSIC_W_CHAIN}); // FIXME: In other contexts we pretend this is a per-function property. 
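setTargetDAGCombine received the same list treatment: it now accepts a whole list of node kinds, so the two long runs of registrations in this hunk become two calls. A trivial sketch of the shape:

    // One registration call per related group of combines.
    setTargetDAGCombine({ISD::LOAD, ISD::STORE, ISD::ATOMIC_LOAD,
                         ISD::ATOMIC_STORE});

Registration only tells the DAG combiner which node kinds to hand back to the target's PerformDAGCombine hook; the dispatch side is unchanged by this kind of rewrite. The stack-pointer setup that the FIXME above refers to resumes below.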
setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32); @@ -1118,6 +948,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, unsigned IntrID) const { + Info.flags = MachineMemOperand::MONone; + if (CI.hasMetadata(LLVMContext::MD_invariant_load)) + Info.flags |= MachineMemOperand::MOInvariant; + if (const AMDGPU::RsrcIntrinsic *RsrcIntr = AMDGPU::lookupRsrcIntrinsic(IntrID)) { AttributeList Attr = Intrinsic::getAttributes(CI.getContext(), @@ -1127,16 +961,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const GCNTargetMachine &TM = + static_cast<const GCNTargetMachine &>(getTargetMachine()); + if (RsrcIntr->IsImage) { - Info.ptrVal = - MFI->getImagePSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()); + Info.ptrVal = MFI->getImagePSV(TM); Info.align.reset(); } else { - Info.ptrVal = - MFI->getBufferPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()); + Info.ptrVal = MFI->getBufferPSV(TM); } - Info.flags = MachineMemOperand::MODereferenceable; + Info.flags |= MachineMemOperand::MODereferenceable; if (Attr.hasFnAttr(Attribute::ReadOnly)) { unsigned DMaskLanes = 4; @@ -1178,12 +1013,23 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID : ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable; + Info.flags |= MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable; // XXX - Should this be volatile without known ordering? Info.flags |= MachineMemOperand::MOVolatile; + + switch (IntrID) { + default: + break; + case Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_struct_buffer_load_lds: { + unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); + Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); + return true; + } + } } return true; } @@ -1200,7 +1046,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4)); if (!Vol->isZero()) @@ -1211,12 +1057,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_buffer_atomic_fadd: { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const GCNTargetMachine &TM = + static_cast<const GCNTargetMachine &>(getTargetMachine()); + Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); - Info.ptrVal = - MFI->getBufferPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()); + Info.ptrVal = MFI->getBufferPSV(TM); Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4)); if (!Vol || !Vol->isZero()) @@ -1230,7 +1078,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + Info.flags |= MachineMemOperand::MOLoad | 
MachineMemOperand::MOStore; const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1)); if (!Vol->isZero()) @@ -1243,20 +1091,23 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MOVolatile; + Info.flags |= MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile; return true; } case Intrinsic::amdgcn_image_bvh_intersect_ray: { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT? - Info.ptrVal = - MFI->getImagePSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()); + + const GCNTargetMachine &TM = + static_cast<const GCNTargetMachine &>(getTargetMachine()); + + Info.ptrVal = MFI->getImagePSV(TM); Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MODereferenceable; + Info.flags |= MachineMemOperand::MOLoad | + MachineMemOperand::MODereferenceable; return true; } case Intrinsic::amdgcn_global_atomic_fadd: @@ -1264,15 +1115,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: - case Intrinsic::amdgcn_flat_atomic_fmax: { + case Intrinsic::amdgcn_flat_atomic_fmax: + case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOVolatile; + Info.flags |= MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOVolatile; return true; } case Intrinsic::amdgcn_ds_gws_init: @@ -1283,18 +1136,29 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_ds_gws_sema_release_all: { Info.opc = ISD::INTRINSIC_VOID; + const GCNTargetMachine &TM = + static_cast<const GCNTargetMachine &>(getTargetMachine()); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - Info.ptrVal = - MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()); + Info.ptrVal = MFI->getGWSPSV(TM); // This is an abstract access, but we need to specify a type and size. 
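// The dword type and 4-byte size here are a stand-in rather than a
// description of real memory: every GWS operation points at the same
// pseudo-source value, which is what keeps these operations ordered against
// one another in scheduling and alias analysis.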
Info.memVT = MVT::i32; Info.size = 4; Info.align = Align(4); - Info.flags = MachineMemOperand::MOStore; if (IntrID == Intrinsic::amdgcn_ds_gws_barrier) - Info.flags = MachineMemOperand::MOLoad; + Info.flags |= MachineMemOperand::MOLoad; + else + Info.flags |= MachineMemOperand::MOStore; + return true; + } + case Intrinsic::amdgcn_global_load_lds: { + Info.opc = ISD::INTRINSIC_VOID; + unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); + Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); + Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile; return true; } default: @@ -1319,6 +1183,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: + case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: case Intrinsic::amdgcn_global_atomic_csub: { Value *Ptr = II->getArgOperand(0); AccessTy = II->getType(); @@ -1506,47 +1372,96 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( AddrSpace == AMDGPUAS::REGION_ADDRESS) { // Check if alignment requirements for ds_read/write instructions are // disabled. - if (Subtarget->hasUnalignedDSAccessEnabled() && - !Subtarget->hasLDSMisalignedBug()) { - if (IsFast) - *IsFast = Alignment != Align(2); - return true; - } + if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4)) + return false; + + Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment. + if (Subtarget->hasLDSMisalignedBug() && Size > 32 && + Alignment < RequiredAlignment) + return false; // Either, the alignment requirements are "enabled", or there is an // unaligned LDS access related hardware bug though alignment requirements // are "disabled". In either case, we need to check for proper alignment // requirements. // - if (Size == 64) { + switch (Size) { + case 64: + // SI has a hardware bug in the LDS / GDS bounds checking: if the base + // address is negative, then the instruction is incorrectly treated as + // out-of-bounds even if base + offsets is in bounds. Split vectorized + // loads here to avoid emitting ds_read2_b32. We may re-combine the + // load later in the SILoadStoreOptimizer. + if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8)) + return false; + // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we // can do a 4 byte aligned, 8 byte access in a single operation using // ds_read2/write2_b32 with adjacent offsets. - bool AlignedBy4 = Alignment >= Align(4); - if (IsFast) - *IsFast = AlignedBy4; + RequiredAlignment = Align(4); + + if (Subtarget->hasUnalignedDSAccessEnabled()) { + // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/ + // ds_write2_b32 depending on the alignment. In either case with either + // alignment there is no faster way of doing this. + if (IsFast) + *IsFast = true; + return true; + } + + break; + case 96: + if (!Subtarget->hasDS96AndDS128()) + return false; - return AlignedBy4; - } - if (Size == 96) { // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on // gfx8 and older. - bool AlignedBy16 = Alignment >= Align(16); - if (IsFast) - *IsFast = AlignedBy16; - return AlignedBy16; - } - if (Size == 128) { + if (Subtarget->hasUnalignedDSAccessEnabled()) { + // Naturally aligned access is fastest. However, also report it is Fast + // if memory is aligned less than DWORD. 
A narrow load or store will + be equally slow as a single ds_read_b96/ds_write_b96, but there will + be more of them, so overall we will pay less penalty issuing a single + instruction. + if (IsFast) + *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4); + return true; + } + + break; + case 128: + if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128()) + return false; + // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a // single operation using ds_read2/write2_b64. - bool AlignedBy8 = Alignment >= Align(8); - if (IsFast) - *IsFast = AlignedBy8; + RequiredAlignment = Align(8); + + if (Subtarget->hasUnalignedDSAccessEnabled()) { + // Naturally aligned access is fastest. However, also report it is Fast + // if memory is aligned less than DWORD. A narrow load or store will + // be equally slow as a single ds_read_b128/ds_write_b128, but there + // will be more of them, so overall we will pay less penalty issuing a + // single instruction. + if (IsFast) + *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4); + return true; + } + + break; + default: + if (Size > 32) + return false; - return AlignedBy8; + break; } + + if (IsFast) + *IsFast = Alignment >= RequiredAlignment; + + return Alignment >= RequiredAlignment || + Subtarget->hasUnalignedDSAccessEnabled(); } if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { @@ -1571,14 +1486,12 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( return AlignedBy4; } - if (Subtarget->hasUnalignedBufferAccessEnabled() && - !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS || - AddrSpace == AMDGPUAS::REGION_ADDRESS)) { - // If we have an uniform constant load, it still requires using a slow + if (Subtarget->hasUnalignedBufferAccessEnabled()) { + // If we have a uniform constant load, it still requires using a slow // buffer instruction if unaligned. if (IsFast) { // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so - // 2-byte alignment is worse than 1 unless doing a 2-byte accesss. + // 2-byte alignment is worse than 1 unless doing a 2-byte access. *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ? Alignment >= Align(4) : Alignment != Align(2); @@ -1603,20 +1516,22 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( bool SITargetLowering::allowsMisalignedMemoryAccesses( EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, bool *IsFast) const { - if (IsFast) - *IsFast = false; + bool Allow = allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, + Alignment, Flags, IsFast); - // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, - // which isn't a simple VT. - // Until MVT is extended to handle this, simply check for the size and - // rely on the condition below: allow accesses if the size is a multiple of 4. - if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && - VT.getStoreSize() > 16)) { - return false; + if (Allow && IsFast && Subtarget->hasUnalignedDSAccessEnabled() && + (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || + AddrSpace == AMDGPUAS::REGION_ADDRESS)) { + // Lie that it is fast if +unaligned-access-mode is passed so that DS + // accesses get vectorized. We could use ds_read2_b*/ds_write2_b* + // instructions on misaligned data, which is faster than a pair of + // ds_read_b*/ds_write_b* which would be equally misaligned. 
+ // This is only used by the common passes; selection always calls the + // allowsMisalignedMemoryAccessesImpl version. + *IsFast = true; } - return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, - Alignment, Flags, IsFast); + return Allow; } EVT SITargetLowering::getOptimalMemOpType( @@ -1639,9 +1554,7 @@ EVT SITargetLowering::getOptimalMemOpType( bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { const MemSDNode *MemNode = cast<MemSDNode>(N); - const Value *Ptr = MemNode->getMemOperand()->getValue(); - const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); - return I && I->getMetadata("amdgpu.noclobber"); + return MemNode->getMemOperand()->getFlags() & MONoClobber; } bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) { @@ -1681,6 +1594,15 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return true; } +bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const { + if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) + return false; + + // TODO: Add more cases that are cheap. + return Index == 0; +} + bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { if (Subtarget->has16BitInsts() && VT == MVT::i16) { switch (Op) { @@ -2106,7 +2028,7 @@ void SITargetLowering::allocateSpecialInputSGPRs( if (Info.hasDispatchPtr()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); - if (Info.hasQueuePtr()) + if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. This is a @@ -2153,7 +2075,7 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, CCInfo.AllocateReg(DispatchPtrReg); } - if (Info.hasQueuePtr()) { + if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); @@ -2190,6 +2112,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const { + if (Subtarget->hasUserSGPRInit16Bug()) { + // Pad up the used user SGPRs with dead inputs. + unsigned CurrentUserSGPRs = Info.getNumUserSGPRs(); + + // Note that we do not count the PrivateSegmentWaveByteOffset. We do not + // want to rely on it to reach 16 since, if we end up having no stack + // usage, it will not really be added. 
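// As an illustrative example (the counts are hypothetical, not from a real
// kernel): with 6 user SGPRs already in use and only workgroup ID X enabled,
// NumRequiredSystemSGPRs below is 1, so the loop reserves
// 16 - (1 + 6) = 9 dead user SGPRs to reach the mandated minimum of 16.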
+ unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() + + Info.hasWorkGroupIDY() + + Info.hasWorkGroupIDZ() + + Info.hasWorkGroupInfo(); + for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) { + Register Reg = Info.addReservedUserSGPR(); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + } + if (Info.hasWorkGroupIDX()) { Register Reg = Info.addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); @@ -2234,6 +2174,8 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); } + + assert(!Subtarget->hasUserSGPRInit16Bug() || Info.getNumPreloadedSGPRs() >= 16); } static void reservePrivateMemoryRegs(const TargetMachine &TM, @@ -2388,7 +2330,7 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getEntryNode(); } - Info->allocateModuleLDSGlobal(Fn.getParent()); + Info->allocateModuleLDSGlobal(Fn); SmallVector<ISD::InputArg, 16> Splits; SmallVector<CCValAssign, 16> ArgLocs; @@ -2538,7 +2480,13 @@ SDValue SITargetLowering::LowerFormalArguments( assert(VA.isRegLoc() && "Parameter must be in a register!"); Register Reg = VA.getLocReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + const TargetRegisterClass *RC = nullptr; + if (AMDGPU::VGPR_32RegClass.contains(Reg)) + RC = &AMDGPU::VGPR_32RegClass; + else if (AMDGPU::SGPR_32RegClass.contains(Reg)) + RC = &AMDGPU::SGPR_32RegClass; + else + llvm_unreachable("Unexpected register class in LowerFormalArguments!"); EVT ValVT = VA.getValVT(); Reg = MF.addLiveIn(Reg, RC); @@ -2657,24 +2605,6 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SmallVector<SDValue, 48> RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) - // Add return address for callable functions. - if (!Info->isEntryFunction()) { - const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); - SDValue ReturnAddrReg = CreateLiveInRegister( - DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64); - - SDValue ReturnAddrVirtualReg = - DAG.getRegister(MF.getRegInfo().createVirtualRegister( - CallConv != CallingConv::AMDGPU_Gfx - ? &AMDGPU::CCR_SGPR_64RegClass - : &AMDGPU::Gfx_CCR_SGPR_64RegClass), - MVT::i64); - Chain = - DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag); - Flag = Chain.getValue(1); - RetOps.push_back(ReturnAddrVirtualReg); - } - // Copy the result values into the output registers. for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E; ++I, ++RealRVLocIdx) { @@ -2731,15 +2661,8 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, RetOps.push_back(Flag); unsigned Opc = AMDGPUISD::ENDPGM; - if (!IsWaveEnd) { - if (IsShader) - Opc = AMDGPUISD::RETURN_TO_EPILOG; - else if (CallConv == CallingConv::AMDGPU_Gfx) - Opc = AMDGPUISD::RET_GFX_FLAG; - else - Opc = AMDGPUISD::RET_FLAG; - } - + if (!IsWaveEnd) + Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG; return DAG.getNode(Opc, DL, MVT::Other, RetOps); } @@ -3321,21 +3244,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } - SDValue PhysReturnAddrReg; - if (IsTailCall) { - // Since the return is being combined with the call, we need to pass on the - // return address. 
- - const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); - SDValue ReturnAddrReg = CreateLiveInRegister( - DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64); - - PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF), - MVT::i64); - Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag); - InFlag = Chain.getValue(1); - } - // We don't usually want to end the call-sequence here because we would tidy // the frame up *after* the call, however in the ABI-changing tail-call case // we've carefully laid out the parameters so that when sp is reset they'll be @@ -3365,8 +3273,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // this information must travel along with the operation for eventual // consumption by emitEpilogue. Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); - - Ops.push_back(PhysReturnAddrReg); } // Add argument registers to the end of the list so that they are known live @@ -4104,6 +4010,21 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO); + MachineOperand &Dest = MI.getOperand(0); + MachineOperand &Src0 = MI.getOperand(1); + MachineOperand &Src1 = MI.getOperand(2); + + if (IsAdd && ST.hasLshlAddB64()) { + auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64), + Dest.getReg()) + .add(Src0) + .addImm(0) + .add(Src1); + TII->legalizeOperands(*Add); + MI.eraseFromParent(); + return BB; + } + const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -4112,10 +4033,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( Register CarryReg = MRI.createVirtualRegister(CarryRC); Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); - MachineOperand &Dest = MI.getOperand(0); - MachineOperand &Src0 = MI.getOperand(1); - MachineOperand &Src1 = MI.getOperand(2); - const TargetRegisterClass *Src0RC = Src0.isReg() ? MRI.getRegClass(Src0.getReg()) : &AMDGPU::VReg_64RegClass; @@ -4390,29 +4307,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::DS_GWS_INIT: case AMDGPU::DS_GWS_SEMA_BR: case AMDGPU::DS_GWS_BARRIER: - if (Subtarget->needsAlignedVGPRs()) { - // Add implicit aligned super-reg to force alignment on the data operand. - const DebugLoc &DL = MI.getDebugLoc(); - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); - MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::data0); - Register DataReg = Op->getReg(); - bool IsAGPR = TRI->isAGPR(MRI, DataReg); - Register Undef = MRI.createVirtualRegister( - IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef); - Register NewVR = - MRI.createVirtualRegister(IsAGPR ? 
&AMDGPU::AReg_64_Align2RegClass - : &AMDGPU::VReg_64_Align2RegClass); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), NewVR) - .addReg(DataReg, 0, Op->getSubReg()) - .addImm(AMDGPU::sub0) - .addReg(Undef) - .addImm(AMDGPU::sub1); - Op->setReg(NewVR); - Op->setSubReg(AMDGPU::sub0); - MI.addOperand(MachineOperand::CreateReg(NewVR, false, true)); - } + TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0); LLVM_FALLTHROUGH; case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_P: @@ -4500,6 +4395,18 @@ bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const { return isTypeLegal(VT.getScalarType()); } +bool SITargetLowering::hasAtomicFaddRtnForTy(SDValue &Op) const { + switch (Op.getValue(0).getSimpleValueType().SimpleTy) { + case MVT::f32: + return Subtarget->hasAtomicFaddRtnInsts(); + case MVT::v2f16: + case MVT::f64: + return Subtarget->hasGFX90AInsts(); + default: + return false; + } +} + bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { // This currently forces unfolding various combinations of fsub into fma with // free fneg'd operands. As long as we have fast FMA (controlled by @@ -4560,7 +4467,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, // Otherwise f32 mad is always full rate and returns the same result as // the separate operations so should be preferred over fma. - // However does not support denomals. + // However does not support denormals. if (hasFP32Denormals(MF)) return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); @@ -4653,8 +4560,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || - VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8f32 || - VT == MVT::v16f32 || VT == MVT::v32f32); + VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || + VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4676,8 +4584,9 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || - VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v8f32 || - VT == MVT::v16f32 || VT == MVT::v32f32); + VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 || + VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32); SDValue Lo0, Hi0; SDValue Op0 = Op.getOperand(0); @@ -4738,10 +4647,30 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, DAG); + case ISD::SCALAR_TO_VECTOR: + return lowerSCALAR_TO_VECTOR(Op, DAG); case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG); case ISD::FP_ROUND: return lowerFP_ROUND(Op, DAG); + case ISD::FPTRUNC_ROUND: { + unsigned Opc; + SDLoc DL(Op); + + if (Op.getOperand(0)->getValueType(0) != MVT::f32) + return SDValue(); + + // Get the rounding mode from the last operand + int RoundMode = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + if (RoundMode == (int)RoundingMode::TowardPositive) + Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD; + else if (RoundMode == (int)RoundingMode::TowardNegative) + Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD; + else + return SDValue(); + + return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0)); + } case ISD::TRAP: 
return lowerTRAP(Op, DAG); case ISD::DEBUGTRAP: @@ -5356,7 +5285,7 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, if (IsIEEEMode) return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); - if (VT == MVT::v4f16 || VT == MVT::v8f16) + if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16) return splitBinaryVectorOp(Op, DAG); return Op; } @@ -5439,24 +5368,41 @@ SDValue SITargetLowering::lowerTrapEndpgm( return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); } +SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, + const SDLoc &DL, Align Alignment, ImplicitParameter Param) const { + MachineFunction &MF = DAG.getMachineFunction(); + uint64_t Offset = getImplicitParameterOffset(MF, Param); + SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset); + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment, + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); +} + SDValue SITargetLowering::lowerTrapHsaQueuePtr( SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Chain = Op.getOperand(0); - MachineFunction &MF = DAG.getMachineFunction(); - SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - Register UserSGPR = Info->getQueuePtrUserSGPR(); - SDValue QueuePtr; - if (UserSGPR == AMDGPU::NoRegister) { - // We probably are in a function incorrectly marked with - // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the trap, - // so just use a null pointer. - QueuePtr = DAG.getConstant(0, SL, MVT::i64); + // For code object version 5, QueuePtr is passed through implicit kernarg. + if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + QueuePtr = + loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR); } else { - QueuePtr = CreateLiveInRegister( - DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + Register UserSGPR = Info->getQueuePtrUserSGPR(); + + if (UserSGPR == AMDGPU::NoRegister) { + // We probably are in a function incorrectly marked with + // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the + // trap, so just use a null pointer. + QueuePtr = DAG.getConstant(0, SL, MVT::i64); + } else { + QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, + MVT::i64); + } } SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); @@ -5532,6 +5478,14 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount); } + // For code object version 5, private_base and shared_base are passed through + // implicit kernargs. + if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + ImplicitParameter Param = + (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE; + return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param); + } + MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); Register UserSGPR = Info->getQueuePtrUserSGPR(); @@ -5691,14 +5645,11 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, EVT EltVT = VecVT.getVectorElementType(); unsigned VecSize = VecVT.getSizeInBits(); unsigned EltSize = EltVT.getSizeInBits(); + SDLoc SL(Op); - - assert(VecSize <= 64); - + // Specially handle the case of v4i16 with static indexing. 
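// For example, a constant-index insert of an i16 into a v4i16 is handled by
// bitcasting to v2i32, rewriting only the 32-bit half that holds the target
// lane, and concatenating the halves back, so no dynamic masking is needed.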
unsigned NumElts = VecVT.getVectorNumElements(); - SDLoc SL(Op); auto KIdx = dyn_cast<ConstantSDNode>(Idx); - if (NumElts == 4 && EltSize == 16 && KIdx) { SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec); @@ -5726,35 +5677,41 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat); } + // Static indexing does not lower to stack access, and hence there is no need + // for special custom lowering to avoid stack access. if (isa<ConstantSDNode>(Idx)) return SDValue(); - MVT IntVT = MVT::getIntegerVT(VecSize); - - // Avoid stack access for dynamic indexing. + // Avoid stack access for dynamic indexing by custom lowering to // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec - // Create a congruent vector with the target value in each element so that - // the required element can be masked and ORed into the target vector. - SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, - DAG.getSplatBuildVector(VecVT, SL, InsVal)); + assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits"); + + MVT IntVT = MVT::getIntegerVT(VecSize); + // Convert vector index to bit-index and get the required bit mask. assert(isPowerOf2_32(EltSize)); SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); - - SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT, DAG.getConstant(0xffff, SL, IntVT), ScaledIdx); + // 1. Create a congruent vector with the target value in each element. + SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, + DAG.getSplatBuildVector(VecVT, SL, InsVal)); + + // 2. Mask off all indices except the required index within (1). + SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal); + + // 3. Mask off the required index within the target vector. + SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec); + // 4. Get (2) and (3) ORed into the target vector. 
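// Concretely, for an i16 %val inserted at dynamic index %n into a v4i16
// (IntVT is i64 here): ScaledIdx = %n * 16, BFM = 0xffff << ScaledIdx, and
// the OR below combines (BFM & splat(%val)) with (~BFM & vec), i.e. the
// v_bfm_b32/v_bfi_b32 pattern named in the comment above.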
SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS); + return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI); } @@ -5778,17 +5735,35 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) return Combined; - if (VecSize == 128) { + if (VecSize == 128 || VecSize == 256) { SDValue Lo, Hi; EVT LoVT, HiVT; - SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT); - Lo = - DAG.getBitcast(LoVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, - V2, DAG.getConstant(0, SL, MVT::i32))); - Hi = - DAG.getBitcast(HiVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, - V2, DAG.getConstant(1, SL, MVT::i32))); + + if (VecSize == 128) { + SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec); + Lo = DAG.getBitcast(LoVT, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, + DAG.getConstant(0, SL, MVT::i32))); + Hi = DAG.getBitcast(HiVT, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, + DAG.getConstant(1, SL, MVT::i32))); + } else { + assert(VecSize == 256); + + SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec); + SDValue Parts[4]; + for (unsigned P = 0; P < 4; ++P) { + Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, + DAG.getConstant(P, SL, MVT::i32)); + } + + Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, + Parts[0], Parts[1])); + Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, + Parts[2], Parts[3])); + } + EVT IdxVT = Idx.getValueType(); unsigned NElem = VecVT.getVectorNumElements(); assert(isPowerOf2_32(NElem)); @@ -5800,10 +5775,19 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, assert(VecSize <= 64); + MVT IntVT = MVT::getIntegerVT(VecSize); + + // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly. 
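// Only lane 0 of a SCALAR_TO_VECTOR holds a defined value; any higher lanes
// are undef, so the extract can be answered from the source scalar itself,
// and the any-extend below is safe even though it fills the high bits with
// garbage.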
+ SDValue VecBC = peekThroughBitcasts(Vec); + if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) { + SDValue Src = VecBC.getOperand(0); + Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src); + Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT); + } + unsigned EltSize = EltVT.getSizeInBits(); assert(isPowerOf2_32(EltSize)); - MVT IntVT = MVT::getIntegerVT(VecSize); SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); // Convert vector index to bit-index (* EltSize) @@ -5877,6 +5861,22 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces); } +SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDValue SVal = Op.getOperand(0); + EVT ResultVT = Op.getValueType(); + EVT SValVT = SVal.getValueType(); + SDValue UndefVal = DAG.getUNDEF(SValVT); + SDLoc SL(Op); + + SmallVector<SDValue, 8> VElts; + VElts.push_back(SVal); + for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I) + VElts.push_back(UndefVal); + + return DAG.getBuildVector(ResultVT, SL, VElts); +} + SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); @@ -5906,6 +5906,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, return DAG.getNode(ISD::BITCAST, SL, VT, Blend); } + if (VT == MVT::v16i16 || VT == MVT::v16f16) { + EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), + VT.getVectorNumElements() / 4); + MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); + + SmallVector<SDValue, 4> Parts[4]; + for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) { + for (unsigned P = 0; P < 4; ++P) + Parts[P].push_back(Op.getOperand(I + P * E)); + } + SDValue Casts[4]; + for (unsigned P = 0; P < 4; ++P) { + SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]); + Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec); + } + + SDValue Blend = + DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts); + return DAG.getNode(ISD::BITCAST, SL, VT, Blend); + } + assert(VT == MVT::v2f16 || VT == MVT::v2i16); assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); @@ -6277,6 +6298,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); unsigned IntrOpcode = Intr->BaseOpcode; bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); + bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); SmallVector<EVT, 3> ResultTypes(Op->values()); SmallVector<EVT, 3> OrigResultTypes(Op->values()); @@ -6455,6 +6477,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, // // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. + // + // TODO: we can actually allow partial NSA where the final register is a + // contiguous set of the remaining addresses. + // This could help where there are more addresses than supported. bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3 && VAddrs.size() <= (unsigned)ST->getNSAMaxSize(); @@ -6561,7 +6587,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op, UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32; int Opcode = -1; - if (IsGFX10Plus) { + if (IsGFX11Plus) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, + UseNSA ? AMDGPU::MIMGEncGfx11NSA + : AMDGPU::MIMGEncGfx11Default, + NumVDataDwords, NumVAddrDwords); + } else if (IsGFX10Plus) { Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, UseNSA ? 
AMDGPU::MIMGEncGfx10NSA : AMDGPU::MIMGEncGfx10Default, @@ -6685,6 +6716,32 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, return Loads[0]; } +SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, + unsigned Dim, + const ArgDescriptor &Arg) const { + SDLoc SL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim); + if (MaxID == 0) + return DAG.getConstant(0, SL, MVT::i32); + + SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, + SDLoc(DAG.getEntryNode()), Arg); + + // Don't bother inserting AssertZext for packed IDs since we're emitting the + // masking operations anyway. + // + // TODO: We could assert the top bit is 0 for the source copy. + if (Arg.isMasked()) + return Val; + + // Preserve the known bits after expansion to a copy. + EVT SmallVT = + EVT::getIntegerVT(*DAG.getContext(), 32 - countLeadingZeros(MaxID)); + return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val, + DAG.getValueType(SmallVT)); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -6831,26 +6888,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); case Intrinsic::amdgcn_workitem_id_x: - if (Subtarget->getMaxWorkitemID(MF.getFunction(), 0) == 0) - return DAG.getConstant(0, DL, MVT::i32); - - return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, - SDLoc(DAG.getEntryNode()), - MFI->getArgInfo().WorkItemIDX); + return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX); case Intrinsic::amdgcn_workitem_id_y: - if (Subtarget->getMaxWorkitemID(MF.getFunction(), 1) == 0) - return DAG.getConstant(0, DL, MVT::i32); - - return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, - SDLoc(DAG.getEntryNode()), - MFI->getArgInfo().WorkItemIDY); + return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY); case Intrinsic::amdgcn_workitem_id_z: - if (Subtarget->getMaxWorkitemID(MF.getFunction(), 2) == 0) - return DAG.getConstant(0, DL, MVT::i32); - - return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, - SDLoc(DAG.getEntryNode()), - MFI->getArgInfo().WorkItemIDZ); + return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ); case Intrinsic::amdgcn_wavefrontsize: return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(), SDLoc(Op), MVT::i32); @@ -7157,12 +7199,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction()); unsigned Offset0 = OrderedCountIndex << 2; - unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | - (Instruction << 4); + unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4); if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) Offset1 |= (CountDw - 1) << 6; + if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11) + Offset1 |= ShaderType << 2; + unsigned Offset = Offset0 | (Offset1 << 8); SDValue Ops[] = { @@ -7441,7 +7485,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; case Intrinsic::amdgcn_buffer_atomic_fadd: - if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) { + if (!Op.getValue(0).use_empty() && !hasAtomicFaddRtnForTy(Op)) { DiagnosticInfoUnsupported NoFpRet(DAG.getMachineFunction().getFunction(), "return versions of fp atomics 
not supported", @@ -7609,12 +7653,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return SDValue(); } + const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; const bool Is64 = NodePtr.getValueType() == MVT::i64; const unsigned NumVDataDwords = 4; const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); - const bool UseNSA = Subtarget->hasNSAEncoding() && - NumVAddrDwords <= Subtarget->getNSAMaxSize(); + const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; + const bool UseNSA = + Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize(); const unsigned BaseOpcodes[2][2] = { {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, @@ -7622,12 +7668,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, int Opcode; if (UseNSA) { Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], - AMDGPU::MIMGEncGfx10NSA, NumVDataDwords, - NumVAddrDwords); + IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA + : AMDGPU::MIMGEncGfx10NSA, + NumVDataDwords, NumVAddrDwords); } else { - Opcode = AMDGPU::getMIMGOpcode( - BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10Default, NumVDataDwords, - PowerOf2Ceil(NumVAddrDwords)); + Opcode = + AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], + IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default + : AMDGPU::MIMGEncGfx10Default, + NumVDataDwords, PowerOf2Ceil(NumVAddrDwords)); } assert(Opcode != -1); @@ -7660,15 +7709,36 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } }; - if (Is64) - DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 2); - else + if (UseNSA && IsGFX11Plus) { Ops.push_back(NodePtr); + Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); + Ops.push_back(RayOrigin); + if (IsA16) { + SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes; + DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3); + DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3); + for (unsigned I = 0; I < 3; ++I) { + MergedLanes.push_back(DAG.getBitcast( + MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, + {DirLanes[I], InvDirLanes[I]}))); + } + Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes)); + } else { + Ops.push_back(RayDir); + Ops.push_back(RayInvDir); + } + } else { + if (Is64) + DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, + 2); + else + Ops.push_back(NodePtr); - Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); - packLanes(RayOrigin, true); - packLanes(RayDir, true); - packLanes(RayInvDir, false); + Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); + packLanes(RayOrigin, true); + packLanes(RayDir, true); + packLanes(RayInvDir, false); + } if (!UseNSA) { // Build a single vector containing all the operands so far prepared. @@ -7868,6 +7938,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_exp_compr: { + if (!Subtarget->hasCompressedExport()) { + DiagnosticInfoUnsupported BadIntrin( + DAG.getMachineFunction().getFunction(), + "intrinsic not supported on subtarget", DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + } SDValue Src0 = Op.getOperand(4); SDValue Src1 = Op.getOperand(5); // Hack around illegal type on SI by directly selecting it. 
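// For reference, a sketch of IR that the diagnostic added above now rejects
// on subtargets without compressed exports (operand order per the
// llvm.amdgcn.exp.compr intrinsic: target, enable mask, src0, src1, done,
// vm):
//   call void @llvm.amdgcn.exp.compr.v2f16(
//       i32 0, i32 15, <2 x half> %a, <2 x half> %b, i1 true, i1 true)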
@@ -8110,6 +8186,160 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } + case Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_struct_buffer_load_lds: { + unsigned Opc; + bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds; + unsigned OpOffset = HasVIndex ? 1 : 0; + SDValue VOffset = Op.getOperand(5 + OpOffset); + auto CVOffset = dyn_cast<ConstantSDNode>(VOffset); + bool HasVOffset = !CVOffset || !CVOffset->isZero(); + unsigned Size = Op->getConstantOperandVal(4); + + switch (Size) { + default: + return SDValue(); + case 1: + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET; + break; + case 2: + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET; + break; + case 4: + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; + break; + } + + SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3)); + + SmallVector<SDValue, 8> Ops; + + if (HasVIndex && HasVOffset) + Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL, + { Op.getOperand(5), // VIndex + VOffset })); + else if (HasVIndex) + Ops.push_back(Op.getOperand(5)); + else if (HasVOffset) + Ops.push_back(VOffset); + + Ops.push_back(Op.getOperand(2)); // rsrc + Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset + Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset + unsigned Aux = Op.getConstantOperandVal(8 + OpOffset); + Ops.push_back( + DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol + Ops.push_back( + DAG.getTargetConstant((Aux >> 3) & 1, DL, MVT::i8)); // swz + Ops.push_back(M0Val.getValue(0)); // Chain + Ops.push_back(M0Val.getValue(1)); // Glue + + auto *M = cast<MemSDNode>(Op); + MachineMemOperand *LoadMMO = M->getMemOperand(); + MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); + LoadPtrI.Offset = Op->getConstantOperandVal(7 + OpOffset); + MachinePointerInfo StorePtrI = LoadPtrI; + StorePtrI.V = nullptr; + StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; + + auto F = LoadMMO->getFlags() & + ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); + LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, + Size, LoadMMO->getBaseAlign()); + + MachineMemOperand *StoreMMO = + MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, + sizeof(int32_t), LoadMMO->getBaseAlign()); + + auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops); + DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); + + return SDValue(Load, 0); + } + case Intrinsic::amdgcn_global_load_lds: { + unsigned Opc; + unsigned Size = Op->getConstantOperandVal(4); + switch (Size) { + default: + return SDValue(); + case 1: + Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE; + break; + case 2: + Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT; + break; + case 4: + Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; + break; + } + + auto *M = cast<MemSDNode>(Op); + SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3)); + + SmallVector<SDValue, 6> Ops; + + SDValue Addr = Op.getOperand(2); // Global ptr + SDValue VOffset; + // Try to split SAddr and 
VOffset. Global and LDS pointers share the same + // immediate offset, so we cannot use a regular SelectGlobalSAddr(). + if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) { + SDValue LHS = Addr.getOperand(0); + SDValue RHS = Addr.getOperand(1); + + if (LHS->isDivergent()) + std::swap(LHS, RHS); + + if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND && + RHS.getOperand(0).getValueType() == MVT::i32) { + // add (i64 sgpr), (zero_extend (i32 vgpr)) + Addr = LHS; + VOffset = RHS.getOperand(0); + } + } + + Ops.push_back(Addr); + if (!Addr->isDivergent()) { + Opc = AMDGPU::getGlobalSaddrOp(Opc); + if (!VOffset) + VOffset = SDValue( + DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, + DAG.getTargetConstant(0, DL, MVT::i32)), 0); + Ops.push_back(VOffset); + } + + Ops.push_back(Op.getOperand(5)); // Offset + Ops.push_back(Op.getOperand(6)); // CPol + Ops.push_back(M0Val.getValue(0)); // Chain + Ops.push_back(M0Val.getValue(1)); // Glue + + MachineMemOperand *LoadMMO = M->getMemOperand(); + MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); + LoadPtrI.Offset = Op->getConstantOperandVal(5); + MachinePointerInfo StorePtrI = LoadPtrI; + LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; + StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; + auto F = LoadMMO->getFlags() & + ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); + LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, + Size, LoadMMO->getBaseAlign()); + MachineMemOperand *StoreMMO = + MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, + sizeof(int32_t), Align(4)); + + auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); + DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); + + return SDValue(Load, 0); + } case Intrinsic::amdgcn_end_cf: return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, Op->getOperand(2), Chain), 0); @@ -8271,7 +8501,7 @@ static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; - if (Ld->getAlignment() < 4 || Ld->isDivergent()) + if (Ld->getAlign() < Align(4) || Ld->isDivergent()) return SDValue(); // FIXME: Constant loads should all be marked invariant. @@ -8296,14 +8526,11 @@ SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const // TODO: Drop only high part of range. 
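// Passing nullptr for the range metadata below is deliberate: the original
// !range described only the narrow value, and the widened i32 result would
// not honor it in its high bits.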
SDValue Ptr = Ld->getBasePtr(); - SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, - MVT::i32, SL, Ld->getChain(), Ptr, - Ld->getOffset(), - Ld->getPointerInfo(), MVT::i32, - Ld->getAlignment(), - Ld->getMemOperand()->getFlags(), - Ld->getAAInfo(), - nullptr); // Drop ranges + SDValue NewLoad = DAG.getLoad( + ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr, + Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(), + Ld->getMemOperand()->getFlags(), Ld->getAAInfo(), + nullptr); // Drop ranges EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); if (MemVT.isFloatingPoint()) { @@ -8392,17 +8619,16 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); - unsigned Alignment = Load->getAlignment(); + Align Alignment = Load->getAlign(); unsigned AS = Load->getAddressSpace(); - if (Subtarget->hasLDSMisalignedBug() && - AS == AMDGPUAS::FLAT_ADDRESS && - Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) { + if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && + Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) { return SplitVectorLoad(Op, DAG); } MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - // If there is a possibilty that flat instruction access scratch memory + // If there is a possibility that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. if (AS == AMDGPUAS::FLAT_ADDRESS && !Subtarget->hasMultiDwordFlatScratchAddressing()) @@ -8413,7 +8639,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { - if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) { + if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) { if (MemVT.isPow2VectorType()) return SDValue(); return WidenOrSplitVectorLoad(Op, DAG); @@ -8429,7 +8655,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) && - Alignment >= 4 && NumElements < 32) { + Alignment >= Align(4) && NumElements < 32) { if (MemVT.isPow2VectorType()) return SDValue(); return WidenOrSplitVectorLoad(Op, DAG); @@ -8479,27 +8705,15 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("unsupported private_element_size"); } } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { - // Use ds_read_b128 or ds_read_b96 when possible. 
- if (Subtarget->hasDS96AndDS128() && - ((Subtarget->useDS128() && MemVT.getStoreSize() == 16) || - MemVT.getStoreSize() == 12) && - allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS, - Load->getAlign())) + bool Fast = false; + auto Flags = Load->getMemOperand()->getFlags(); + if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS, + Load->getAlign(), Flags, &Fast) && + Fast) return SDValue(); - if (NumElements > 2) + if (MemVT.isVector()) return SplitVectorLoad(Op, DAG); - - // SI has a hardware bug in the LDS / GDS boounds checking: if the base - // address is negative, then the instruction is incorrectly treated as - // out-of-bounds even if base + offsets is in bounds. Split vectorized - // loads here to avoid emitting ds_read2_b32. We may re-combine the - // load later in the SILoadStoreOptimizer. - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && - NumElements == 2 && MemVT.getStoreSize() == 8 && - Load->getAlignment() < 8) { - return SplitVectorLoad(Op, DAG); - } } if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), @@ -8514,7 +8728,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - if (VT.getSizeInBits() == 128) + if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) return splitTernaryVectorOp(Op, DAG); assert(VT.getSizeInBits() == 64); @@ -8946,13 +9160,13 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { unsigned AS = Store->getAddressSpace(); if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && - Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) { + Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) { return SplitVectorStore(Op, DAG); } MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - // If there is a possibilty that flat instruction access scratch memory + // If there is a possibility that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. if (AS == AMDGPUAS::FLAT_ADDRESS && !Subtarget->hasMultiDwordFlatScratchAddressing()) @@ -8990,39 +9204,21 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("unsupported private_element_size"); } } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { - // Use ds_write_b128 or ds_write_b96 when possible. - if (Subtarget->hasDS96AndDS128() && - ((Subtarget->useDS128() && VT.getStoreSize() == 16) || - (VT.getStoreSize() == 12)) && - allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS, - Store->getAlign())) + bool Fast = false; + auto Flags = Store->getMemOperand()->getFlags(); + if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS, + Store->getAlign(), Flags, &Fast) && + Fast) return SDValue(); - if (NumElements > 2) + if (VT.isVector()) return SplitVectorStore(Op, DAG); - // SI has a hardware bug in the LDS / GDS boounds checking: if the base - // address is negative, then the instruction is incorrectly treated as - // out-of-bounds even if base + offsets is in bounds. Split vectorized - // stores here to avoid emitting ds_write2_b32. We may re-combine the - // store later in the SILoadStoreOptimizer. 
- if (!Subtarget->hasUsableDSOffset() && - NumElements == 2 && VT.getStoreSize() == 8 && - Store->getAlignment() < 8) { - return SplitVectorStore(Op, DAG); - } - - if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - VT, *Store->getMemOperand())) { - if (VT.isVector()) - return SplitVectorStore(Op, DAG); - return expandUnalignedStore(Store, DAG); - } - - return SDValue(); - } else { - llvm_unreachable("unhandled address space"); + return expandUnalignedStore(Store, DAG); } + + // Probably an invalid store. If so we'll end up emitting a selection error. + return SDValue(); } SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { @@ -10041,7 +10237,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine( } } - // If one half is undef, and one is constant, perfer a splat vector rather + // If one half is undef, and one is constant, prefer a splat vector rather // than the normal qNaN. If it's a register, prefer 0.0 since that's // cheaper to use and may be free with a packed operation. if (NewElts[0].isUndef()) { @@ -10349,7 +10545,8 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, // expanded into a set of cmp/select instructions. bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, - bool IsDivergentIdx) { + bool IsDivergentIdx, + const GCNSubtarget *Subtarget) { if (UseDivergentRegisterIndexing) return false; @@ -10371,10 +10568,18 @@ bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize, // Large vectors would yield too many compares and v_cndmask_b32 instructions. unsigned NumInsts = NumElem /* Number of compares */ + ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */; - return NumInsts <= 16; + + // On some architectures (GFX9) movrel is not available and it's better + // to expand. + if (!Subtarget->hasMovrel()) + return NumInsts <= 16; + + // If movrel is available, use it instead of expanding for vector of 8 + // elements. + return NumInsts <= 15; } -static bool shouldExpandVectorDynExt(SDNode *N) { +bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const { SDValue Idx = N->getOperand(N->getNumOperands() - 1); if (isa<ConstantSDNode>(Idx)) return false; @@ -10385,8 +10590,8 @@ static bool shouldExpandVectorDynExt(SDNode *N) { unsigned EltSize = EltVT.getSizeInBits(); unsigned NumElem = VecVT.getVectorNumElements(); - return SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, - Idx->isDivergent()); + return SITargetLowering::shouldExpandVectorDynExt( + EltSize, NumElem, Idx->isDivergent(), getSubtarget()); } SDValue SITargetLowering::performExtractVectorEltCombine( @@ -10450,7 +10655,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine( unsigned EltSize = EltVT.getSizeInBits(); // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx) - if (::shouldExpandVectorDynExt(N)) { + if (shouldExpandVectorDynExt(N)) { SDLoc SL(N); SDValue Idx = N->getOperand(1); SDValue V; @@ -10513,7 +10718,7 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N, // INSERT_VECTOR_ELT (<n x e>, var-idx) // => BUILD_VECTOR n x select (e, const-idx) - if (!::shouldExpandVectorDynExt(N)) + if (!shouldExpandVectorDynExt(N)) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -10603,39 +10808,145 @@ static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad); } -SDValue SITargetLowering::performAddCombine(SDNode *N, +// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high +// multiplies, if any. 
+// +// Full 64-bit multiplies that feed into an addition are lowered here instead +// of using the generic expansion. The generic expansion ends up with +// a tree of ADD nodes that prevents us from using the "add" part of the +// MAD instruction. The expansion produced here results in a chain of ADDs +// instead of a tree. +SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { + assert(N->getOpcode() == ISD::ADD); + SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); SDLoc SL(N); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) - && Subtarget->hasMad64_32() && - !VT.isVector() && VT.getScalarSizeInBits() > 32 && - VT.getScalarSizeInBits() <= 64) { - if (LHS.getOpcode() != ISD::MUL) - std::swap(LHS, RHS); + if (VT.isVector()) + return SDValue(); + + // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall + // result in scalar registers for uniform values. + if (!N->isDivergent() && Subtarget->hasSMulHi()) + return SDValue(); + + unsigned NumBits = VT.getScalarSizeInBits(); + if (NumBits <= 32 || NumBits > 64) + return SDValue(); + + if (LHS.getOpcode() != ISD::MUL) { + assert(RHS.getOpcode() == ISD::MUL); + std::swap(LHS, RHS); + } + + // Avoid the fold if it would unduly increase the number of multiplies due to + // multiple uses, except on hardware with full-rate multiply-add (which is + // part of full-rate 64-bit ops). + if (!Subtarget->hasFullRate64Ops()) { + unsigned NumUsers = 0; + for (SDNode *Use : LHS->uses()) { + // There is a use that does not feed into addition, so the multiply can't + // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. + if (Use->getOpcode() != ISD::ADD) + return SDValue(); + + // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer + // MUL + 3xADD + 3xADDC over 3xMAD. + ++NumUsers; + if (NumUsers >= 3) + return SDValue(); + } + } + + SDValue MulLHS = LHS.getOperand(0); + SDValue MulRHS = LHS.getOperand(1); + SDValue AddRHS = RHS; + + // Always check whether operands are small unsigned values, since that + // knowledge is useful in more cases. Check for small signed values only if + // doing so can unlock a shorter code sequence. + bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32; + bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32; + + bool MulSignedLo = false; + if (!MulLHSUnsigned32 || !MulRHSUnsigned32) { + MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 && + numBitsSigned(MulRHS, DAG) <= 32; + } + + // The operands and final result all have the same number of bits. If + // operands need to be extended, they can be extended with garbage. The + // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is + // truncated away in the end. + if (VT != MVT::i64) { + MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS); + MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS); + AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS); + } + + // The basic code generated is conceptually straightforward. Pseudo code: + // + // accum = mad_64_32 lhs.lo, rhs.lo, accum + // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi + // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi + // + // The second and third lines are optional, depending on whether the factors + // are {sign,zero}-extended or not. + // + // The actual DAG is noisier than the pseudo code, but only due to + // instructions that disassemble values into low and high parts, and + // assemble the final result. 
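  // As a worked model of the pseudo code above, here is an illustrative
  // host-side C++ sketch of the same arithmetic (invented helper names, not
  // DAG-building code from this patch):
  //
  //   uint64_t mad64_32_u(uint32_t a, uint32_t b, uint64_t c) {
  //     return (uint64_t)a * b + c;             // what v_mad_u64_u32 computes
  //   }
  //   uint64_t mul64Add(uint64_t l, uint64_t r, uint64_t acc) {
  //     uint64_t A  = mad64_32_u((uint32_t)l, (uint32_t)r, acc);
  //     uint32_t Hi = (uint32_t)(A >> 32);
  //     Hi += (uint32_t)(l >> 32) * (uint32_t)r; // mul lhs.hi, rhs.lo
  //     Hi += (uint32_t)l * (uint32_t)(r >> 32); // mul lhs.lo, rhs.hi
  //     return ((uint64_t)Hi << 32) | (uint32_t)A;
  //   }
  //
  // The hi*hi partial product never appears: it only affects bits 64 and
  // above, which are truncated away from a 64-bit (or narrower) result.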
+ SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + SDValue One = DAG.getConstant(1, SL, MVT::i32); + + auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS); + auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS); + SDValue Accum = + getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo); - SDValue MulLHS = LHS.getOperand(0); - SDValue MulRHS = LHS.getOperand(1); - SDValue AddRHS = RHS; + if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) { + auto AccumLo = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, Zero); + auto AccumHi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, One); - // TODO: Maybe restrict if SGPR inputs. - if (numBitsUnsigned(MulLHS, DAG) <= 32 && - numBitsUnsigned(MulRHS, DAG) <= 32) { - MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32); - MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32); - AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64); - return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false); + if (!MulLHSUnsigned32) { + auto MulLHSHi = + DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One); + SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo); + AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi); } - if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) { - MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32); - MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32); - AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64); - return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true); + if (!MulRHSUnsigned32) { + auto MulRHSHi = + DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One); + SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi); + AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi); + } + + Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi}); + Accum = DAG.getBitcast(MVT::i64, Accum); + } + + if (VT != MVT::i64) + Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum); + return Accum; +} + +SDValue SITargetLowering::performAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) { + if (Subtarget->hasMad64_32()) { + if (SDValue Folded = tryFoldToMad64_32(N, DCI)) + return Folded; } return SDValue(); @@ -10763,7 +11074,7 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N, SDValue RHS = N->getOperand(1); // These should really be instruction patterns, but writing patterns with - // source modiifiers is a pain. + // source modifiers is a pain. // fadd (fadd (a, a), b) -> mad 2.0, a, b if (LHS.getOpcode() == ISD::FADD) { @@ -10860,8 +11171,8 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, return SDValue(); // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, - // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract - // is sufficient to allow generaing fdot2. + // regardless of the denorm mode setting. Therefore, + // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2. const TargetOptions &Options = DAG.getTarget().Options; if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || (N->getFlags().hasAllowContract() && @@ -11562,7 +11873,7 @@ void SITargetLowering::AddIMGInit(MachineInstr &MI) const { if (DstSize < InitIdx) return; - // Create a register for the intialization value. 
+ // Create a register for the initialization value. Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); unsigned NewDst = 0; // Final initialized value will be in here @@ -11608,7 +11919,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, TII->legalizeOperandsVOP3(MRI, MI); // Prefer VGPRs over AGPRs in mAI instructions where possible. - // This saves a chain-copy of registers and better ballance register + // This saves a chain-copy of registers and better balance register // use between vgpr and agpr as agpr tuples tend to be big. if (MI.getDesc().OpInfo) { unsigned Opc = MI.getOpcode(); @@ -11633,54 +11944,29 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, // so no use checks are needed. MRI.setRegClass(Op.getReg(), NewRC); } - } - - return; - } - // Replace unused atomics with the no return version. - int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode()); - if (NoRetAtomicOp != -1) { - if (!Node->hasAnyUseOfValue(0)) { - int CPolIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::cpol); - if (CPolIdx != -1) { - MachineOperand &CPol = MI.getOperand(CPolIdx); - CPol.setImm(CPol.getImm() & ~AMDGPU::CPol::GLC); + // Resolve the rest of AV operands to AGPRs. + if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) { + if (Src2->isReg() && Src2->getReg().isVirtual()) { + auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg()); + if (TRI->isVectorSuperClass(RC)) { + auto *NewRC = TRI->getEquivalentAGPRClass(RC); + MRI.setRegClass(Src2->getReg(), NewRC); + if (Src2->isTied()) + MRI.setRegClass(MI.getOperand(0).getReg(), NewRC); + } + } } - MI.RemoveOperand(0); - MI.setDesc(TII->get(NoRetAtomicOp)); - return; } - // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg - // instruction, because the return type of these instructions is a vec2 of - // the memory type, so it can be tied to the input operand. - // This means these instructions always have a use, so we need to add a - // special case to check if the atomic has only one extract_subreg use, - // which itself has no uses. - if ((Node->hasNUsesOfValue(1, 0) && - Node->use_begin()->isMachineOpcode() && - Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG && - !Node->use_begin()->hasAnyUseOfValue(0))) { - Register Def = MI.getOperand(0).getReg(); - - // Change this into a noret atomic. - MI.setDesc(TII->get(NoRetAtomicOp)); - MI.RemoveOperand(0); - - // If we only remove the def operand from the atomic instruction, the - // extract_subreg will be left with a use of a vreg without a def. - // So we need to insert an implicit_def to avoid machine verifier - // errors. 
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), - TII->get(AMDGPU::IMPLICIT_DEF), Def); - } return; } - if (TII->isMIMG(MI) && !MI.mayStore()) - AddIMGInit(MI); + if (TII->isMIMG(MI)) { + if (!MI.mayStore()) + AddIMGInit(MI); + TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr); + } } static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, @@ -12243,13 +12529,17 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { MachineBasicBlock *Exit = ML->getExitBlock(); if (Pre && Exit) { - BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(), - TII->get(AMDGPU::S_INST_PREFETCH)) - .addImm(1); // prefetch 2 lines behind PC + auto PreTerm = Pre->getFirstTerminator(); + if (PreTerm == Pre->begin() || + std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH) + BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH)) + .addImm(1); // prefetch 2 lines behind PC - BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(), - TII->get(AMDGPU::S_INST_PREFETCH)) - .addImm(2); // prefetch 1 line behind PC + auto ExitHead = Exit->getFirstNonDebugInstr(); + if (ExitHead == Exit->end() || + ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH) + BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH)) + .addImm(2); // prefetch 1 line behind PC } return CacheLineAlign; @@ -12390,6 +12680,9 @@ static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) { TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { + unsigned AS = RMW->getPointerAddressSpace(); + if (AS == AMDGPUAS::PRIVATE_ADDRESS) + return AtomicExpansionKind::NotAtomic; auto ReportUnsafeHWInst = [&](TargetLowering::AtomicExpansionKind Kind) { OptimizationRemarkEmitter ORE(RMW->getFunction()); @@ -12421,10 +12714,11 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy())) return AtomicExpansionKind::CmpXChg; - unsigned AS = RMW->getPointerAddressSpace(); - if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) && - Subtarget->hasAtomicFaddInsts()) { + Subtarget->hasAtomicFaddNoRtnInsts()) { + if (Subtarget->hasGFX940Insts()) + return AtomicExpansionKind::None; + // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe // floating point atomic instructions. May generate more efficient code, // but may not respect rounding and denormal modes, and may give incorrect @@ -12453,8 +12747,8 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { : AtomicExpansionKind::CmpXChg; } - // DS FP atomics do repect the denormal mode, but the rounding mode is fixed - // to round-to-nearest-even. + // DS FP atomics do respect the denormal mode, but the rounding mode is + // fixed to round-to-nearest-even. // The only exception is DS_ADD_F64 which never flushes regardless of mode. if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) { if (!Ty->isDoubleTy()) @@ -12479,6 +12773,27 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); } +TargetLowering::AtomicExpansionKind +SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS + ? 
AtomicExpansionKind::NotAtomic + : AtomicExpansionKind::None; +} + +TargetLowering::AtomicExpansionKind +SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS + ? AtomicExpansionKind::NotAtomic + : AtomicExpansionKind::None; +} + +TargetLowering::AtomicExpansionKind +SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { + return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS + ? AtomicExpansionKind::NotAtomic + : AtomicExpansionKind::None; +} + const TargetRegisterClass * SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false); @@ -12500,7 +12815,7 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { // always uniform. static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited, unsigned WaveSize) { - // FIXME: We asssume we never cast the mask results of a control flow + // FIXME: We assume we never cast the mask results of a control flow // intrinsic. // Early exit if the type won't be consistent as a compile time hack. IntegerType *IT = dyn_cast<IntegerType>(V->getType()); @@ -12604,7 +12919,7 @@ bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const { if (!N0.hasOneUse()) return false; - // Take care of the oportunity to keep N0 uniform + // Take care of the opportunity to keep N0 uniform if (N0->isDivergent() || !N1->isDivergent()) return true; // Check if we have a good chance to form the memory access pattern with the @@ -12612,3 +12927,11 @@ bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, return (DAG.isBaseWithConstantOffset(N0) && hasMemSDNodeUser(*N0->use_begin())); } + +MachineMemOperand::Flags +SITargetLowering::getTargetMMOFlags(const Instruction &I) const { + // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load. 
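  // For example (illustrative IR, not taken from this patch), a load that
  // AMDGPUAnnotateUniformValues proved clobber-free,
  //   %v = load i32, i32 addrspace(1)* %p, !amdgpu.noclobber !0
  // gets the target-specific MONoClobber flag on its MachineMemOperand.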
+ if (I.getMetadata("amdgpu.noclobber")) + return MONoClobber; + return MachineMemOperand::MONone; +} diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index bf81e082b478..4fbccf0c5850 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -53,6 +53,9 @@ private: uint64_t Offset, Align Alignment, bool Signed, const ISD::InputArg *Arg = nullptr) const; + SDValue loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, const SDLoc &DL, + Align Alignment, + ImplicitParameter Param) const; SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, const SDLoc &SL, SDValue Chain, @@ -76,6 +79,9 @@ private: SDValue lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const; + SDValue lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim, + const ArgDescriptor &ArgDesc) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; @@ -145,6 +151,7 @@ private: SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; @@ -191,6 +198,7 @@ private: SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; + SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -227,7 +235,10 @@ public: /// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be /// expanded into a set of cmp/select instructions. 
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, - bool IsDivergentIdx); + bool IsDivergentIdx, + const GCNSubtarget *Subtarget); + + bool shouldExpandVectorDynExt(SDNode *N) const; private: // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the @@ -310,6 +321,9 @@ public: bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; + bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const override; + bool isTypeDesirableForOp(unsigned Op, EVT VT) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; @@ -380,6 +394,7 @@ public: MachineBasicBlock *BB) const override; bool hasBitPreservingFPLogic(EVT VT) const override; + bool hasAtomicFaddRtnForTy(SDValue &Op) const; bool enableAggressiveFMAFusion(EVT VT) const override; bool enableAggressiveFMAFusion(LLT Ty) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, @@ -466,6 +481,10 @@ public: bool SNaN = false, unsigned Depth = 0) const override; AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; + AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + AtomicExpansionKind + shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override; @@ -505,6 +524,9 @@ public: std::pair<InstructionCost, MVT> getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const; + + MachineMemOperand::Flags + getTargetMMOFlags(const Instruction &I) const override; }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp index 125f006a1d1d..50f8ad4433c6 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp @@ -35,6 +35,7 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; @@ -42,11 +43,39 @@ using namespace llvm; namespace { +// A clause length of 64 instructions could be encoded in the s_clause +// instruction, but the hardware documentation (at least for GFX11) says that +// 63 is the maximum allowed. +constexpr unsigned MaxInstructionsInClause = 63; + enum HardClauseType { + // For GFX10: + // Texture, buffer, global or scratch memory instructions. HARDCLAUSE_VMEM, // Flat (not global or scratch) memory instructions. HARDCLAUSE_FLAT, + + // For GFX11: + + // Texture memory instructions. + HARDCLAUSE_MIMG_LOAD, + HARDCLAUSE_MIMG_STORE, + HARDCLAUSE_MIMG_ATOMIC, + HARDCLAUSE_MIMG_SAMPLE, + // Buffer, global or scratch memory instructions. + HARDCLAUSE_VMEM_LOAD, + HARDCLAUSE_VMEM_STORE, + HARDCLAUSE_VMEM_ATOMIC, + // Flat (not global or scratch) memory instructions. + HARDCLAUSE_FLAT_LOAD, + HARDCLAUSE_FLAT_STORE, + HARDCLAUSE_FLAT_ATOMIC, + // BVH instructions. + HARDCLAUSE_BVH, + + // Common: + // Instructions that access LDS. HARDCLAUSE_LDS, // Scalar memory instructions. @@ -78,19 +107,43 @@ public: } HardClauseType getHardClauseType(const MachineInstr &MI) { - - // On current architectures we only get a benefit from clausing loads. 
-    if (MI.mayLoad()) {
-      if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) {
-        if (ST->hasNSAClauseBug()) {
+    if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) {
+      if (ST->getGeneration() == AMDGPUSubtarget::GFX10) {
+        if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) {
+          if (ST->hasNSAClauseBug()) {
+            const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
+            if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA)
+              return HARDCLAUSE_ILLEGAL;
+          }
+          return HARDCLAUSE_VMEM;
+        }
+        if (SIInstrInfo::isFLAT(MI))
+          return HARDCLAUSE_FLAT;
+      } else {
+        assert(ST->getGeneration() >= AMDGPUSubtarget::GFX11);
+        if (SIInstrInfo::isMIMG(MI)) {
           const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
-          if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA)
-            return HARDCLAUSE_ILLEGAL;
+          const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
+              AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
+          if (BaseInfo->BVH)
+            return HARDCLAUSE_BVH;
+          if (BaseInfo->Sampler)
+            return HARDCLAUSE_MIMG_SAMPLE;
+          return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_MIMG_ATOMIC
+                                              : HARDCLAUSE_MIMG_LOAD
+                              : HARDCLAUSE_MIMG_STORE;
+        }
+        if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) {
+          return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_VMEM_ATOMIC
+                                              : HARDCLAUSE_VMEM_LOAD
+                              : HARDCLAUSE_VMEM_STORE;
+        }
+        if (SIInstrInfo::isFLAT(MI)) {
+          return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_FLAT_ATOMIC
+                                              : HARDCLAUSE_FLAT_LOAD
+                              : HARDCLAUSE_FLAT_STORE;
+        }
-        return HARDCLAUSE_VMEM;
       }
-      if (SIInstrInfo::isFLAT(MI))
-        return HARDCLAUSE_FLAT;
       // TODO: LDS
       if (SIInstrInfo::isSMRD(MI))
         return HARDCLAUSE_SMEM;
@@ -129,7 +182,7 @@ public:
   bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
     if (CI.First == CI.Last)
       return false;
-    assert(CI.Length <= 64 && "Hard clause is too long!");
+    assert(CI.Length <= MaxInstructionsInClause && "Hard clause is too long!");
 
     auto &MBB = *CI.First->getParent();
     auto ClauseMI =
@@ -170,7 +223,7 @@ public:
         }
       }
 
-      if (CI.Length == 64 ||
+      if (CI.Length == MaxInstructionsInClause ||
           (CI.Length && Type != HARDCLAUSE_INTERNAL &&
           Type != HARDCLAUSE_IGNORE &&
           (Type != CI.Type ||
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index f8a10bc8ef6f..349bcbf82195 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/DebugCounter.h"
@@ -87,29 +88,29 @@ struct RegisterEncoding {
 };
 
 enum WaitEventType {
-  VMEM_ACCESS,      // vector-memory read & write
-  VMEM_READ_ACCESS, // vector-memory read
-  VMEM_WRITE_ACCESS,// vector-memory write
-  LDS_ACCESS,       // lds read & write
-  GDS_ACCESS,       // gds read & write
-  SQ_MESSAGE,       // send message
-  SMEM_ACCESS,      // scalar-memory read & write
-  EXP_GPR_LOCK,     // export holding on its data src
-  GDS_GPR_LOCK,     // GDS holding on its data and addr src
-  EXP_POS_ACCESS,   // write to export position
-  EXP_PARAM_ACCESS, // write to export parameter
-  VMW_GPR_LOCK,     // vector-memory write holding on its data src
+  VMEM_ACCESS,       // vector-memory read & write
+  VMEM_READ_ACCESS,  // vector-memory read
+  VMEM_WRITE_ACCESS, // vector-memory write
+  LDS_ACCESS,        // lds read & write
+  GDS_ACCESS,        // gds read & write
+  SQ_MESSAGE,        // send message
+  SMEM_ACCESS,       // scalar-memory read & write
+  EXP_GPR_LOCK,      // export holding on its data src
+  GDS_GPR_LOCK,      // GDS holding on its data and addr src
+  EXP_POS_ACCESS,    // write to export position
+  EXP_PARAM_ACCESS,  // write to export parameter
+  VMW_GPR_LOCK,      // vector-memory write holding on its data src
+  EXP_LDS_ACCESS,    // read by ldsdir counting as export
   NUM_WAIT_EVENTS,
 };
 
 static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
-  (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
-  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
-      (1 << SQ_MESSAGE),
-  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
-      (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
-  (1 << VMEM_WRITE_ACCESS)
-};
+    (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
+    (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
+        (1 << SQ_MESSAGE),
+    (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+        (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
+    (1 << VMEM_WRITE_ACCESS)};
 
 // The mapping is:
 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
 //  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
 //  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
 // We reserve a fixed number of VGPR slots in the scoring tables for
 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
 enum RegisterMapping {
   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
-  AGPR_OFFSET = 226,      // Maximum programmable ArchVGPRs across all targets.
+  AGPR_OFFSET = 256,      // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
   NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
-  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
+  EXTRA_VGPR_LDS = 0,     // An artificial register to track LDS writes.
   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
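  // Worked example of the slot numbering implied by the constants above
  // (illustrative): VGPR v7 scores in slot 7, AGPR a7 in slot
  // AGPR_OFFSET + 7 = 263, the LDS token in slot SQ_MAX_PGM_VGPRS +
  // EXTRA_VGPR_LDS = 512, and SGPR s7 in slot NUM_ALL_VGPRS + 7 = 520.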
}; @@ -355,6 +356,8 @@ private: DenseSet<MachineInstr *> TrackedWaitcntSet; DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; + DenseMap<MachineBasicBlock *, bool> PreheadersToFlush; + MachineLoopInfo *MLI; MachinePostDominatorTree *PDT; struct BlockInfo { @@ -381,6 +384,9 @@ public: (void)ForceVMCounter; } + bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets); + bool isPreheaderToFlush(MachineBasicBlock &MBB, + WaitcntBrackets &ScoreBrackets); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -389,6 +395,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired<MachineLoopInfo>(); AU.addRequired<MachinePostDominatorTree>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -431,14 +438,23 @@ public: bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, - MachineInstr *OldWaitcntInstr); + MachineInstr *OldWaitcntInstr, + bool FlushVmCnt); + bool generateWaitcntBlockEnd(MachineBasicBlock &Block, + WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr); + bool generateWaitcnt(AMDGPU::Waitcnt Wait, + MachineBasicBlock::instr_iterator It, + MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr); void updateEventWaitcntAfter(MachineInstr &Inst, WaitcntBrackets *ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets); bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, - AMDGPU::Waitcnt &Wait, const MachineInstr *MI); + AMDGPU::Waitcnt &Wait, + MachineBasicBlock::instr_iterator It); }; } // end anonymous namespace @@ -496,6 +512,14 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI, } } +// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS written +// can be accessed. A load from LDS to VMEM does not need a wait. +static bool mayWriteLDSThroughDMA(const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI) && + (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) && + MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD; +} + void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, @@ -588,6 +612,12 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data), CurrScore); } + } else if (TII->isLDSDIR(Inst)) { + // LDSDIR instructions attach the score to the destination. 
+ setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst), + CurrScore); } else { if (TII->isEXP(Inst)) { // For export the destination registers are really temps that @@ -644,7 +674,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, setRegScore(RegNo, T, CurrScore); } } - if (TII->isDS(Inst) && Inst.mayStore()) { + if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) { setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore); } } @@ -784,6 +814,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, false) @@ -796,53 +827,53 @@ FunctionPass *llvm::createSIInsertWaitcntsPass() { return new SIInsertWaitcnts(); } -/// Combine consecutive waitcnt instructions that precede \p MI and follow +/// Combine consecutive waitcnt instructions that precede \p It and follow /// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added /// by previous passes. Currently this pass conservatively assumes that these /// preexisting waitcnt are required for correctness. -bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, - MachineInstr &OldWaitcntInstr, - AMDGPU::Waitcnt &Wait, - const MachineInstr *MI) { +bool SIInsertWaitcnts::applyPreexistingWaitcnt( + WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr, + AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) { bool Modified = false; MachineInstr *WaitcntInstr = nullptr; MachineInstr *WaitcntVsCntInstr = nullptr; - for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II); - &*II != MI; II = NextI, ++NextI) { - if (II->isMetaInstruction()) + + for (auto &II : + make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) { + if (II.isMetaInstruction()) continue; - if (II->getOpcode() == AMDGPU::S_WAITCNT) { + if (II.getOpcode() == AMDGPU::S_WAITCNT) { // Conservatively update required wait if this waitcnt was added in an // earlier pass. In this case it will not exist in the tracked waitcnt // set. - if (!TrackedWaitcntSet.count(&*II)) { - unsigned IEnc = II->getOperand(0).getImm(); + if (!TrackedWaitcntSet.count(&II)) { + unsigned IEnc = II.getOperand(0).getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); Wait = Wait.combined(OldWait); } // Merge consecutive waitcnt of the same type by erasing multiples. 
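      // Illustrative effect: a preexisting back-to-back pair such as
      //   s_waitcnt vmcnt(1)
      //   s_waitcnt vmcnt(0) lgkmcnt(0)
      // collapses into the single strictest wait (vmcnt(0) lgkmcnt(0),
      // the per-counter minimum), and the redundant instruction is erased.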
if (!WaitcntInstr) { - WaitcntInstr = &*II; + WaitcntInstr = &II; } else { - II->eraseFromParent(); + II.eraseFromParent(); Modified = true; } } else { - assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT); - assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL); - if (!TrackedWaitcntSet.count(&*II)) { + assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT); + assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); + if (!TrackedWaitcntSet.count(&II)) { unsigned OldVSCnt = - TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm(); + TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt); } if (!WaitcntVsCntInstr) { - WaitcntVsCntInstr = &*II; + WaitcntVsCntInstr = &II; } else { - II->eraseFromParent(); + II.eraseFromParent(); Modified = true; } } @@ -862,9 +893,14 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, Wait.LgkmCnt = ~0u; Wait.ExpCnt = ~0u; - LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << *MI << "New Instr: " << *WaitcntInstr - << '\n'); + LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + ? dbgs() << "applyPreexistingWaitcnt\n" + << "New Instr at block end: " << *WaitcntInstr + << '\n' + : dbgs() << "applyPreexistingWaitcnt\n" + << "Old Instr: " << *It + << "New Instr: " << *WaitcntInstr << '\n'); + } else { WaitcntInstr->eraseFromParent(); Modified = true; @@ -885,9 +921,13 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, ScoreBrackets.applyWaitcnt(Wait); Wait.VsCnt = ~0u; - LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << *MI - << "New Instr: " << *WaitcntVsCntInstr << '\n'); + LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end() + ? dbgs() << "applyPreexistingWaitcnt\n" + << "New Instr at block end: " + << *WaitcntVsCntInstr << '\n' + : dbgs() << "applyPreexistingWaitcnt\n" + << "Old Instr: " << *It + << "New Instr: " << *WaitcntVsCntInstr << '\n'); } else { WaitcntVsCntInstr->eraseFromParent(); Modified = true; @@ -928,16 +968,18 @@ static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { /// and if so what the value of each counter is. /// The "score bracket" is bound by the lower bound and upper bound /// scores (*_score_LB and *_score_ub respectively). -bool SIInsertWaitcnts::generateWaitcntInstBefore( - MachineInstr &MI, WaitcntBrackets &ScoreBrackets, - MachineInstr *OldWaitcntInstr) { +/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to +/// flush the vmcnt counter here. +bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, + WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr, + bool FlushVmCnt) { setForceEmitWaitcnt(); if (MI.isMetaInstruction()) return false; AMDGPU::Waitcnt Wait; - bool Modified = false; // FIXME: This should have already been handled by the memory legalizer. // Removing this currently doesn't affect any lit tests, but we need to @@ -955,16 +997,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + MI.getOpcode() == AMDGPU::SI_RETURN || MI.getOpcode() == AMDGPU::S_SETPC_B64_return || - MI.getOpcode() == AMDGPU::S_SETPC_B64_return_gfx || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); } // Resolve vm waits before gs-done. 
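  // Illustrative example: a pre-GFX11 geometry shader that finishes with
  //   s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
  // must not signal completion while vector-memory writes are still in
  // flight, so vmcnt is forced to zero before the message.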
else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && - ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) == - AMDGPU::SendMsg::ID_GS_DONE)) { + ST->hasLegacyGeometry() && + ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == + AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { Wait.VmCnt = 0; } #if 0 // TODO: the following blocks of logic when we have fence. @@ -1040,7 +1083,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { // The function is going to insert a wait on everything in its prolog. // This still needs to be careful if the call target is a load (e.g. a GOT - // load). We also need to check WAW depenancy with saved PC. + // load). We also need to check WAW dependency with saved PC. Wait = AMDGPU::Waitcnt(); int CallAddrOpIdx = @@ -1089,7 +1132,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( SLoadAddresses.erase(Ptr); } unsigned AS = Memop->getAddrSpace(); - if (AS != AMDGPUAS::LOCAL_ADDRESS) + if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS) + continue; + // No need to wait before load from VMEM to LDS. + if (mayWriteLDSThroughDMA(MI)) continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; // VM_CNT is only relevant to vgpr or LDS. @@ -1123,7 +1169,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); ScoreBrackets.clearVgprVmemTypes(RegNo); } - if (Op.isDef()) { + if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { ScoreBrackets.determineWait( EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } @@ -1170,47 +1216,93 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( if (ForceEmitWaitcnt[VS_CNT]) Wait.VsCnt = 0; - if (OldWaitcntInstr) { + if (FlushVmCnt) { + unsigned UB = ScoreBrackets.getScoreUB(VM_CNT); + unsigned LB = ScoreBrackets.getScoreLB(VM_CNT); + if (UB - LB != 0) + Wait.VmCnt = 0; + } + + return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets, + OldWaitcntInstr); +} + +// Add a waitcnt to flush the vmcnt counter at the end of the given block if +// needed. +bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block, + WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr) { + AMDGPU::Waitcnt Wait; + + unsigned UB = ScoreBrackets.getScoreUB(VM_CNT); + unsigned LB = ScoreBrackets.getScoreLB(VM_CNT); + if (UB - LB == 0) + return false; + + Wait.VmCnt = 0; + + return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets, + OldWaitcntInstr); +} + +bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, + MachineBasicBlock::instr_iterator It, + MachineBasicBlock &Block, + WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr) { + bool Modified = false; + const DebugLoc &DL = Block.findDebugLoc(It); + + if (OldWaitcntInstr) // Try to merge the required wait with preexisting waitcnt instructions. // Also erase redundant waitcnt. Modified = - applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI); - } else { - // Update waitcnt brackets after determining the required wait. + applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); + else ScoreBrackets.applyWaitcnt(Wait); + + // ExpCnt can be merged into VINTERP. 
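  // Illustrative example (GFX11 assembly syntax assumed): instead of emitting
  //   s_waitcnt expcnt(0)
  //   v_interp_p2_f32 v0, v1, v2, v3 wait_exp:7
  // the required count is folded into the instruction's waitexp operand:
  //   v_interp_p2_f32 v0, v1, v2, v3 wait_exp:0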
+ if (Wait.ExpCnt != ~0u && It != Block.instr_end() && + SIInstrInfo::isVINTERP(*It)) { + MachineOperand *WaitExp = + TII->getNamedOperand(*It, AMDGPU::OpName::waitexp); + if (Wait.ExpCnt < WaitExp->getImm()) { + WaitExp->setImm(Wait.ExpCnt); + Modified = true; + } + Wait.ExpCnt = ~0u; + + LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" + << "Update Instr: " << *It); } // Build new waitcnt instructions unless no wait is needed or the old waitcnt // instruction was modified to handle the required wait. if (Wait.hasWaitExceptVsCnt()) { unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(), - MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(Enc); + auto SWaitInst = + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); TrackedWaitcntSet.insert(SWaitInst); Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI - << "New Instr: " << *SWaitInst << '\n'); + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); } if (Wait.hasWaitVsCnt()) { assert(ST->hasVscnt()); - auto SWaitInst = - BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), - TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(Wait.VsCnt); + auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(Wait.VsCnt); TrackedWaitcntSet.insert(SWaitInst); Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI - << "New Instr: " << *SWaitInst << '\n'); + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); } - return Modified; } @@ -1338,6 +1430,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, // May need to way wait for anything. ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); } + } else if (SIInstrInfo::isLDSDIR(Inst)) { + ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst); + } else if (TII->isVINTERP(Inst)) { + int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm(); + ScoreBrackets->applyWaitcnt(EXP_CNT, Imm); } else if (SIInstrInfo::isEXP(Inst)) { unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm(); if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31) @@ -1349,6 +1446,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } else { switch (Inst.getOpcode()) { case AMDGPU::S_SENDMSG: + case AMDGPU::S_SENDMSG_RTN_B32: + case AMDGPU::S_SENDMSG_RTN_B64: case AMDGPU::S_SENDMSGHALT: ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst); break; @@ -1476,8 +1575,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, continue; } + bool FlushVmCnt = Block.getFirstTerminator() == Inst && + isPreheaderToFlush(Block, ScoreBrackets); + // Generate an s_waitcnt instruction to be placed before Inst, if needed. - Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr); + Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr, + FlushVmCnt); OldWaitcntInstr = nullptr; // Restore vccz if it's not known to be correct already. 
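The next hunk adds the vmcnt-flushing machinery for loop preheaders. The
"score bracket" test it builds on can be sketched as follows (an illustrative
model with invented names, not the pass's actual API):

    // A counter has pending events iff its upper bound has moved past its
    // lower bound; "flushing" requests a wait until the count reaches zero.
    struct CounterBracket {
      unsigned LB = 0; // events known to have completed
      unsigned UB = 0; // events issued so far
      bool hasPending() const { return UB != LB; }
    };

    // Mirrors the UB - LB != 0 check performed before setting Wait.VmCnt = 0
    // in generateWaitcntInstBefore and generateWaitcntBlockEnd.
    static bool needsVmCntFlush(const CounterBracket &VmCnt) {
      return VmCnt.hasPending();
    }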
@@ -1562,9 +1665,101 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ++Iter; } + if (Block.getFirstTerminator() == Block.end() && + isPreheaderToFlush(Block, ScoreBrackets)) + Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr); + return Modified; } +// Return true if the given machine basic block is a preheader of a loop in +// which we want to flush the vmcnt counter, and false otherwise. +bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB, + WaitcntBrackets &ScoreBrackets) { + if (PreheadersToFlush.count(&MBB)) + return PreheadersToFlush[&MBB]; + + auto UpdateCache = [&](bool val) { + PreheadersToFlush[&MBB] = val; + return val; + }; + + MachineBasicBlock *Succ = MBB.getSingleSuccessor(); + if (!Succ) + return UpdateCache(false); + + MachineLoop *Loop = MLI->getLoopFor(Succ); + if (!Loop) + return UpdateCache(false); + + if (Loop->getLoopPreheader() == &MBB && shouldFlushVmCnt(Loop, ScoreBrackets)) + return UpdateCache(true); + + return UpdateCache(false); +} + +// Return true if it is better to flush the vmcnt counter in the preheader of +// the given loop. We currently decide to flush in two situations: +// 1. The loop contains vmem store(s), no vmem load and at least one use of a +// vgpr containing a value that is loaded outside of the loop. (Only on +// targets with no vscnt counter). +// 2. The loop contains vmem load(s), but the loaded values are not used in the +// loop, and at least one use of a vgpr containing a value that is loaded +// outside of the loop. +bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, + WaitcntBrackets &Brackets) { + bool HasVMemLoad = false; + bool HasVMemStore = false; + bool UsesVgprLoadedOutside = false; + DenseSet<Register> VgprUse; + DenseSet<Register> VgprDef; + + for (MachineBasicBlock *MBB : ML->blocks()) { + for (MachineInstr &MI : *MBB) { + if (SIInstrInfo::isVMEM(MI)) { + if (MI.mayLoad()) + HasVMemLoad = true; + if (MI.mayStore()) + HasVMemStore = true; + } + for (unsigned I = 0; I < MI.getNumOperands(); I++) { + MachineOperand &Op = MI.getOperand(I); + if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg())) + continue; + RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I); + // Vgpr use + if (Op.isUse()) { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + // If we find a register that is loaded inside the loop, 1. and 2. + // are invalidated and we can exit. + if (VgprDef.contains(RegNo)) + return false; + VgprUse.insert(RegNo); + // If at least one of Op's registers is in the score brackets, the + // value is likely loaded outside of the loop. + if (Brackets.getRegScore(RegNo, VM_CNT) > 0) { + UsesVgprLoadedOutside = true; + break; + } + } + } + // VMem load vgpr def + else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef()) + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + // If we find a register that is loaded inside the loop, 1. and 2. + // are invalidated and we can exit. 
+ if (VgprUse.contains(RegNo)) + return false; + VgprDef.insert(RegNo); + } + } + } + } + if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside) + return true; + return HasVMemLoad && UsesVgprLoadedOutside; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget<GCNSubtarget>(); TII = ST->getInstrInfo(); @@ -1572,6 +1767,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); IV = AMDGPU::getIsaVersion(ST->getCPU()); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + MLI = &getAnalysis<MachineLoopInfo>(); PDT = &getAnalysis<MachinePostDominatorTree>(); ForceEmitZeroWaitcnts = ForceEmitZeroFlag; diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index e39f52875f1f..b398e108bf62 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -48,6 +48,12 @@ class InstSI <dag outs, dag ins, string asm = "", field bit VGPRSpill = 0; field bit SGPRSpill = 0; + // LDSDIR instruction format. + field bit LDSDIR = 0; + + // VINTERP instruction format. + field bit VINTERP = 0; + // High bits - other information. field bit VM_CNT = 0; field bit EXP_CNT = 0; @@ -141,6 +147,9 @@ class InstSI <dag outs, dag ins, string asm = "", // Atomic with return. field bit IsAtomicRet = 0; + // This bit indicates that this is one of WMMA instructions. + field bit IsWMMA = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -173,6 +182,9 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{24} = VGPRSpill; let TSFlags{25} = SGPRSpill; + let TSFlags{26} = LDSDIR; + let TSFlags{27} = VINTERP; + let TSFlags{32} = VM_CNT; let TSFlags{33} = EXP_CNT; let TSFlags{34} = LGKM_CNT; @@ -215,6 +227,8 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{58} = IsAtomicRet; + let TSFlags{59} = IsWMMA; + let SchedRW = [Write32Bit]; let AsmVariantName = AMDGPUAsmVariants.Default; @@ -261,6 +275,11 @@ class Enc64 { int Size = 8; } +class Enc96 { + field bits<96> Inst; + int Size = 12; +} + def CPolBit { int GLC = 0; int SLC = 1; @@ -284,7 +303,7 @@ class VINTRPe <bits<2> op> : Enc32 { let Inst{31-26} = 0x32; // encoding } -class MIMGe : Enc64 { +class MIMGe_gfxpre11 : Enc64 { bits<10> vdata; bits<4> dmask; bits<1> unorm; @@ -309,7 +328,7 @@ class MIMGe : Enc64 { let Inst{63} = d16; } -class MIMGe_gfx6789 <bits<8> op> : MIMGe { +class MIMGe_gfx6789 <bits<8> op> : MIMGe_gfxpre11 { bits<8> vaddr; bits<1> da; @@ -321,7 +340,7 @@ class MIMGe_gfx6789 <bits<8> op> : MIMGe { let Inst{39-32} = vaddr; } -class MIMGe_gfx90a <bits<8> op> : MIMGe { +class MIMGe_gfx90a <bits<8> op> : MIMGe_gfxpre11 { bits<8> vaddr; bits<1> da; @@ -333,7 +352,7 @@ class MIMGe_gfx90a <bits<8> op> : MIMGe { let Inst{39-32} = vaddr; } -class MIMGe_gfx10 <bits<8> op> : MIMGe { +class MIMGe_gfx10 <bits<8> op> : MIMGe_gfxpre11 { bits<8> vaddr0; bits<3> dim; bits<2> nsa; @@ -349,12 +368,46 @@ class MIMGe_gfx10 <bits<8> op> : MIMGe { let Inst{62} = a16; } +class MIMGe_gfx11 <bits<8> op> : Enc64 { + bits<8> vdata; + bits<4> dmask; + bits<1> unorm; + bits<5> cpol; + bits<1> r128; + bits<1> tfe; + bits<1> lwe; + bits<7> srsrc; + bits<7> ssamp; + bit d16; + bits<1> a16; + bits<8> vaddr0; + bits<3> dim; + bits<1> nsa; + + let Inst{0} = nsa; + let Inst{4-2} = dim; + let Inst{7} = unorm; + let Inst{11-8} = dmask; + let Inst{12} = cpol{CPolBit.SLC}; + let Inst{13} = cpol{CPolBit.DLC}; + 
let Inst{14} = cpol{CPolBit.GLC}; + let Inst{15} = r128; + let Inst{16} = a16; + let Inst{17} = d16; + let Inst{25-18} = op; + let Inst{31-26} = 0x3c; + let Inst{39-32} = vaddr0; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{53} = tfe; + let Inst{54} = lwe; + let Inst{62-58} = ssamp{6-2}; +} + class EXPe : Enc64 { bits<4> en; bits<6> tgt; - bits<1> compr; bits<1> done; - bits<1> vm; bits<8> src0; bits<8> src1; bits<8> src2; @@ -362,9 +415,7 @@ class EXPe : Enc64 { let Inst{3-0} = en; let Inst{9-4} = tgt; - let Inst{10} = compr; let Inst{11} = done; - let Inst{12} = vm; let Inst{31-26} = 0x3e; let Inst{39-32} = src0; let Inst{47-40} = src1; @@ -372,6 +423,22 @@ class EXPe : Enc64 { let Inst{63-56} = src3; } +// Pre-GFX11 encoding has compr and vm bits. +class EXPe_ComprVM : EXPe { + bits<1> compr; + bits<1> vm; + + let Inst{10} = compr; + let Inst{12} = vm; +} + +// GFX11+ encoding has row bit. +class EXPe_Row : EXPe { + bits<1> row; + + let Inst{13} = row; +} + let Uses = [EXEC] in { class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> : diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 0a2f9381e71f..814a7c446889 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -16,12 +16,12 @@ #include "AMDGPUInstrInfo.h" #include "GCNHazardRecognizer.h" #include "GCNSubtarget.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -130,9 +130,31 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, return false; } -static bool readsExecAsData(const MachineInstr &MI) { - if (MI.isCompare()) - return true; +// Returns true if the scalar result of a VALU instruction depends on exec. +static bool resultDependsOnExec(const MachineInstr &MI) { + // Ignore comparisons which are only used masked with exec. + // This allows some hoisting/sinking of VALU comparisons. + if (MI.isCompare()) { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + Register DstReg = MI.getOperand(0).getReg(); + if (!DstReg.isVirtual()) + return true; + for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) { + switch (Use.getOpcode()) { + case AMDGPU::S_AND_SAVEEXEC_B32: + case AMDGPU::S_AND_SAVEEXEC_B64: + break; + case AMDGPU::S_AND_B32: + case AMDGPU::S_AND_B64: + if (!Use.readsRegister(AMDGPU::EXEC)) + return true; + break; + default: + return true; + } + } + return false; + } switch (MI.getOpcode()) { default: @@ -147,7 +169,7 @@ static bool readsExecAsData(const MachineInstr &MI) { bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const { // Any implicit use of exec by VALU is not a real register read. 
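  // Illustrative example: a v_cmp whose only user is
  //   s_and_saveexec_b64 s[0:1], vcc
  // does not consume EXEC as data per resultDependsOnExec above, so it may
  // be hoisted or sunk; a compare with any other scalar user keeps the
  // implicit EXEC read as a real dependence.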
return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() && - isVALU(*MO.getParent()) && !readsExecAsData(*MO.getParent()); + isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent()); } bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, @@ -181,7 +203,7 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, if (Offset0Idx == -1 || Offset1Idx == -1) return false; - // XXX - be careful of datalesss loads + // XXX - be careful of dataless loads // getNamedOperandIdx returns the index for MachineInstrs. Since they // include the output in the operand list, but SDNodes don't, we need to // subtract the index by one. @@ -362,6 +384,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth( DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); if (DataOpIdx == -1) DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); + if (DataOpIdx == -1) // LDS DMA + return false; Width = getOpSize(LdSt, DataOpIdx); return true; } @@ -410,6 +434,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth( DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); if (DataOpIdx == -1) DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); + if (DataOpIdx == -1) // LDS DMA + return false; Width = getOpSize(LdSt, DataOpIdx); return true; } @@ -464,7 +490,7 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, return false; } - // In order to avoid regester pressure, on an average, the number of DWORDS + // In order to avoid register pressure, on an average, the number of DWORDS // loaded together by all clustered mem ops should not exceed 8. This is an // empirical value based on certain observations and performance related // experiments. @@ -517,8 +543,9 @@ static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(KillSrc)); } -/// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible -/// to directly copy, so an intermediate VGPR needs to be used. +/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not +/// possible to have a direct copy in these cases on GFX908, so an intermediate +/// VGPR copy is required. static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -527,10 +554,18 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, RegScavenger &RS, Register ImpDefSuperReg = Register(), Register ImpUseSuperReg = Register()) { - const SIRegisterInfo &RI = TII.getRegisterInfo(); + assert((TII.getSubtarget().hasMAIInsts() && + !TII.getSubtarget().hasGFX90AInsts()) && + "Expected GFX908 subtarget."); - assert(AMDGPU::SReg_32RegClass.contains(SrcReg) || - AMDGPU::AGPR_32RegClass.contains(SrcReg)); + assert((AMDGPU::SReg_32RegClass.contains(SrcReg) || + AMDGPU::AGPR_32RegClass.contains(SrcReg)) && + "Source register of the copy should be either an SGPR or an AGPR."); + + assert(AMDGPU::AGPR_32RegClass.contains(DestReg) && + "Destination register of the copy should be an AGPR."); + + const SIRegisterInfo &RI = TII.getRegisterInfo(); // First try to find defining accvgpr_write to avoid temporary registers. for (auto Def = MI, E = MBB.begin(); Def != E; ) { @@ -581,23 +616,21 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, // Registers in the sequence are allocated contiguously so we can just // use register number to pick one of three round-robin temps. 
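  // Illustration (assuming enough free VGPRs can be scavenged): copies
  // targeting a0, a1 and a2 each rotate to a different VGPR temporary,
  // while a copy to a3 wraps around to the temp used for a0, since
  // (DestReg - AGPR0) % 3 == 0 again.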
- unsigned RegNo = DestReg % 3; - Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); - if (!Tmp) - report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); - RS.setRegUsed(Tmp); + unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3; + Register Tmp = + MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy(); + assert(MBB.getParent()->getRegInfo().isReserved(Tmp) && + "VGPR used for an intermediate copy should have been reserved."); - if (!TII.getSubtarget().hasGFX90AInsts()) { - // Only loop through if there are any free registers left, otherwise - // scavenger may report a fatal error without emergency spill slot - // or spill with the slot. - while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { - Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); - if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) - break; - Tmp = Tmp2; - RS.setRegUsed(Tmp); - } + // Only loop through if there are any free registers left, otherwise + // scavenger may report a fatal error without emergency spill slot + // or spill with the slot. + while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { + Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) + break; + Tmp = Tmp2; + RS.setRegUsed(Tmp); } // Insert copy to temporary VGPR. @@ -796,7 +829,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } if (RC == &AMDGPU::AGPR_32RegClass) { - if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) { + if (AMDGPU::VGPR_32RegClass.contains(SrcReg) || + (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) { BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; @@ -884,6 +918,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg); if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { + if (ST.hasMovB64()) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } if (ST.hasPackedFP32Ops()) { BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) .addImm(SISrcMods::OP_SEL_1) @@ -906,7 +945,9 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); return; } - expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward); + const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); + expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC, + Forward); return; } @@ -915,7 +956,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (RI.isAGPRClass(RC)) { if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC)) Opcode = AMDGPU::V_ACCVGPR_MOV_B32; - else if (RI.hasVGPRs(SrcRC)) + else if (RI.hasVGPRs(SrcRC) || + (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC))) Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64; else Opcode = AMDGPU::INSTRUCTION_LIST_END; @@ -925,7 +967,10 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, (RI.isProperlyAlignedRC(*RC) && (SrcRC == RC || RI.isSGPRClass(SrcRC)))) { // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. 
- if (ST.hasPackedFP32Ops()) { + if (ST.hasMovB64()) { + Opcode = AMDGPU::V_MOV_B64_e32; + EltSize = 8; + } else if (ST.hasPackedFP32Ops()) { Opcode = AMDGPU::V_PK_MOV_B32; EltSize = 8; } @@ -1725,13 +1770,8 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { case AMDGPU::S_NOP: return MI.getOperand(0).getImm() + 1; - - // FIXME: Any other pseudo instruction? // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The // hazard, even if one exist, won't really be visible. Should we handle it? - case AMDGPU::SI_MASKED_UNREACHABLE: - case AMDGPU::WAVE_BARRIER: - return 0; } } @@ -1807,6 +1847,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { const MachineOperand &SrcOp = MI.getOperand(1); // FIXME: Will this work for 64-bit floating point immediates? assert(!SrcOp.isFPImm()); + if (ST.hasMovB64()) { + MI.setDesc(get(AMDGPU::V_MOV_B64_e32)); + if (!isLiteralConstant(MI, 1) || isUInt<32>(SrcOp.getImm())) + break; + } if (SrcOp.isImm()) { APInt Imm(64, SrcOp.getImm()); APInt Lo(32, Imm.getLoBits(32).getZExtValue()); @@ -1887,6 +1932,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_SET_INACTIVE_B32: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + // FIXME: We may possibly optimize the COPY once we find ways to make LLVM + // optimizations (mainly Register Coalescer) aware of WWM register liveness. + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) + .add(MI.getOperand(1)); auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) @@ -1899,11 +1948,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_SET_INACTIVE_B64: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); - FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), MI.getOperand(0).getReg()) - .add(MI.getOperand(2)); + .add(MI.getOperand(1)); + expandPostRAPseudo(*Copy); + auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); + FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten + Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), + MI.getOperand(0).getReg()) + .add(MI.getOperand(2)); expandPostRAPseudo(*Copy); BuildMI(MBB, MI, DL, get(NotOpc), Exec) .addReg(Exec); @@ -2085,6 +2138,23 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } + case AMDGPU::SI_RETURN: { + const MachineFunction *MF = MBB.getParent(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + // Hiding the return address use with SI_RETURN may lead to extra kills in + // the function and missing live-ins. We are fine in practice because callee + // saved register handling ensures the register value is restored before + // RET, but we need the undef flag here to appease the MachineVerifier + // liveness checks. 
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
+ .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
+
+ MIB.copyImplicitOps(MI);
+ MI.eraseFromParent();
+ break;
+ }
}
return true;
}
@@ -2093,6 +2163,13 @@ std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
+ if (ST.hasMovB64() &&
+ AMDGPU::isLegal64BitDPPControl(
+ getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
+ MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
+ return std::make_pair(&MI, nullptr);
+ }
+
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
MachineFunction *MF = MBB.getParent();
@@ -2789,6 +2866,8 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::V_MOV_B64_e32:
+ case AMDGPU::V_MOV_B64_e64:
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B64:
case AMDGPU::COPY:
@@ -2801,35 +2880,15 @@
}
}
-unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
- unsigned Kind) const {
- switch(Kind) {
- case PseudoSourceValue::Stack:
- case PseudoSourceValue::FixedStack:
- return AMDGPUAS::PRIVATE_ADDRESS;
- case PseudoSourceValue::ConstantPool:
- case PseudoSourceValue::GOT:
- case PseudoSourceValue::JumpTable:
- case PseudoSourceValue::GlobalValueCallEntry:
- case PseudoSourceValue::ExternalSymbolCallEntry:
- case PseudoSourceValue::TargetCustom:
- return AMDGPUAS::CONSTANT_ADDRESS;
- }
- return AMDGPUAS::FLAT_ADDRESS;
-}
+static constexpr unsigned ModifierOpNames[] = {
+ AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
+ AMDGPU::OpName::omod};
-static void removeModOperands(MachineInstr &MI) {
+void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
- int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::src0_modifiers);
- int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::src1_modifiers);
- int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::src2_modifiers);
-
- MI.RemoveOperand(Src2ModIdx);
- MI.RemoveOperand(Src1ModIdx);
- MI.RemoveOperand(Src0ModIdx);
+ for (unsigned Name : reverse(ModifierOpNames))
+ MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, Name));
}
bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
@@ -2841,7 +2900,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
default:
return false;
case AMDGPU::S_MOV_B64:
- // TODO: We could fold 64-bit immediates, but this get compilicated
+ // TODO: We could fold 64-bit immediates, but this gets complicated
// when there are sub-registers.
return false;
@@ -2921,7 +2980,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
// Multiplied part is the constant: Use v_madmk_{f16, f32}.
- // We should only expect these to be on src0 due to canonicalizations.
+ // We should only expect these to be on src0 due to canonicalization.
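For reference alongside the canonicalization note above: the two folded forms differ only in where the literal K ends up (a standalone sketch of the semantics; plain floats stand in for the f16/f32 operands):

    #include <cassert>
    int main() {
      float s0 = 2.0f, s1 = 3.0f, K = 4.0f;
      float madmk = s0 * K + s1; // v_madmk: the multiplied part is the constant
      float madak = s0 * s1 + K; // v_madak: the added part is the constant
      assert(madmk == 11.0f && madak == 10.0f);
      return 0;
    }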
if (Src0->isReg() && Src0->getReg() == Reg) { if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) return false; @@ -2942,12 +3001,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. - // Remove these first since they are at the end. - UseMI.RemoveOperand( - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); - UseMI.RemoveOperand( - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); - Register Src1Reg = Src1->getReg(); unsigned Src1SubReg = Src1->getSubReg(); Src0->setReg(Src1Reg); @@ -2966,7 +3019,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, removeModOperands(UseMI); UseMI.setDesc(get(NewOpc)); - bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + bool DeleteDef = MRI->use_nodbg_empty(Reg); if (DeleteDef) DefMI.eraseFromParent(); @@ -3025,12 +3078,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. - // Remove these first since they are at the end. - UseMI.RemoveOperand( - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); - UseMI.RemoveOperand( - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); - if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || @@ -3049,7 +3096,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // constant and SGPR are illegal. legalizeOperands(UseMI); - bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + bool DeleteDef = MRI->use_nodbg_empty(Reg); if (DeleteDef) DefMI.eraseFromParent(); @@ -3192,34 +3239,68 @@ static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const { + MachineBasicBlock &MBB = *MI.getParent(); unsigned Opc = MI.getOpcode(); - bool IsF16 = false; + + // Handle MFMA. + int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); + if (NewMFMAOpc != -1) { + MachineInstrBuilder MIB = + BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); + updateLiveVariables(LV, MI, *MIB); + if (LIS) + LIS->ReplaceMachineInstrInMaps(MI, *MIB); + return MIB; + } + + if (SIInstrInfo::isWMMA(MI)) { + unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode()); + MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) + .setMIFlags(MI.getFlags()); + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + MIB->addOperand(MI.getOperand(I)); + + updateLiveVariables(LV, MI, *MIB); + if (LIS) + LIS->ReplaceMachineInstrInMaps(MI, *MIB); + + return MIB; + } + + // Handle MAC/FMAC. 
+ bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || + Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; - int NewMFMAOpc = -1; + bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 || + Opc == AMDGPU::V_MAC_LEGACY_F32_e64 || + Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || + Opc == AMDGPU::V_FMAC_LEGACY_F32_e64; + bool Src0Literal = false; switch (Opc) { default: - NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); - if (NewMFMAOpc == -1) - return nullptr; - break; + return nullptr; case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_FMAC_F16_e64: - IsF16 = true; - LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAC_LEGACY_F32_e64: case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMAC_LEGACY_F32_e64: case AMDGPU::V_FMAC_F64_e64: break; case AMDGPU::V_MAC_F16_e32: case AMDGPU::V_FMAC_F16_e32: - IsF16 = true; - LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e32: + case AMDGPU::V_MAC_LEGACY_F32_e32: case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_FMAC_LEGACY_F32_e32: case AMDGPU::V_FMAC_F64_e32: { int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); @@ -3228,25 +3309,13 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return nullptr; if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) - return nullptr; + Src0Literal = true; break; } } MachineInstrBuilder MIB; - MachineBasicBlock &MBB = *MI.getParent(); - - if (NewMFMAOpc != -1) { - MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) - MIB.add(MI.getOperand(I)); - updateLiveVariables(LV, MI, *MIB); - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *MIB); - return MIB; - } - const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); const MachineOperand *Src0Mods = @@ -3255,10 +3324,13 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, const MachineOperand *Src1Mods = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); + const MachineOperand *Src2Mods = + getNamedOperand(MI, AMDGPU::OpName::src2_modifiers); const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); - if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 && + if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 && + !IsLegacy && // If we have an SGPR input, we will violate the constant bus restriction. (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { @@ -3271,11 +3343,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, // We cannot just remove the DefMI here, calling pass will crash. DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) - DefMI->RemoveOperand(I); + DefMI->removeOperand(I); }; int64_t Imm; - if (getFoldableImm(Src2, Imm, &DefMI)) { + if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { unsigned NewOpc = IsFMA ? (IsF16 ? 
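A condensed sketch of the new SCHED_BARRIER check above (assumed free-standing shape; the real test sits inside the isSchedulingBoundary override, and the opcode constant comes from the generated AMDGPU enum, so it is passed in here rather than included):

    #include "llvm/CodeGen/MachineInstr.h" // assumes an LLVM source tree
    // A mask of 0 is the most conservative request: nothing may be
    // scheduled across the barrier, so it becomes a hard boundary.
    static bool isMaskZeroSchedBarrier(const llvm::MachineInstr &MI,
                                       unsigned SchedBarrierOpc) {
      return MI.getOpcode() == SchedBarrierOpc &&
             MI.getOperand(0).getImm() == 0;
    }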
AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); @@ -3295,7 +3367,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); - if (getFoldableImm(Src1, Imm, &DefMI)) { + if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) { if (pseudoToMCOpcode(NewOpc) != -1) { MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) @@ -3309,7 +3381,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return MIB; } } - if (getFoldableImm(Src0, Imm, &DefMI)) { + if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) { + if (Src0Literal) { + Imm = Src0->getImm(); + DefMI = nullptr; + } if (pseudoToMCOpcode(NewOpc) != -1 && isOperandLegal( MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), @@ -3322,16 +3398,27 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); - killDef(); + if (DefMI) + killDef(); return MIB; } } } - unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 - : IsF64 ? AMDGPU::V_FMA_F64_e64 - : AMDGPU::V_FMA_F32_e64) - : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64); + // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma + // because VOP3 does not allow a literal operand. + // TODO: Remove this restriction for GFX10. + if (Src0Literal) + return nullptr; + + unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 + : IsF64 ? AMDGPU::V_FMA_F64_e64 + : IsLegacy + ? AMDGPU::V_FMA_LEGACY_F32_e64 + : AMDGPU::V_FMA_F32_e64 + : IsF16 ? AMDGPU::V_MAD_F16_e64 + : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64 + : AMDGPU::V_MAD_F32_e64; if (pseudoToMCOpcode(NewOpc) == -1) return nullptr; @@ -3341,7 +3428,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, .add(*Src0) .addImm(Src1Mods ? Src1Mods->getImm() : 0) .add(*Src1) - .addImm(0) // Src mods + .addImm(Src2Mods ? Src2Mods->getImm() : 0) .add(*Src2) .addImm(Clamp ? Clamp->getImm() : 0) .addImm(Omod ? Omod->getImm() : 0); @@ -3383,6 +3470,9 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) return true; + if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0) + return true; + // Target-independent instructions do not have an implicit-use of EXEC, even // when they operate on VGPRs. Treating EXEC modifications as scheduling // boundaries prevents incorrect movements of such instructions. 
@@ -3676,11 +3766,8 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, } bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { - return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || - hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || - hasModifiersSet(MI, AMDGPU::OpName::clamp) || - hasModifiersSet(MI, AMDGPU::OpName::omod); + return any_of(ModifierOpNames, + [&](unsigned Name) { return hasModifiersSet(MI, Name); }); } bool SIInstrInfo::canShrink(const MachineInstr &MI, @@ -3754,18 +3841,19 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI, MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, unsigned Op32) const { - MachineBasicBlock *MBB = MI.getParent();; + MachineBasicBlock *MBB = MI.getParent(); MachineInstrBuilder Inst32 = BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) .setMIFlags(MI.getFlags()); // Add the dst operand if the 32-bit encoding also has an explicit $vdst. // For VOPC instructions, this is replaced by an implicit def of vcc. - int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); - if (Op32DstIdx != -1) { + if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst) != -1) { // dst Inst32.add(MI.getOperand(0)); - } else { + } else if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::sdst) != -1) { + // VOPCX instructions won't be writing to an explicit dst, so this should + // not fail for these instructions. assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && "Unexpected case"); @@ -3816,7 +3904,7 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); // Null is free - if (MO.getReg() == AMDGPU::SGPR_NULL) + if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64) return false; // SGPRs use the constant bus @@ -3951,6 +4039,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, case AMDGPU::OPERAND_REG_IMM_INT32: case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED: + case AMDGPU::OPERAND_REG_IMM_V2FP32: break; case AMDGPU::OPERAND_REG_INLINE_C_INT32: case AMDGPU::OPERAND_REG_INLINE_C_FP32: @@ -4031,9 +4120,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); - const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; - - for (int OpIdx: OpIndicies) { + for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) { if (OpIdx == -1) continue; const MachineOperand &MO = MI.getOperand(OpIdx); @@ -4150,24 +4237,25 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } // Verify VOP*. Ignore multiple sgpr operands on writelane. - if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 - && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { - // Only look at the true operands. Only a real operand can use the constant - // bus, and we don't want to check pseudo-operands like the source modifier - // flags. 
- const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; - + if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) { unsigned ConstantBusCount = 0; bool UsesLiteral = false; const MachineOperand *LiteralVal = nullptr; - if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) + int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm); + if (ImmIdx != -1) { ++ConstantBusCount; + UsesLiteral = true; + LiteralVal = &MI.getOperand(ImmIdx); + } SmallVector<Register, 2> SGPRsUsed; Register SGPRUsed; - for (int OpIdx : OpIndices) { + // Only look at the true operands. Only a real operand can use the constant + // bus, and we don't want to check pseudo-operands like the source modifier + // flags. + for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { if (OpIdx == -1) break; const MachineOperand &MO = MI.getOperand(OpIdx); @@ -4186,8 +4274,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, UsesLiteral = true; LiteralVal = &MO; } else if (!MO.isIdenticalTo(*LiteralVal)) { - assert(isVOP3(MI)); - ErrInfo = "VOP3 instruction uses more than one literal"; + assert(isVOP2(MI) || isVOP3(MI)); + ErrInfo = "VOP2/VOP3 instruction uses more than one literal"; return false; } } @@ -4196,7 +4284,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, SGPRUsed = findImplicitSGPRRead(MI); if (SGPRUsed != AMDGPU::NoRegister) { - // Implicit uses may safely overlap true overands + // Implicit uses may safely overlap true operands if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { return !RI.regsOverlap(SGPRUsed, SGPR); })) { @@ -4225,7 +4313,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, unsigned SGPRCount = 0; Register SGPRUsed = AMDGPU::NoRegister; - for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { + for (int OpIdx : {Src0Idx, Src1Idx}) { if (OpIdx == -1) break; @@ -4272,16 +4360,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (isSOP2(MI) || isSOPC(MI)) { const MachineOperand &Src0 = MI.getOperand(Src0Idx); const MachineOperand &Src1 = MI.getOperand(Src1Idx); - unsigned Immediates = 0; - if (!Src0.isReg() && - !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) - Immediates++; - if (!Src1.isReg() && - !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) - Immediates++; - - if (Immediates > 1) { + if (!Src0.isReg() && !Src1.isReg() && + !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType) && + !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType) && + !Src0.isIdenticalTo(Src1)) { ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; return false; } @@ -4364,10 +4447,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } if (isSMRD(MI)) { - if (MI.mayStore()) { + if (MI.mayStore() && + ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { // The register offset form of scalar stores may only use m0 as the // soffset register. 
- const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); + const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset); if (Soff && Soff->getReg() != AMDGPU::M0) { ErrInfo = "scalar stores must use m0 as offset register"; return false; @@ -4477,7 +4561,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); - int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && ((DstIdx >= 0 && @@ -4527,24 +4610,45 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } - if (ST.needsAlignedVGPRs() && - (MI.getOpcode() == AMDGPU::DS_GWS_INIT || - MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || - MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) { - const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0); - Register Reg = Op->getReg(); - bool Aligned = true; - if (Reg.isPhysical()) { - Aligned = !(RI.getHWRegIndex(Reg) & 1); - } else { + if (ST.needsAlignedVGPRs()) { + const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool { + const MachineOperand *Op = getNamedOperand(MI, OpName); + if (!Op) + return true; + Register Reg = Op->getReg(); + if (Reg.isPhysical()) + return !(RI.getHWRegIndex(Reg) & 1); const TargetRegisterClass &RC = *MRI.getRegClass(Reg); - Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) && - !(RI.getChannelFromSubReg(Op->getSubReg()) & 1); + return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) && + !(RI.getChannelFromSubReg(Op->getSubReg()) & 1); + }; + + if (MI.getOpcode() == AMDGPU::DS_GWS_INIT || + MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || + MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) { + + if (!isAlignedReg(AMDGPU::OpName::data0)) { + ErrInfo = "Subtarget requires even aligned vector registers " + "for DS_GWS instructions"; + return false; + } + } + + if (isMIMG(MI)) { + if (!isAlignedReg(AMDGPU::OpName::vaddr)) { + ErrInfo = "Subtarget requires even aligned vector registers " + "for vaddr operand of image instructions"; + return false; + } } + } - if (!Aligned) { - ErrInfo = "Subtarget requires even aligned vector registers " - "for DS_GWS instructions"; + if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + !ST.hasGFX90AInsts()) { + const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0); + if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) { + ErrInfo = "Invalid register class: " + "v_accvgpr_write with an SGPR is not supported on this GPU"; return false; } } @@ -4641,26 +4745,40 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { "Unexpected scalar opcode without corresponding vector one!"); } -static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST, - const MachineRegisterInfo &MRI, - const MCInstrDesc &TID, - unsigned RCID, - bool IsAllocatable) { +static const TargetRegisterClass * +adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, + const MachineRegisterInfo &MRI, + const MCInstrDesc &TID, unsigned RCID, + bool IsAllocatable) { if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && (((TID.mayLoad() || TID.mayStore()) && !(TID.TSFlags & SIInstrFlags::VGPRSpill)) || (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { switch (RCID) { - case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID; - case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID; - case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID; - case 
AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID; - case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID; + case AMDGPU::AV_32RegClassID: + RCID = AMDGPU::VGPR_32RegClassID; + break; + case AMDGPU::AV_64RegClassID: + RCID = AMDGPU::VReg_64RegClassID; + break; + case AMDGPU::AV_96RegClassID: + RCID = AMDGPU::VReg_96RegClassID; + break; + case AMDGPU::AV_128RegClassID: + RCID = AMDGPU::VReg_128RegClassID; + break; + case AMDGPU::AV_160RegClassID: + RCID = AMDGPU::VReg_160RegClassID; + break; + case AMDGPU::AV_512RegClassID: + RCID = AMDGPU::VReg_512RegClassID; + break; default: break; } } - return RCID; + + return RI.getProperlyAlignedRC(RI.getRegClass(RCID)); } const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, @@ -4673,7 +4791,7 @@ const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, bool IsAllocatable = false; if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) { // vdst and vdata should be both VGPR or AGPR, same for the DS instructions - // with two data operands. Request register class constainted to VGPR only + // with two data operands. Request register class constrained to VGPR only // of both operands present as Machine Copy Propagation can not check this // constraint and possibly other passes too. // @@ -4690,9 +4808,8 @@ const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, AMDGPU::OpName::data1) != -1; } } - RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass, - IsAllocatable); - return RI.getRegClass(RegClass); + return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass, + IsAllocatable); } const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, @@ -4709,8 +4826,7 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, } unsigned RCID = Desc.OpInfo[OpNo].RegClass; - RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true); - return RI.getRegClass(RCID); + return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true); } void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { @@ -4797,7 +4913,7 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm( void SIInstrInfo::swapOperands(MachineInstr &Inst) const { assert(Inst.getNumExplicitOperands() == 3); MachineOperand Op1 = Inst.getOperand(1); - Inst.RemoveOperand(1); + Inst.removeOperand(1); Inst.addOperand(Op1); } @@ -4851,9 +4967,9 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, MO = &MI.getOperand(OpIdx); int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); - int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; + int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 
1 : 0; if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { - if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--) + if (isLiteralConstantLike(*MO, OpInfo) && !LiteralLimit--) return false; SmallDenseSet<RegSubRegPair> SGPRsUsed; @@ -4872,12 +4988,10 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, return false; SGPRsUsed.insert(SGPR); } - } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { - if (--ConstantBusLimit <= 0) - return false; - } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) && - isLiteralConstantLike(Op, InstDesc.OpInfo[i])) { - if (!VOP3LiteralLimit--) + } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32 || + (AMDGPU::isSISrcOperand(InstDesc, i) && + isLiteralConstantLike(Op, InstDesc.OpInfo[i]))) { + if (!LiteralLimit--) return false; if (--ConstantBusLimit <= 0) return false; @@ -4886,7 +5000,10 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, } if (MO->isReg()) { - assert(DefinedRC); + if (!DefinedRC) { + // This operand allows any register. + return true; + } if (!isLegalRegOperand(MRI, OpInfo, *MO)) return false; bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); @@ -4916,7 +5033,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) return false; } - if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() && (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && RI.isSGPRReg(MRI, MO->getReg())) return false; @@ -5186,7 +5303,7 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); SBase->setReg(SGPR); } - MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff); + MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset); if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); SOff->setReg(SGPR); @@ -5232,16 +5349,16 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { const MCInstrDesc &NewDesc = get(NewOpc); Inst.setDesc(NewDesc); - // Callers expect interator to be valid after this call, so modify the + // Callers expect iterator to be valid after this call, so modify the // instruction in place. if (OldVAddrIdx == NewVAddrIdx) { MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx); // Clear use list from the old vaddr holding a zero register. MRI.removeRegOperandFromUseList(&NewVAddr); MRI.moveOperands(&NewVAddr, &SAddr, 1); - Inst.RemoveOperand(OldSAddrIdx); + Inst.removeOperand(OldSAddrIdx); // Update the use list with the pointer we have just moved from vaddr to - // saddr poisition. Otherwise new vaddr will be missing from the use list. + // saddr position. Otherwise new vaddr will be missing from the use list. MRI.removeRegOperandFromUseList(&NewVAddr); MRI.addRegOperandToUseList(&NewVAddr); } else { @@ -5251,14 +5368,14 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in); - // RemoveOperand doesn't try to fixup tied operand indexes at it goes, so + // removeOperand doesn't try to fixup tied operand indexes at it goes, so // it asserts. Untie the operands for now and retie them afterwards. 
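The safe ordering that comment prescribes, condensed (same MachineInstr calls as the code that follows; the index variables are the ones computed above):

    Inst.untieRegOperand(OldVDstIn);      // break the def/use tie first
    Inst.removeOperand(OldVAddrIdx);      // removal shifts later operand indices
    Inst.tieOperands(NewVDst, NewVDstIn); // retie at the post-removal indices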
if (NewVDstIn != -1) { int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); Inst.untieRegOperand(OldVDstIn); } - Inst.RemoveOperand(OldVAddrIdx); + Inst.removeOperand(OldVAddrIdx); if (NewVDstIn != -1) { int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); @@ -5340,7 +5457,8 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, static void emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, - const DebugLoc &DL, MachineOperand &Rsrc) { + MachineBasicBlock &BodyBB, const DebugLoc &DL, + MachineOperand &Rsrc) { MachineFunction &MF = *OrigBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -5398,7 +5516,7 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, else Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2)); - // Combine the comparision results with AND. + // Combine the comparison results with AND. if (CondReg == AMDGPU::NoRegister) // First. CondReg = NewCondReg; else { // If not the first, we create an AND. @@ -5433,14 +5551,14 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, .addReg(CondReg, RegState::Kill); // The original instruction is here; we insert the terminators after it. - I = LoopBB.end(); + I = BodyBB.end(); // Update EXEC, switch all done bits to 0 and all todo bits to 1. - BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec) + BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec) .addReg(Exec) .addReg(SaveExec); - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); + BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); } // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register @@ -5487,31 +5605,35 @@ loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, // To insert the loop we need to split the block. Move everything after this // point to a new block, and insert a new empty block between the two. MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); MachineFunction::iterator MBBI(MBB); ++MBBI; MF.insert(MBBI, LoopBB); + MF.insert(MBBI, BodyBB); MF.insert(MBBI, RemainderBB); - LoopBB->addSuccessor(LoopBB); - LoopBB->addSuccessor(RemainderBB); + LoopBB->addSuccessor(BodyBB); + BodyBB->addSuccessor(LoopBB); + BodyBB->addSuccessor(RemainderBB); - // Move Begin to MI to the LoopBB, and the remainder of the block to + // Move Begin to MI to the BodyBB, and the remainder of the block to // RemainderBB. RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end()); - LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end()); + BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end()); MBB.addSuccessor(LoopBB); // Update dominators. We know that MBB immediately dominates LoopBB, that - // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately - // dominates all of the successors transferred to it from MBB that MBB used - // to properly dominate. + // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates + // RemainderBB. RemainderBB immediately dominates all of the successors + // transferred to it from MBB that MBB used to properly dominate. 
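The resulting block structure, sketched for orientation (this matches the addSuccessor calls above: the compare and exec-mask bookkeeping run in LoopBB, the rewritten memory operation in BodyBB, and the back edge is taken while any lane's descriptor still differs):

    //   MBB --> LoopBB --> BodyBB --> RemainderBB
    //             ^           |
    //             +-----------+  (back edge while lanes remain)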
if (MDT) { MDT->addNewBlock(LoopBB, &MBB); - MDT->addNewBlock(RemainderBB, LoopBB); + MDT->addNewBlock(BodyBB, LoopBB); + MDT->addNewBlock(RemainderBB, BodyBB); for (auto &Succ : RemainderBB->successors()) { if (MDT->properlyDominates(&MBB, Succ)) { MDT->changeImmediateDominator(Succ, RemainderBB); @@ -5519,12 +5641,12 @@ loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, } } - emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc); + emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, Rsrc); // Restore the EXEC mask MachineBasicBlock::iterator First = RemainderBB->begin(); BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); - return LoopBB; + return BodyBB; } // Extract pointer from Rsrc and return a zero-value Rsrc replacement. @@ -5762,7 +5884,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()), RI.getRegClass(RsrcRC))) { // The operands are legal. - // FIXME: We may need to legalize operands besided srsrc. + // FIXME: We may need to legalize operands besides srsrc. return CreatedBB; } @@ -5836,7 +5958,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); - // Atomics rith return have have an additional tied operand and are + // Atomics with return have an additional tied operand and are // missing some of the special bits. MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); MachineInstr *Addr64; @@ -6050,7 +6172,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC) .addReg(EXEC) .addReg(IsSCC ? VCC : CondReg); - Inst.RemoveOperand(1); + Inst.removeOperand(1); } break; @@ -6060,6 +6182,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, case AMDGPU::S_PACK_LL_B32_B16: case AMDGPU::S_PACK_LH_B32_B16: + case AMDGPU::S_PACK_HL_B32_B16: case AMDGPU::S_PACK_HH_B32_B16: movePackToVALU(Worklist, MRI, Inst); Inst.eraseFromParent(); @@ -6217,7 +6340,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); if (Op.isUse()) addSCCDefsToVALUWorklist(Op, Worklist); - Inst.RemoveOperand(i); + Inst.removeOperand(i); } } @@ -6247,7 +6370,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - Inst.RemoveOperand(2); // Remove old immediate. + Inst.removeOperand(2); // Remove old immediate. Inst.addOperand(MachineOperand::CreateImm(Offset)); Inst.addOperand(MachineOperand::CreateImm(BitWidth)); } @@ -6281,7 +6404,7 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, // these are deleted later, but at -O0 it would leave a suspicious // looking illegal copy of an undef register. 
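Earlier in this hunk, the branch-on-SCC rewrite builds an AND of the VALU condition with EXEC before branching on VCC; bit-wise that is just "branch if any active lane satisfied the compare". A standalone check with illustrative masks:

    #include <cassert>
    #include <cstdint>
    int main() {
      uint64_t Exec = 0x00000000000000f0; // active lanes
      uint64_t Cond = 0x0000000000000110; // per-lane compare results
      bool VCCNZ = (Exec & Cond) != 0;    // the AND + branch-on-vccnz, in effect
      assert(VCCNZ); // lane 4 is active and its condition bit is set
      return 0;
    }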
for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) - Inst.RemoveOperand(I); + Inst.removeOperand(I); Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); continue; } @@ -6323,7 +6446,7 @@ SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; assert(Inst.getOperand(3).getReg() == AMDGPU::SCC); - Inst.RemoveOperand(3); + Inst.removeOperand(3); Inst.setDesc(get(NewOpc)); Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit @@ -6467,7 +6590,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can // invert either source and then perform the XOR. If either source is a // scalar register, then we can leave the inversion on the scalar unit to - // acheive a better distrubution of scalar and vector instructions. + // achieve a better distribution of scalar and vector instructions. bool Src0IsSGPR = Src0.isReg() && RI.isSGPRClass(MRI.getRegClass(Src0.getReg())); bool Src1IsSGPR = Src1.isReg() && @@ -6689,7 +6812,7 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, legalizeOperands(*LoHalf, MDT); legalizeOperands(*HiHalf, MDT); - // Move all users of this moved vlaue. + // Move all users of this moved value. addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } @@ -6753,7 +6876,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, Worklist.insert(&LoHalf); Worklist.insert(&HiHalf); - // Move all users of this moved vlaue. + // Move all users of this moved value. addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } @@ -6831,7 +6954,7 @@ void SIInstrInfo::splitScalar64BitBCNT( MRI.replaceRegWith(Dest.getReg(), ResultReg); - // We don't need to legalize operands here. src0 for etiher instruction can be + // We don't need to legalize operands here. src0 for either instruction can be // an SGPR, and the second input is unused or determined here. addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } @@ -6973,6 +7096,17 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, .add(Src1); break; } + case AMDGPU::S_PACK_HL_B32_B16: { + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) + .addImm(16) + .add(Src0); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg) + .add(Src1) + .addImm(16) + .addReg(TmpReg, RegState::Kill); + break; + } case AMDGPU::S_PACK_HH_B32_B16: { Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -7045,7 +7179,7 @@ void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op, assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse()); MachineInstr *SCCUseInst = Op.getParent(); - // Look for a preceeding instruction that either defines VCC or SCC. If VCC + // Look for a preceding instruction that either defines VCC or SCC. If VCC // then there is nothing to do because the defining instruction has been // converted to a VALU already. If SCC then that instruction needs to be // converted to a VALU. @@ -7191,7 +7325,10 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { - return (AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) | + int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11 ? 
+ AMDGPU::UfmtGFX11::UFMT_32_FLOAT : + AMDGPU::UfmtGFX10::UFMT_32_FLOAT; + return (Format << 44) | (1ULL << 56) | // RESOURCE_LEVEL = 1 (3ULL << 60); // OOB_SELECT = 3 } @@ -7332,7 +7469,9 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return DescSize; bool HasLiteral = false; for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { - if (isLiteralConstant(MI, I)) { + const MachineOperand &Op = MI.getOperand(I); + const MCOperandInfo &OpInfo = Desc.OpInfo[I]; + if (isLiteralConstantLike(Op, OpInfo)) { HasLiteral = true; break; } @@ -7513,6 +7652,16 @@ SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { return makeArrayRef(TargetFlags); } +ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> +SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const { + static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = + { + {MONoClobber, "amdgpu-noclobber"}, + }; + + return makeArrayRef(TargetFlags); +} + bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && MI.modifiesRegister(AMDGPU::EXEC, &RI); @@ -7690,6 +7839,7 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, } // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td +// and the columns of the getMCOpcodeGen table. enum SIEncodingFamily { SI = 0, VI = 1, @@ -7699,7 +7849,9 @@ enum SIEncodingFamily { GFX9 = 5, GFX10 = 6, SDWA10 = 7, - GFX90A = 8 + GFX90A = 8, + GFX940 = 9, + GFX11 = 10, }; static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { @@ -7714,6 +7866,8 @@ static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { return SIEncodingFamily::VI; case AMDGPUSubtarget::GFX10: return SIEncodingFamily::GFX10; + case AMDGPUSubtarget::GFX11: + return SIEncodingFamily::GFX11; } llvm_unreachable("Unknown subtarget generation!"); } @@ -7779,6 +7933,9 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { if (ST.hasGFX90AInsts()) { uint16_t NMCOp = (uint16_t)-1; + if (ST.hasGFX940Insts()) + NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940); + if (NMCOp == (uint16_t)-1) NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A); if (NMCOp == (uint16_t)-1) NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9); @@ -7925,7 +8082,7 @@ bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, auto &UseInst = *Use.getParent(); // Don't bother searching between blocks, although it is possible this block // doesn't modify exec. 
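A standalone check of the XOR identity quoted in lowerScalarXnor above, not(x ^ y) == (not(x) ^ y) == (x ^ not(y)), which is what lets the inversion migrate to whichever operand is cheapest:

    #include <cassert>
    #include <cstdint>
    int main() {
      for (uint32_t X : {0u, 1u, 0xdeadbeefu})
        for (uint32_t Y : {0u, 0xffffffffu, 0x12345678u}) {
          assert(~(X ^ Y) == (~X ^ Y));
          assert(~(X ^ Y) == (X ^ ~Y));
        }
      return 0;
    }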
- if (UseInst.getParent() != DefBB) + if (UseInst.getParent() != DefBB || UseInst.isPHI()) return true; if (++NumUse > MaxUseScan) @@ -8150,7 +8307,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, this](int64_t ExpectedValue, unsigned SrcSize, - bool IsReversable, bool IsSigned) -> bool { + bool IsReversible, bool IsSigned) -> bool { // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n @@ -8208,7 +8365,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, bool IsReversedCC = false; if (CmpValue != ExpectedValue) { - if (!IsReversable) + if (!IsReversible) return false; IsReversedCC = CmpValue == (ExpectedValue ^ Mask); if (!IsReversedCC) @@ -8284,3 +8441,37 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return false; } + +void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI, + unsigned OpName) const { + if (!ST.needsAlignedVGPRs()) + return; + + int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName); + if (OpNo < 0) + return; + MachineOperand &Op = MI.getOperand(OpNo); + if (getOpSize(MI, OpNo) > 4) + return; + + // Add implicit aligned super-reg to force alignment on the data operand. + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock *BB = MI.getParent(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + Register DataReg = Op.getReg(); + bool IsAGPR = RI.isAGPR(MRI, DataReg); + Register Undef = MRI.createVirtualRegister( + IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass); + BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef); + Register NewVR = + MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass + : &AMDGPU::VReg_64_Align2RegClass); + BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR) + .addReg(DataReg, 0, Op.getSubReg()) + .addImm(AMDGPU::sub0) + .addReg(Undef) + .addImm(AMDGPU::sub1); + Op.setReg(NewVR); + Op.setSubReg(AMDGPU::sub0); + MI.addOperand(MachineOperand::CreateReg(NewVR, false, true)); +} diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e551d6c7223f..311f9f68e675 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H #include "AMDGPUMIRFormatter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIRegisterInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SetVector.h" @@ -35,6 +36,11 @@ class RegScavenger; class TargetRegisterClass; class ScheduleHazardRecognizer; +/// Mark the MMO of a uniform load if there are no potentially clobbering stores +/// on any path from the start of an entry function to this load. 
+static const MachineMemOperand::Flags MONoClobber = + MachineMemOperand::MOTargetFlag1; + class SIInstrInfo final : public AMDGPUGenInstrInfo { private: const SIRegisterInfo RI; @@ -323,15 +329,14 @@ public: Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override; - unsigned getAddressSpaceForPseudoSourceKind( - unsigned Kind) const override; - bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override; static bool isFoldableCopy(const MachineInstr &MI); + void removeModOperands(MachineInstr &MI) const; + bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final; @@ -549,6 +554,14 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::EXP; } + static bool isDualSourceBlendEXP(const MachineInstr &MI) { + if (!isEXP(MI)) + return false; + unsigned Target = MI.getOperand(0).getImm(); + return Target == AMDGPU::Exp::ET_DUAL_SRC_BLEND0 || + Target == AMDGPU::Exp::ET_DUAL_SRC_BLEND1; + } + bool isEXP(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::EXP; } @@ -651,14 +664,43 @@ public: return get(Opcode).TSFlags & SIInstrFlags::IsMAI; } + static bool isMFMA(const MachineInstr &MI) { + return isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; + } + static bool isDOT(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::IsDOT; } + static bool isWMMA(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IsWMMA; + } + + bool isWMMA(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::IsWMMA; + } + bool isDOT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::IsDOT; } + static bool isLDSDIR(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::LDSDIR; + } + + bool isLDSDIR(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::LDSDIR; + } + + static bool isVINTERP(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VINTERP; + } + + bool isVINTERP(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VINTERP; + } + static bool isScalarUnit(const MachineInstr &MI) { return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD); } @@ -1036,6 +1078,9 @@ public: ArrayRef<std::pair<unsigned, const char *>> getSerializableDirectMachineOperandTargetFlags() const override; + ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> + getSerializableMachineMemOperandTargetFlags() const override; + ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override; @@ -1132,6 +1177,11 @@ public: static unsigned getDSShaderTypeValue(const MachineFunction &MF); const TargetSchedModel &getSchedModel() const { return SchedModel; } + + // Enforce operand's \p OpName even alignment if required by target. + // This is used if an operand is a 32 bit register but needs to be aligned + // regardless. 
+ void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const; }; /// \brief Returns true if a reg:subreg pair P has a TRC class @@ -1210,9 +1260,6 @@ namespace AMDGPU { int getIfAddr64Inst(uint16_t Opcode); LLVM_READONLY - int getMUBUFNoLdsInst(uint16_t Opcode); - - LLVM_READONLY int getAtomicNoRetOp(uint16_t Opcode); LLVM_READONLY @@ -1236,6 +1283,11 @@ namespace AMDGPU { LLVM_READONLY int getFlatScratchInstSTfromSS(uint16_t Opcode); + /// \returns SV (VADDR) form of a FLAT Scratch instruction given an \p Opcode + /// of an SVS (SADDR + VADDR) form. + LLVM_READONLY + int getFlatScratchInstSVfromSVS(uint16_t Opcode); + /// \returns SS (SADDR) form of a FLAT Scratch instruction given an \p Opcode /// of an SV (VADDR) form. LLVM_READONLY @@ -1250,6 +1302,10 @@ namespace AMDGPU { LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode); + /// \returns v_cmpx version of a v_cmp instruction. + LLVM_READONLY + int getVCMPXOpFromVCMP(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 713a08907e99..29ee9f12b12d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1,4 +1,4 @@ -//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===// +//===-- SIInstrInfo.td -----------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -17,7 +17,8 @@ class GCNPredicateControl : PredicateControl { } // Except for the NONE field, this must be kept in sync with the -// SIEncodingFamily enum in AMDGPUInstrInfo.cpp +// SIEncodingFamily enum in SIInstrInfo.cpp and the columns of the +// getMCOpcodeGen table. 
def SIEncodingFamily { int NONE = -1; int SI = 0; @@ -29,6 +30,8 @@ def SIEncodingFamily { int GFX10 = 6; int SDWA10 = 7; int GFX90A = 8; + int GFX940 = 9; + int GFX11 = 10; } //===----------------------------------------------------------------------===// @@ -190,6 +193,44 @@ def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; +multiclass SDBufferAtomicRetNoRet { + def "_ret" : PatFrag< + (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, + node:$offset, node:$cachepolicy, node:$idxen), + (!cast<SDNode>(NAME) node:$vdata_in, node:$rsrc, node:$vindex, + node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, + node:$idxen)> { + let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }]; + let GISelPredicateCode = [{ return true; }]; + } + + def "_noret" : PatFrag< + (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, + node:$offset, node:$cachepolicy, node:$idxen), + (!cast<SDNode>(NAME) node:$vdata_in, node:$rsrc, node:$vindex, + node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, + node:$idxen)> { + let PredicateCode = [{ return SDValue(N, 0).use_empty(); }]; + let GISelPredicateCode = [{ return false; }]; + } +} + +defm SIbuffer_atomic_swap : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_add : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_sub : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_smin : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_umin : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_smax : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_umax : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_and : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_or : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_xor : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_inc : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_dec : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_fadd : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_fmin : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_fmax : SDBufferAtomicRetNoRet; + def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, [SDTCisVT<0, i32>, // dst @@ -205,6 +246,26 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; +def SIbuffer_atomic_cmpswap_ret : PatFrag< + (ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset, + node:$soffset, node:$offset, node:$cachepolicy, node:$idxen), + (SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex, + node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, + node:$idxen)> { + let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }]; + let GISelPredicateCode = [{ return true; }]; +} + +def SIbuffer_atomic_cmpswap_noret : PatFrag< + (ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset, + node:$soffset, node:$offset, node:$cachepolicy, node:$idxen), + (SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex, + node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, + node:$idxen)> { + let PredicateCode = [{ return SDValue(N, 0).use_empty(); }]; + let GISelPredicateCode = [{ return false; }]; +} + class SDGlobalAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode, SDTypeProfile<0, 2, [SDTCisPtrTy<0>, // vaddr @@ -255,35 +316,57 @@ def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE", 
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue] >; +def SIfptrunc_round_upward : SDNode<"AMDGPUISD::FPTRUNC_ROUND_UPWARD", + SDTFPRoundOp +>; + +def SIfptrunc_round_downward : SDNode<"AMDGPUISD::FPTRUNC_ROUND_DOWNWARD", + SDTFPRoundOp +>; + //===----------------------------------------------------------------------===// // ValueType helpers //===----------------------------------------------------------------------===// // Returns 1 if the source arguments have modifiers, 0 if they do not. -// XXX - do f16 instructions? class isFloatType<ValueType SrcVT> { bit ret = !or(!eq(SrcVT.Value, f16.Value), !eq(SrcVT.Value, f32.Value), !eq(SrcVT.Value, f64.Value), !eq(SrcVT.Value, v2f16.Value), !eq(SrcVT.Value, v4f16.Value), + !eq(SrcVT.Value, v8f16.Value), + !eq(SrcVT.Value, v16f16.Value), !eq(SrcVT.Value, v2f32.Value), + !eq(SrcVT.Value, v4f32.Value), + !eq(SrcVT.Value, v8f32.Value), !eq(SrcVT.Value, v2f64.Value), !eq(SrcVT.Value, v4f64.Value)); } +// XXX - do v2i16 instructions? class isIntType<ValueType SrcVT> { bit ret = !or(!eq(SrcVT.Value, i16.Value), !eq(SrcVT.Value, i32.Value), !eq(SrcVT.Value, i64.Value), - !eq(SrcVT.Value, v2i32.Value)); + !eq(SrcVT.Value, v4i16.Value), + !eq(SrcVT.Value, v8i16.Value), + !eq(SrcVT.Value, v16i16.Value), + !eq(SrcVT.Value, v2i32.Value), + !eq(SrcVT.Value, v4i32.Value), + !eq(SrcVT.Value, v8i32.Value)); } class isPackedType<ValueType SrcVT> { bit ret = !or(!eq(SrcVT.Value, v2i16.Value), !eq(SrcVT.Value, v2f16.Value), !eq(SrcVT.Value, v4f16.Value), - !eq(SrcVT.Value, v2f32.Value)); + !eq(SrcVT.Value, v2i32.Value), + !eq(SrcVT.Value, v2f32.Value), + !eq(SrcVT.Value, v4i32.Value), + !eq(SrcVT.Value, v4f32.Value), + !eq(SrcVT.Value, v8i32.Value), + !eq(SrcVT.Value, v8f32.Value)); } @@ -291,19 +374,10 @@ class isPackedType<ValueType SrcVT> { // PatFrags for global memory operations //===----------------------------------------------------------------------===// -foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { -let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in { - - -defm atomic_inc_#as : binary_atomic_op<SIatomic_inc>; -defm atomic_dec_#as : binary_atomic_op<SIatomic_dec>; -defm atomic_load_fmin_#as : binary_atomic_op<SIatomic_fmin, 0>; -defm atomic_load_fmax_#as : binary_atomic_op<SIatomic_fmax, 0>; - - -} // End let AddressSpaces = ... -} // End foreach AddrSpace - +defm atomic_inc : binary_atomic_op_all_as<SIatomic_inc>; +defm atomic_dec : binary_atomic_op_all_as<SIatomic_dec>; +defm atomic_load_fmin : binary_atomic_op_all_as<SIatomic_fmin, 0>; +defm atomic_load_fmax : binary_atomic_op_all_as<SIatomic_fmax, 0>; //===----------------------------------------------------------------------===// // SDNodes PatFrags for loads/stores with a glue input. 
@@ -408,50 +482,36 @@ def load_local_m0 : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> { let IsNonExtLoad = 1; } -let MemoryVT = i8 in { def extloadi8_local_m0 : PatFrag<(ops node:$ptr), (extloadi8_glue node:$ptr)>; def sextloadi8_local_m0 : PatFrag<(ops node:$ptr), (sextloadi8_glue node:$ptr)>; def zextloadi8_local_m0 : PatFrag<(ops node:$ptr), (zextloadi8_glue node:$ptr)>; -} -let MemoryVT = i16 in { def extloadi16_local_m0 : PatFrag<(ops node:$ptr), (extloadi16_glue node:$ptr)>; def sextloadi16_local_m0 : PatFrag<(ops node:$ptr), (sextloadi16_glue node:$ptr)>; def zextloadi16_local_m0 : PatFrag<(ops node:$ptr), (zextloadi16_glue node:$ptr)>; -} +} // End IsLoad = 1, AddressSpaces = LoadAddress_local.AddrSpaces def load_align8_local_m0 : PatFrag<(ops node:$ptr), - (load_local_m0 node:$ptr)>, Aligned<8> { + (load_local_m0 node:$ptr)> { let IsLoad = 1; - let IsNonExtLoad = 1; + int MinAlignment = 8; } def load_align16_local_m0 : PatFrag<(ops node:$ptr), - (load_local_m0 node:$ptr)>, Aligned<16> { + (load_local_m0 node:$ptr)> { let IsLoad = 1; - let IsNonExtLoad = 1; + int MinAlignment = 16; } -} // End IsLoad = 1 - let IsAtomic = 1, AddressSpaces = LoadAddress_local.AddrSpaces in { def atomic_load_8_local_m0 : PatFrag<(ops node:$ptr), - (atomic_load_8_glue node:$ptr)> { - let MemoryVT = i8; -} + (atomic_load_8_glue node:$ptr)>; def atomic_load_16_local_m0 : PatFrag<(ops node:$ptr), - (atomic_load_16_glue node:$ptr)> { - let MemoryVT = i16; -} + (atomic_load_16_glue node:$ptr)>; def atomic_load_32_local_m0 : PatFrag<(ops node:$ptr), - (atomic_load_32_glue node:$ptr)> { - let MemoryVT = i32; -} + (atomic_load_32_glue node:$ptr)>; def atomic_load_64_local_m0 : PatFrag<(ops node:$ptr), - (atomic_load_64_glue node:$ptr)> { - let MemoryVT = i64; -} - + (atomic_load_64_glue node:$ptr)>; } // End let AddressSpaces = LoadAddress_local.AddrSpaces @@ -485,75 +545,103 @@ def truncstorei8_glue : PatFrag<(ops node:$val, node:$ptr), (truncstore_glue node:$val, node:$ptr)> { let IsStore = 1; let MemoryVT = i8; + let IsTruncStore = 1; } def truncstorei16_glue : PatFrag<(ops node:$val, node:$ptr), (truncstore_glue node:$val, node:$ptr)> { let IsStore = 1; let MemoryVT = i16; + let IsTruncStore = 1; } let IsStore = 1, AddressSpaces = StoreAddress_local.AddrSpaces in { def store_local_m0 : PatFrag<(ops node:$val, node:$ptr), - (store_glue node:$val, node:$ptr)> { - let IsStore = 1; - let IsTruncStore = 0; -} - + (store_glue node:$val, node:$ptr)>; def truncstorei8_local_m0 : PatFrag<(ops node:$val, node:$ptr), - (unindexedstore_glue node:$val, node:$ptr)> { - let IsStore = 1; - let MemoryVT = i8; -} - + (truncstorei8_glue node:$val, node:$ptr)>; def truncstorei16_local_m0 : PatFrag<(ops node:$val, node:$ptr), - (unindexedstore_glue node:$val, node:$ptr)> { - let IsStore = 1; - let MemoryVT = i16; -} + (truncstorei16_glue node:$val, node:$ptr)>; } def store_align8_local_m0 : PatFrag <(ops node:$value, node:$ptr), (store_local_m0 node:$value, node:$ptr)>, Aligned<8> { let IsStore = 1; - let IsTruncStore = 0; } def store_align16_local_m0 : PatFrag <(ops node:$value, node:$ptr), (store_local_m0 node:$value, node:$ptr)>, Aligned<16> { let IsStore = 1; +} + +let PredicateCode = [{return cast<MemSDNode>(N)->getAlignment() < 4;}], + GISelPredicateCode = [{return (*MI.memoperands_begin())->getAlign() < 4;}], + AddressSpaces = [ AddrSpaces.Local ] in { +def load_align_less_than_4_local : PatFrag<(ops node:$ptr), + (load_local node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; +} + +def 
load_align_less_than_4_local_m0 : PatFrag<(ops node:$ptr), + (load_local_m0 node:$ptr)> { + let IsLoad = 1; + let IsNonExtLoad = 1; +} + +def store_align_less_than_4_local : PatFrag <(ops node:$value, node:$ptr), + (store_local node:$value, node:$ptr)> { + let IsStore = 1; let IsTruncStore = 0; } -let AddressSpaces = StoreAddress_local.AddrSpaces in { +def store_align_less_than_4_local_m0 : PatFrag <(ops node:$value, node:$ptr), + (store_local_m0 node:$value, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; +} +} -def atomic_store_local_8_m0 : PatFrag < - (ops node:$value, node:$ptr), - (AMDGPUatomic_st_glue node:$value, node:$ptr)> { +def atomic_store_8_glue : PatFrag < + (ops node:$ptr, node:$value), + (AMDGPUatomic_st_glue node:$ptr, node:$value)> { let IsAtomic = 1; let MemoryVT = i8; } -def atomic_store_local_16_m0 : PatFrag < - (ops node:$value, node:$ptr), - (AMDGPUatomic_st_glue node:$value, node:$ptr)> { + +def atomic_store_16_glue : PatFrag < + (ops node:$ptr, node:$value), + (AMDGPUatomic_st_glue node:$ptr, node:$value)> { let IsAtomic = 1; let MemoryVT = i16; } -def atomic_store_local_32_m0 : PatFrag < - (ops node:$value, node:$ptr), - (AMDGPUatomic_st_glue node:$value, node:$ptr)> { + +def atomic_store_32_glue : PatFrag < + (ops node:$ptr, node:$value), + (AMDGPUatomic_st_glue node:$ptr, node:$value)> { let IsAtomic = 1; let MemoryVT = i32; } -def atomic_store_local_64_m0 : PatFrag < - (ops node:$value, node:$ptr), - (AMDGPUatomic_st_glue node:$value, node:$ptr)> { + +def atomic_store_64_glue : PatFrag < + (ops node:$ptr, node:$value), + (AMDGPUatomic_st_glue node:$ptr, node:$value)> { let IsAtomic = 1; let MemoryVT = i64; } -} // End let AddressSpaces = StoreAddress_local.AddrSpaces + +let IsAtomic = 1, AddressSpaces = StoreAddress_local.AddrSpaces in { +def atomic_store_8_local_m0 : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_8_glue node:$ptr, node:$val)>; +def atomic_store_16_local_m0 : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_16_glue node:$ptr, node:$val)>; +def atomic_store_32_local_m0 : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_32_glue node:$ptr, node:$val)>; +def atomic_store_64_local_m0 : PatFrag<(ops node:$ptr, node:$val), + (atomic_store_64_glue node:$ptr, node:$val)>; +} // End let IsAtomic = 1, AddressSpaces = StoreAddress_local.AddrSpaces def si_setcc_uniform : PatFrag < @@ -686,10 +774,14 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, let AddressSpaces = StoreAddress_local.AddrSpaces in { defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; + defm _local_m0 : ret_noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), + IsInt>; } let AddressSpaces = StoreAddress_region.AddrSpaces in { defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; + defm _region_m0 : ret_noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), + IsInt>; } } @@ -954,6 +1046,18 @@ def SWaitMatchClass : AsmOperandClass { let ParserMethod = "parseSWaitCntOps"; } +def DepCtrMatchClass : AsmOperandClass { + let Name = "DepCtr"; + let RenderMethod = "addImmOperands"; + let ParserMethod = "parseDepCtrOps"; +} + +def SDelayMatchClass : AsmOperandClass { + let Name = "SDelayAlu"; + let RenderMethod = "addImmOperands"; + let ParserMethod = "parseSDelayAluOps"; +} + def VReg32OrOffClass : AsmOperandClass { let Name = "VReg32OrOff"; let ParserMethod = "parseVReg32OrOff"; @@ -979,6 +1083,16 @@ def WAIT_FLAG : Operand <i32> { let ParserMatchClass = SWaitMatchClass; let PrintMethod = "printWaitFlag"; } + +def DepCtrImm 
: Operand <i32> { + let ParserMatchClass = DepCtrMatchClass; + let PrintMethod = "printDepCtr"; +} + +def DELAY_FLAG : Operand <i32> { + let ParserMatchClass = SDelayMatchClass; + let PrintMethod = "printDelayFlag"; +} } // End OperandType = "OPERAND_IMMEDIATE" include "SIInstrFormats.td" @@ -1163,14 +1277,6 @@ def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT", 0>>; def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; def Dim : NamedOperandU8<"Dim", NamedMatchClass<"Dim", 0>>; -def dpp8 : NamedOperandU32<"DPP8", NamedMatchClass<"DPP8", 0>>; - -def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; -def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>; -def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>; -def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>; -def FI : NamedOperandU32<"FI", NamedMatchClass<"FI">>; - def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>; def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>; def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>; @@ -1181,6 +1287,14 @@ def op_sel_hi0 : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>; def neg_lo0 : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>; def neg_hi0 : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>; +def dpp8 : NamedOperandU32<"DPP8", NamedMatchClass<"DPP8", 0>>; +def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; + +def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>; +def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>; +def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>; +def FI : NamedOperandU32<"FI", NamedMatchClass<"FI">>; + def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>; def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>; def abid : NamedOperandU32<"ABID", NamedMatchClass<"ABID">>; @@ -1191,6 +1305,9 @@ def exp_tgt : NamedOperandU32<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> { } +def wait_vdst : NamedOperandU8<"WaitVDST", NamedMatchClass<"WaitVDST">>; +def wait_exp : NamedOperandU8<"WaitEXP", NamedMatchClass<"WaitEXP">>; + } // End OperandType = "OPERAND_IMMEDIATE" class KImmMatchClass<int size> : AsmOperandClass { @@ -1223,10 +1340,18 @@ class FPInputModsMatchClass <int opSize> : AsmOperandClass { let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods"; } +class FPVCSrcInputModsMatchClass <int opSize> : FPInputModsMatchClass <opSize> { + let Name = "RegOrInlineImmWithFP"#opSize#"InputMods"; + let PredicateMethod = "isRegOrInlineImmWithFP"#opSize#"InputMods"; +} + def FP16InputModsMatchClass : FPInputModsMatchClass<16>; def FP32InputModsMatchClass : FPInputModsMatchClass<32>; def FP64InputModsMatchClass : FPInputModsMatchClass<64>; +def FP16VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<16>; +def FP32VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<32>; + class InputMods <AsmOperandClass matchClass> : Operand <i32> { let OperandNamespace = "AMDGPU"; let OperandType = "OPERAND_INPUT_MODS"; @@ -1241,19 +1366,28 @@ def FP16InputMods : FPInputMods<FP16InputModsMatchClass>; def FP32InputMods : FPInputMods<FP32InputModsMatchClass>; def FP64InputMods : FPInputMods<FP64InputModsMatchClass>; +def FP16VCSrcInputMods : FPInputMods<FP16VCSrcInputModsMatchClass>; +def FP32VCSrcInputMods : FPInputMods<FP32VCSrcInputModsMatchClass>; + class IntInputModsMatchClass <int opSize> : 
AsmOperandClass { let Name = "RegOrImmWithInt"#opSize#"InputMods"; let ParserMethod = "parseRegOrImmWithIntInputMods"; let PredicateMethod = "isRegOrImmWithInt"#opSize#"InputMods"; } +class IntVCSrcInputModsMatchClass <int opSize> : IntInputModsMatchClass <opSize> { + let Name = "RegOrInlineImmWithInt"#opSize#"InputMods"; + let PredicateMethod = "isRegOrInlineImmWithInt"#opSize#"InputMods"; +} def Int32InputModsMatchClass : IntInputModsMatchClass<32>; def Int64InputModsMatchClass : IntInputModsMatchClass<64>; +def Int32VCSrcInputModsMatchClass : IntVCSrcInputModsMatchClass<32>; class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> { let PrintMethod = "printOperandAndIntInputMods"; } def Int32InputMods : IntInputMods<Int32InputModsMatchClass>; def Int64InputMods : IntInputMods<Int64InputModsMatchClass>; +def Int32VCSrcInputMods : IntInputMods<Int32VCSrcInputModsMatchClass>; class OpSelModsMatchClass : AsmOperandClass { let Name = "OpSelMods"; @@ -1366,12 +1500,19 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">; def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">; +def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">; +def DotIUVOP3PMods : ComplexPattern<untyped, 1, "SelectDotIUVOP3PMods">; +def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">; + def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">; def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">; def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">; +def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">; +def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">; + //===----------------------------------------------------------------------===// // SI assembler operands //===----------------------------------------------------------------------===// @@ -1575,6 +1716,19 @@ class getVOP3SrcForVT<ValueType VT> { ); } +// Src2 of VOP3 DPP instructions cannot be a literal +class getVOP3DPPSrcForVT<ValueType VT> { + bit isFP = isFloatType<VT>.ret; + RegisterOperand ret = + !if (!eq(VT.Value, i1.Value), SSrc_i1, + !if (isFP, + !if (!eq(VT.Value, f16.Value), VCSrc_f16, + !if (!eq(VT.Value, v2f16.Value), VCSrc_v2f16, VCSrc_f32)), + !if (!eq(VT.Value, i16.Value), VCSrc_b16, + !if (!eq(VT.Value, v2i16.Value), VCSrc_v2b16, + VCSrc_b32)))); +} + // Float or packed int class isModifierType<ValueType SrcVT> { bit ret = !or(!eq(SrcVT.Value, f16.Value), @@ -1583,7 +1737,17 @@ class isModifierType<ValueType SrcVT> { !eq(SrcVT.Value, v2f16.Value), !eq(SrcVT.Value, v2i16.Value), !eq(SrcVT.Value, v2f32.Value), - !eq(SrcVT.Value, v2i32.Value)); + !eq(SrcVT.Value, v2i32.Value), + !eq(SrcVT.Value, v4f16.Value), + !eq(SrcVT.Value, v4i16.Value), + !eq(SrcVT.Value, v4f32.Value), + !eq(SrcVT.Value, v4i32.Value), + !eq(SrcVT.Value, v8f16.Value), + !eq(SrcVT.Value, v8i16.Value), + !eq(SrcVT.Value, v8f32.Value), + !eq(SrcVT.Value, v8i32.Value), + !eq(SrcVT.Value, v16f16.Value), + !eq(SrcVT.Value, v16i16.Value)); } // Return type of input modifiers operand for specified input operand @@ -1611,6 +1775,17 @@ class getSrcModDPP <ValueType VT> { Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } +// Return type of input modifiers operand for specified input operand for DPP +class getSrcModVOP3DPP <ValueType VT, bit EnableF32SrcMods> { + bit isFP = isFloatType<VT>.ret; + bit isPacked = isPackedType<VT>.ret; + Operand ret = + !if (isFP, + !if (!eq(VT.Value, f16.Value), FP16VCSrcInputMods, + 
FP32VCSrcInputMods), + !if (EnableF32SrcMods, FP32VCSrcInputMods, Int32VCSrcInputMods)); +} + // Return type of input modifiers operand specified input operand for SDWA class getSrcModSDWA <ValueType VT> { Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods, @@ -1620,7 +1795,7 @@ class getSrcModSDWA <ValueType VT> { } // Returns the input arguments for VOP[12C] instructions for the given SrcVT. -class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { +class getIns32 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs> { dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2 (ins))); @@ -1715,19 +1890,21 @@ class getInsVOP3Base<RegisterOperand Src0RC, RegisterOperand Src1RC, HasClamp, HasModifiers, HasSrc2Mods, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; dag opsel = (ins op_sel0:$op_sel); - dag vop3pFields = (ins op_sel_hi0:$op_sel_hi, neg_lo0:$neg_lo, neg_hi0:$neg_hi); + dag vop3pOpsel = (ins op_sel_hi0:$op_sel_hi); + dag vop3pFields = !con(!if(HasOpSel, vop3pOpsel, (ins)), (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi)); + dag ret = !con(base, !if(HasOpSel, opsel,(ins)), !if(IsVOP3P, vop3pFields,(ins))); } class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC, - RegisterOperand Src2RC, int NumSrcArgs, bit HasClamp, + RegisterOperand Src2RC, int NumSrcArgs, bit HasClamp, bit HasOpSel, Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { dag ret = getInsVOP3Base<Src0RC, Src1RC, Src2RC, NumSrcArgs, HasClamp, 1/*HasModifiers*/, 1/*HasSrc2Mods*/, 0/*HasOMod*/, Src0Mod, Src1Mod, Src2Mod, - 1/*HasOpSel*/, 1/*IsVOP3P*/>.ret; + HasOpSel, 1/*IsVOP3P*/>.ret; } class getInsVOP3OpSel <RegisterOperand Src0RC, RegisterOperand Src1RC, @@ -1741,8 +1918,8 @@ class getInsVOP3OpSel <RegisterOperand Src0RC, RegisterOperand Src1RC, } class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, - int NumSrcArgs, bit HasModifiers, - Operand Src0Mod, Operand Src1Mod> { + RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { dag ret = !if (!eq(NumSrcArgs, 0), // VOP1 without input operands (V_NOP) @@ -1756,6 +1933,7 @@ class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass // VOP1_DPP without modifiers (ins OldRC:$old, Src0RC:$src0) /* endif */), + !if (!eq(NumSrcArgs, 2), !if (HasModifiers, // VOP2_DPP with modifiers (ins OldRC:$old, @@ -1765,34 +1943,72 @@ class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass // VOP2_DPP without modifiers (ins OldRC:$old, Src0RC:$src0, Src1RC:$src1) - ))); + ) + /* NumSrcArgs == 3, VOP3 */, + !if (HasModifiers, + // VOP3_DPP with modifiers + (ins OldRC:$old, + Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2) + /* else */, + // VOP3_DPP without modifiers + (ins OldRC:$old, + Src0RC:$src0, Src1RC:$src1, + Src2RC:$src2) + ) + /* endif */))); } class getInsDPP <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, - int NumSrcArgs, bit HasModifiers, - Operand Src0Mod, Operand Src1Mod> { - dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, NumSrcArgs, - HasModifiers, Src0Mod, Src1Mod>.ret, + RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { + dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs, + HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret, (ins dpp_ctrl:$dpp_ctrl, 
row_mask:$row_mask, - bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)); + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)); } class getInsDPP16 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, - int NumSrcArgs, bit HasModifiers, - Operand Src0Mod, Operand Src1Mod> { - dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, NumSrcArgs, - HasModifiers, Src0Mod, Src1Mod>.ret, + RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { + dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs, + HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret, (ins FI:$fi)); } class getInsDPP8 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC, - int NumSrcArgs, bit HasModifiers, - Operand Src0Mod, Operand Src1Mod> { - dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, NumSrcArgs, - HasModifiers, Src0Mod, Src1Mod>.ret, + RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers, + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { + dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs, + HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret, (ins dpp8:$dpp8, FI:$fi)); } +class getInsVOP3DPPBase<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> { + dag old = ( ins OldRC:$old ); + dag base = VOP3Base; + dag ret = !con( + !if(!ne(NumSrcArgs, 0), old, (ins)), + base + ); +} + +class getInsVOP3DPP<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> { + dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs>.ret, + (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)); +} + +class getInsVOP3DPP16<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> { + dag ret = !con(getInsVOP3DPP<VOP3Base,OldRC,NumSrcArgs>.ret, + (ins FI:$fi)); +} + +class getInsVOP3DPP8<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> { + dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs>.ret, + (ins dpp8:$dpp8, FI:$fi)); +} // Ins for SDWA class getInsSDWA <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs, @@ -1870,6 +2086,15 @@ class getAsm32 <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> { !if(!eq(NumSrcArgs, 3), src0#src1#src2, ""); } +class getAsmVOPDPart <int NumSrcArgs, string XorY> { + string dst = "$vdst" # XorY; + string src0 = ", $src0" # XorY; + string src1 = ", $vsrc1" # XorY; + string ret = dst # + !if(!ge(NumSrcArgs, 1), src0, "") # + !if(!ge(NumSrcArgs, 2), src1, ""); +} + // Returns the assembly string for the inputs and outputs of a VOP3 // instruction. class getAsm64 <bit HasDst, int NumSrcArgs, bit HasIntClamp, bit HasModifiers, @@ -1890,7 +2115,7 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasIntClamp, bit HasModifiers, // Returns the assembly string for the inputs and outputs of a VOP3P // instruction. class getAsmVOP3P <int NumSrcArgs, bit HasModifiers, - bit HasClamp> { + bit HasClamp, bit HasOpSel> { string dst = "$vdst"; string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); string src1 = !if(!eq(NumSrcArgs, 1), "", @@ -1900,10 +2125,11 @@ class getAsmVOP3P <int NumSrcArgs, bit HasModifiers, string mods = !if(HasModifiers, "$neg_lo$neg_hi", ""); string clamp = !if(HasClamp, "$clamp", ""); + string opsel = !if(HasOpSel, "$op_sel$op_sel_hi", ""); // Each modifier is printed as an array of bits for each operand, so // all operands are printed as part of src0_modifiers. 
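Every optional piece of getAsmVOP3P collapses to the empty string when its feature bit is off, so the plain concatenation in the ret line just below always yields a well-formed operand list. A self-contained C++ sketch of the same string assembly (a model of the TableGen logic above, not LLVM code):

#include <cassert>
#include <string>

// Builds the VOP3P assembly format the same way getAsmVOP3P does: optional
// pieces become "" when disabled, then everything is concatenated.
static std::string asmVOP3P(int NumSrcArgs, bool HasModifiers, bool HasClamp,
                            bool HasOpSel) {
  std::string Src0 = NumSrcArgs == 1 ? "$src0" : "$src0,";
  std::string Src1 = NumSrcArgs == 1 ? ""
                     : NumSrcArgs == 2 ? " $src1" : " $src1,";
  std::string Src2 = NumSrcArgs == 3 ? " $src2" : "";
  std::string OpSel = HasOpSel ? "$op_sel$op_sel_hi" : "";
  std::string Mods = HasModifiers ? "$neg_lo$neg_hi" : "";
  std::string Clamp = HasClamp ? "$clamp" : "";
  return "$vdst, " + Src0 + Src1 + Src2 + OpSel + Mods + Clamp;
}

int main() {
  assert(asmVOP3P(3, true, true, true) ==
         "$vdst, $src0, $src1, $src2$op_sel$op_sel_hi$neg_lo$neg_hi$clamp");
  assert(asmVOP3P(2, true, false, false) == // no opsel, no clamp
         "$vdst, $src0, $src1$neg_lo$neg_hi");
  return 0;
}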
- string ret = dst#", "#src0#src1#src2#"$op_sel$op_sel_hi"#mods#clamp; + string ret = dst#", "#src0#src1#src2#opsel#mods#clamp; } class getAsmVOP3OpSel <int NumSrcArgs, @@ -1930,8 +2156,8 @@ class getAsmVOP3OpSel <int NumSrcArgs, string src2 = !if(Src2HasMods, fsrc2, isrc2); string clamp = !if(HasClamp, "$clamp", ""); - - string ret = dst#", "#src0#src1#src2#"$op_sel"#clamp; + string omod = ""; + string ret = dst#", "#src0#src1#src2#"$op_sel"#clamp#omod; } class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> { @@ -1955,15 +2181,63 @@ class getAsmDPP16 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT } class getAsmDPP8 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> - : getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT> { + : getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>{ let ret = dst#args#" $dpp8$fi"; } +class getAsmVOP3DPPBase <int NumSrcArgs, bit HasDst, bit HasClamp, + bit HasOpSel, bit HasOMod, bit IsVOP3P, + bit HasModifiers, bit Src0HasMods, + bit Src1HasMods, bit Src2HasMods, ValueType DstVT = i32> { + string dst = !if(HasDst, + !if(!eq(DstVT.Size, 1), + "$sdst", + "$vdst"), + ""); // use $sdst for VOPC + string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); + string isrc1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1", + " $src1,")); + string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", ""); + + string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string fsrc1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); + + string src0 = !if(Src0HasMods, fsrc0, isrc0); + string src1 = !if(Src1HasMods, fsrc1, isrc1); + string src2 = !if(Src2HasMods, fsrc2, isrc2); + string opsel = !if(HasOpSel, "$op_sel", ""); + string 3PMods = !if(IsVOP3P, + !if(HasOpSel, "$op_sel_hi", "") + #!if(HasModifiers, "$neg_lo$neg_hi", ""), + ""); + string clamp = !if(HasClamp, "$clamp", ""); + string omod = !if(HasOMod, "$omod", ""); + + string ret = dst#", "#src0#src1#src2#opsel#3PMods#clamp#omod; + +} + +class getAsmVOP3DPP<string base> { + string ret = base # " $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; +} + +class getAsmVOP3DPP16<string base> { + string ret = getAsmVOP3DPP<base>.ret # "$fi"; +} + +class getAsmVOP3DPP8<string base> { + string ret = base # " $dpp8$fi"; +} + class getAsmSDWA <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> { string dst = !if(HasDst, !if(!eq(DstVT.Size, 1), - " vcc", // use vcc token as dst for VOPC instructioins + " vcc", // use vcc token as dst for VOPC instructions "$vdst"), ""); string src0 = "$src0_modifiers"; @@ -2056,6 +2330,12 @@ class getHasDPP <int NumSrcArgs> { 1); } +class getHasExt32BitDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, + ValueType Src1VT = i32> { + bit ret = !and(getHasDPP<NumSrcArgs>.ret, + !not(getHas64BitOps<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret)); +} + class getHasExt64BitDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, ValueType Src1VT = i32> { bit ret = !and(getHasDPP<NumSrcArgs>.ret, @@ -2089,6 +2369,24 @@ class BitAnd<bit a, bit b> { bit ret = !if(a, !if(b, 1, 0), 0); } +class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32, + ValueType Src1VT = i32, ValueType Src2VT = i32> { + bit ret = !if(!eq(DstVT.Size, 64), + 0, // 64-bit dst No DPP for 64-bit operands + !if(!eq(Src0VT.Size, 64), + 0, // 64-bit src0 + !if(!eq(Src1VT.Size, 64), + 0, // 64-bit src1 + 
!if(!eq(Src2VT.Size, 64), + 0, // 64-bit src2 + 1 + ) + ) + ) + ); +} + + def PatGenMode { int NoPattern = 0; int Pattern = 1; @@ -2106,15 +2404,20 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, field ValueType Src1VT = ArgVT[2]; field ValueType Src2VT = ArgVT[3]; field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret; + field RegisterOperand DstRC64 = DstRC; field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret; field RegisterOperand DstRCSDWA = getSDWADstForVT<DstVT>.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; - field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret; + field RegisterOperand Src1RC32 = RegisterOperand<getVregSrcForVT<Src1VT>.ret>; field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret; field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret; field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret; + field RegisterClass Src2DPP = getVregSrcForVT<Src2VT>.ret; + field RegisterOperand Src0VOP3DPP = VGPRSrc_32; + field RegisterOperand Src1VOP3DPP = VGPRSrc_32; + field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret; field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret; field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret; field Operand Src0Mod = getSrcMod<Src0VT, EnableF32SrcMods>.ret; @@ -2122,6 +2425,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, field Operand Src2Mod = getSrcMod<Src2VT, EnableF32SrcMods>.ret; field Operand Src0ModDPP = getSrcModDPP<Src0VT>.ret; field Operand Src1ModDPP = getSrcModDPP<Src1VT>.ret; + field Operand Src2ModDPP = getSrcModDPP<Src2VT>.ret; + field Operand Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT, EnableF32SrcMods>.ret; field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret; field Operand Src1ModSDWA = getSrcModSDWA<Src1VT>.ret; @@ -2169,15 +2474,20 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, field bit HasSrc2Mods = !if(HasModifiers, !or(HasSrc2FloatMods, HasSrc2IntMods), 0); field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; - field bit HasExtDPP = getHasDPP<NumSrcArgs>.ret; + field bit HasExtVOP3DPP = getHasVOP3DPP<DstVT, Src0VT, Src1VT, Src2VT>.ret; + field bit HasExtDPP = !if(!or(getHasDPP<NumSrcArgs>.ret, + HasExtVOP3DPP), 1, 0); + field bit HasExt32BitDPP = getHasExt32BitDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; field bit HasExt64BitDPP = getHasExt64BitDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; field bit HasExtSDWA = getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; field bit HasExtSDWA9 = HasExtSDWA; field int NeedPatGen = PatGenMode.NoPattern; field bit IsMAI = 0; + field bit IsVOP3P = 0; field bit IsDOT = 0; field bit IsSingle = 0; + field bit IsWMMA = 0; field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); @@ -2188,9 +2498,11 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, // VOP3b instructions are a special case with a second explicit // output. This is manually overridden for them. 
field dag Outs32 = Outs; - field dag Outs64 = Outs; + field dag Outs64 = !if(HasDst,(outs DstRC64:$vdst),(outs)); field dag OutsDPP = getOutsDPP<HasDst, DstVT, DstRCDPP>.ret; field dag OutsDPP8 = getOutsDPP<HasDst, DstVT, DstRCDPP>.ret; + field dag OutsVOP3DPP = OutsDPP; + field dag OutsVOP3DPP8 = OutsDPP8; field dag OutsSDWA = getOutsSDWA<HasDst, DstVT, DstRCSDWA>.ret; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; @@ -2198,7 +2510,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, HasIntClamp, HasModifiers, HasSrc2Mods, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; field dag InsVOP3P = getInsVOP3P<Src0RC64, Src1RC64, Src2RC64, - NumSrcArgs, HasClamp, + NumSrcArgs, HasClamp, HasOpSel, Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret; field dag InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, HasClamp, HasOMod, @@ -2206,21 +2518,35 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, getOpSelMod<Src1VT>.ret, getOpSelMod<Src2VT>.ret>.ret; field dag InsDPP = !if(HasExtDPP, - getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs, - HasModifiers, Src0ModDPP, Src1ModDPP>.ret, + getInsDPP<DstRCDPP, Src0DPP, Src1DPP, Src2DPP, NumSrcArgs, + HasModifiers, Src0ModDPP, Src1ModDPP, Src2ModDPP>.ret, (ins)); - field dag InsDPP16 = getInsDPP16<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs, - HasModifiers, Src0ModDPP, Src1ModDPP>.ret; - field dag InsDPP8 = getInsDPP8<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs, 0, - Src0ModDPP, Src1ModDPP>.ret; + field dag InsDPP16 = getInsDPP16<DstRCDPP, Src0DPP, Src1DPP, Src2DPP, NumSrcArgs, + HasModifiers, Src0ModDPP, Src1ModDPP, Src2ModDPP>.ret; + field dag InsDPP8 = getInsDPP8<DstRCDPP, Src0DPP, Src1DPP, Src2DPP, + NumSrcArgs, HasModifiers, + Src0ModDPP, Src1ModDPP, Src2ModDPP>.ret; + field dag InsVOP3Base = getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP, + Src2VOP3DPP, NumSrcArgs, HasClamp, HasModifiers, HasSrc2Mods, HasOMod, + Src0ModDPP, Src1ModDPP, Src2ModVOP3DPP, HasOpSel, IsVOP3P>.ret; + field dag InsVOP3DPP = getInsVOP3DPP<InsVOP3Base, DstRCDPP, NumSrcArgs>.ret; + field dag InsVOP3DPP16 = getInsVOP3DPP16<InsVOP3Base, DstRCDPP, NumSrcArgs>.ret; + field dag InsVOP3DPP8 = getInsVOP3DPP8<InsVOP3Base, DstRCDPP, NumSrcArgs>.ret; field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, HasSDWAOMod, Src0ModSDWA, Src1ModSDWA, DstVT>.ret; + field dag InsVOPDX = (ins Src0RC32:$src0X, Src1RC32:$vsrc1X); + // It is a slight misnomer to use the deferred f32 operand type for non-float + // operands, but this operand type will only be used if the other dual + // component is FMAAK or FMAMK + field dag InsVOPDXDeferred = (ins !if(!eq(Src0VT.Size, 32), VSrc_f32_Deferred, VSrc_f16_Deferred):$src0X, VGPR_32:$vsrc1X); + field dag InsVOPDY = (ins Src0RC32:$src0Y, Src1RC32:$vsrc1Y); + field dag InsVOPDYDeferred = (ins !if(!eq(Src1VT.Size, 32), VSrc_f32_Deferred, VSrc_f16_Deferred):$src0Y, VGPR_32:$vsrc1Y); field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret; field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasIntClamp, HasModifiers, HasOMod, DstVT>.ret; - field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp>.ret; + field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp, HasOpSel>.ret; field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs, HasClamp, HasSrc0FloatMods, @@ -2232,15 +2558,24 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, // DPP8 encoding has no fields for modifiers, and it is enforced by setting // the asm operand name via this HasModifiers 
flag field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret; + field string AsmVOP3DPPBase = getAsmVOP3DPPBase<NumSrcArgs, HasDst, HasClamp, + HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasSrc0FloatMods, HasSrc1FloatMods, + HasSrc2FloatMods, DstVT >.ret; + field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3DPPBase>.ret; + field string AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3DPPBase>.ret; + field string AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3DPPBase>.ret; field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret; field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret; - + field string AsmVOPDX = getAsmVOPDPart<NumSrcArgs, "X">.ret; + field string AsmVOPDY = getAsmVOPDPart<NumSrcArgs, "Y">.ret; field string TieRegDPP = "$old"; } -class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { + class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { let HasExt = 0; let HasExtDPP = 0; + let HasExtVOP3DPP = 0; + let HasExt32BitDPP = 0; let HasExt64BitDPP = 0; let HasExtSDWA = 0; let HasExtSDWA9 = 0; @@ -2249,10 +2584,10 @@ class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.NoPattern> : VOPProfile <p.ArgVT> { let NeedPatGen = mode; } - def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>; def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>; +def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>; def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; @@ -2264,6 +2599,7 @@ def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>; def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>; +def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>; def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>; def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>; @@ -2274,6 +2610,10 @@ def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>; def VOP_V2I16_F32_F32 : VOPProfile <[v2i16, f32, f32, untyped]>; def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>; +def VOP_F16_V2F16_V2F16_F16 : VOPProfile <[f16, v2f16, v2f16, f16]>; +def VOP_I16_V2I16_V2I16_I16 : VOPProfile <[i16, v2i16, v2i16, i16]>; +def VOP_F32_V2I16_V2I16_F32 : VOPProfile <[f32, v2i16, v2i16, f32]>; + def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>; def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; @@ -2343,6 +2683,18 @@ def VOP_V4F32_V4I16_V4I16_V4F32 : VOPProfile <[v4f32, v4i16, v4i16, v4f32]>; def VOP_V16F32_V4I16_V4I16_V16F32 : VOPProfile <[v16f32, v4i16, v4i16, v16f32]>; def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>; +def VOP_V4I32_I64_I64_V4I32 : VOPProfile <[v4i32, i64, i64, v4i32]>; +def VOP_V16I32_I64_I64_V16I32 : VOPProfile <[v16i32, i64, i64, v16i32]>; +def VOP_V4F32_V2F32_V2F32_V4F32 : VOPProfile <[v4f32, v2f32, v2f32, v4f32]>; +def VOP_V16F32_V2F32_V2F32_V16F32 : VOPProfile <[v16f32, v2f32, v2f32, v16f32]>; + +def VOP_V4F32_V4F16_V8F16_I32 : VOPProfile <[v4f32, v4f16, v8f16, i32]>; +def VOP_V16F32_V4F16_V8F16_I32 : VOPProfile <[v16f32, v4f16, v8f16, i32]>; +def VOP_V4F32_V4I16_V8I16_I32 : VOPProfile <[v4f32, v4i16, v8i16, i32]>; +def VOP_V16F32_V4I16_V8I16_I32 : VOPProfile <[v16f32, v4i16, v8i16, i32]>; +def VOP_V4I32_V2I32_V4I32_I32 : VOPProfile 
<[v4i32, v2i32, v4i32, i32]>; +def VOP_V16I32_V2I32_V4I32_I32 : VOPProfile <[v16i32, v2i32, v4i32, i32]>; + class Commutable_REV <string revOp, bit isOrig> { string RevOp = revOp; bit IsOrig = isOrig; } @@ -2394,10 +2746,11 @@ multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm, def _vi : VINTRP_Real_vi <op, NAME, outs, ins, asm>; - let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { + let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { def _gfx10 : VINTRP_Real_si<op, NAME, outs, ins, asm, SIEncodingFamily.GFX10>; - } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" + } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" } + //===----------------------------------------------------------------------===// // Vector instruction mappings //===----------------------------------------------------------------------===// @@ -2470,6 +2823,7 @@ def getMCOpcodeGen : InstrMapping { let RowFields = ["PseudoInstr"]; let ColFields = ["Subtarget"]; let KeyCol = [!cast<string>(SIEncodingFamily.NONE)]; + // These columns must be kept in sync with the SIEncodingFamily enumeration. let ValueCols = [[!cast<string>(SIEncodingFamily.SI)], [!cast<string>(SIEncodingFamily.VI)], [!cast<string>(SIEncodingFamily.SDWA)], @@ -2482,7 +2836,9 @@ def getMCOpcodeGen : InstrMapping { [!cast<string>(SIEncodingFamily.GFX9)], [!cast<string>(SIEncodingFamily.GFX10)], [!cast<string>(SIEncodingFamily.SDWA10)], - [!cast<string>(SIEncodingFamily.GFX90A)]]; + [!cast<string>(SIEncodingFamily.GFX90A)], + [!cast<string>(SIEncodingFamily.GFX940)], + [!cast<string>(SIEncodingFamily.GFX11)]]; } // Get equivalent SOPK instruction. @@ -2510,14 +2866,6 @@ def getIfAddr64Inst : InstrMapping { let ValueCols = [["1"]]; } -def getMUBUFNoLdsInst : InstrMapping { - let FilterClass = "MUBUFLdsTable"; - let RowFields = ["OpName"]; - let ColFields = ["IsLds"]; - let KeyCol = ["1"]; - let ValueCols = [["0"]]; -} - // Maps an atomic opcode to its returnless version. def getAtomicNoRetOp : InstrMapping { let FilterClass = "AtomicNoRet"; @@ -2580,6 +2928,14 @@ def getFlatScratchInstSSfromSV : InstrMapping { let ValueCols = [["SS"]]; } +def getFlatScratchInstSVfromSVS : InstrMapping { + let FilterClass = "FlatScratchInst"; + let RowFields = ["SVOp"]; + let ColFields = ["Mode"]; + let KeyCol = ["SVS"]; + let ValueCols = [["SV"]]; +} + def getFlatScratchInstSVfromSS : InstrMapping { let FilterClass = "FlatScratchInst"; let RowFields = ["SVOp"]; let ColFields = ["Mode"]; let KeyCol = ["SS"]; let ValueCols = [["SV"]]; } @@ -2596,6 +2952,15 @@ def getMFMAEarlyClobberOp : InstrMapping { let FilterClass = "MFMATable"; let RowFields = ["FMAOp"]; let ColFields = ["IsMac"]; let KeyCol = ["1"]; let ValueCols = [["0"]]; } +// Maps a v_cmp instruction to its v_cmpx equivalent. 
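The InstrMapping relations in this hunk compile down to keyed lookup tables: RowFields group related opcodes into a row, ColFields select the column, and a query maps the KeyCol entry to the ValueCols entry within one row. A toy model of the v_cmp to v_cmpx mapping declared just below, using hypothetical opcode strings rather than the generated tables:

#include <cassert>
#include <map>
#include <string>
#include <utility>

int main() {
  // Row key (VCMPOp) plus column key (IsVCMPX) identify one opcode.
  std::map<std::pair<std::string, bool>, std::string> Table = {
      {{"v_cmp_eq_u32", false}, "V_CMP_EQ_U32_e64"},  // KeyCol "0"
      {{"v_cmp_eq_u32", true},  "V_CMPX_EQ_U32_e64"}, // ValueCols "1"
  };
  // Conceptually: getVCMPXOpFromVCMP(V_CMP_EQ_U32_e64) == V_CMPX_EQ_U32_e64.
  assert(Table.at({"v_cmp_eq_u32", true}) == "V_CMPX_EQ_U32_e64");
  return 0;
}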
+def getVCMPXOpFromVCMP : InstrMapping { + let FilterClass = "VCMPVCMPXTable"; + let RowFields = ["VCMPOp"]; + let ColFields = ["IsVCMPX"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + include "SIInstructions.td" include "DSInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 7be63ae6964b..829669157893 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -14,12 +14,24 @@ class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateContro } +class UniformSextInreg<ValueType VT> : PatFrag< + (ops node:$src), + (sext_inreg $src, VT), + [{ return !N->isDivergent(); }]>; + +class DivergentSextInreg<ValueType VT> : PatFrag< + (ops node:$src), + (sext_inreg $src, VT), + [{ return N->isDivergent(); }]>; + include "SOPInstructions.td" include "VOPInstructions.td" include "SMInstructions.td" include "FLATInstructions.td" include "BUFInstructions.td" include "EXPInstructions.td" +include "LDSDIRInstructions.td" +include "VINTERPInstructions.td" //===----------------------------------------------------------------------===// // VINTRP Instructions @@ -176,19 +188,33 @@ def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { let mayStore = 0; } +// Pseudo instructions used for @llvm.fptrunc.round upward +// and @llvm.fptrunc.round downward. +// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD +// and G_FPTRUNC_ROUND_DOWNWARD before being lowered to +// FPTRUNC_UPWARD_PSEUDO and FPTRUNC_DOWNWARD_PSEUDO. +// The final codegen is done in the ModeRegister pass. +let Uses = [MODE, EXEC] in { +def FPTRUNC_UPWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), + (ins VGPR_32:$src0), + [(set f16:$vdst, (SIfptrunc_round_upward f32:$src0))]>; + +def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), + (ins VGPR_32:$src0), + [(set f16:$vdst, (SIfptrunc_round_downward f32:$src0))]>; +} // End Uses = [MODE, EXEC] + // Invert the exec mask and overwrite the inactive lanes of dst with inactive, // restoring it after we're done. let Defs = [SCC] in { def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst), - (ins VGPR_32: $src, VSrc_b32:$inactive), + (ins VSrc_b32: $src, VSrc_b32:$inactive), [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> { - let Constraints = "$src = $vdst"; } def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), - (ins VReg_64: $src, VSrc_b64:$inactive), + (ins VSrc_b64: $src, VSrc_b64:$inactive), [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> { - let Constraints = "$src = $vdst"; } } // End Defs = [SCC] @@ -287,6 +313,20 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), let isConvergent = 1; let FixedSize = 1; let Size = 0; + let isMeta = 1; +} + +def SCHED_BARRIER : SPseudoInstSI<(outs), (ins i32imm:$mask), + [(int_amdgcn_sched_barrier (i32 timm:$mask))]> { + let SchedRW = []; + let hasNoSchedulingInfo = 1; + let hasSideEffects = 1; + let mayLoad = 0; + let mayStore = 0; + let isConvergent = 1; + let FixedSize = 1; + let Size = 0; + let isMeta = 1; } // SI pseudo instructions. 
These are used by the CFG structurizer pass @@ -424,6 +464,7 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), let Size = 0; let hasNoSchedulingInfo = 1; let FixedSize = 1; + let isMeta = 1; } // Used as an isel pseudo to directly emit initialization with an @@ -459,11 +500,14 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI < let hasNoSchedulingInfo = 1; let DisableWQM = 1; let FixedSize = 1; + + // TODO: Should this be true? + let isMeta = 0; } // Return for returning function calls. def SI_RETURN : SPseudoInstSI < - (outs), (ins), [], + (outs), (ins), [(AMDGPUret_flag)], "; return"> { let isTerminator = 1; let isBarrier = 1; @@ -496,6 +540,7 @@ def : GCNPat< def SI_CALL : SPseudoInstSI < (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> { let Size = 4; + let FixedSize = 1; let isCall = 1; let UseNamedOperandTable = 1; let SchedRW = [WriteBranch]; @@ -508,6 +553,7 @@ def SI_TCRETURN : SPseudoInstSI <(outs), (ins SReg_64:$src0, unknown:$callee, i32imm:$fpdiff), [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> { let Size = 4; + let FixedSize = 1; let isCall = 1; let isTerminator = 1; let isReturn = 1; @@ -1212,6 +1258,26 @@ def : Pat < (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3)) >; +def : Pat < + (extract_subvector v16i16:$vec, (i32 0)), + (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub0_sub1_sub2_sub3)) +>; + +def : Pat < + (extract_subvector v16i16:$vec, (i32 8)), + (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub4_sub5_sub6_sub7)) +>; + +def : Pat < + (extract_subvector v16f16:$vec, (i32 0)), + (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub0_sub1_sub2_sub3)) +>; + +def : Pat < + (extract_subvector v16f16:$vec, (i32 8)), + (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub4_sub5_sub6_sub7)) +>; + foreach Index = 0-31 in { def Extract_Element_v32i32_#Index : Extract_Element < i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) @@ -1371,7 +1437,18 @@ def : BitConvert <v8i32, v4i64, VReg_256>; def : BitConvert <v8f32, v4i64, VReg_256>; def : BitConvert <v8i32, v4f64, VReg_256>; def : BitConvert <v8f32, v4f64, VReg_256>; - +def : BitConvert <v16i16, v16f16, SReg_256>; +def : BitConvert <v16f16, v16i16, SReg_256>; +def : BitConvert <v16i16, v16f16, VReg_256>; +def : BitConvert <v16f16, v16i16, VReg_256>; +def : BitConvert <v16f16, v8i32, VReg_256>; +def : BitConvert <v16i16, v8i32, VReg_256>; +def : BitConvert <v16f16, v8f32, VReg_256>; +def : BitConvert <v16i16, v8f32, VReg_256>; +def : BitConvert <v8i32, v16f16, VReg_256>; +def : BitConvert <v8i32, v16i16, VReg_256>; +def : BitConvert <v8f32, v16f16, VReg_256>; +def : BitConvert <v8f32, v16i16, VReg_256>; // 512-bit bitcast def : BitConvert <v16i32, v16f32, VReg_512>; @@ -1941,12 +2018,6 @@ def : GCNPat < //===----------------------------------------------------------------------===// // Conversion Patterns //===----------------------------------------------------------------------===// - -class UniformSextInreg<ValueType VT> : PatFrag< - (ops node:$src), - (sext_inreg $src, VT), - [{ return !N->isDivergent(); }]>; - def : GCNPat<(i32 (UniformSextInreg<i1> i32:$src)), (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16 @@ -1981,23 +2052,28 @@ def : GCNPat < (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16 >; - -class DivergentSextInreg<ValueType VT> : PatFrag< - (ops node:$src), - (sext_inreg $src, VT), - [{ return N->isDivergent(); }]>; - -def : GCNPat<(i32 (DivergentSextInreg<i1> i32:$src)), +def : GCNPat< + (i32 (DivergentSextInreg<i1> i32:$src)), (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>; def : GCNPat < (i16 (DivergentSextInreg<i1> 
i16:$src)), - (V_BFE_I32_e64 $src, (i32 0), (i32 1)) // 0 | 1 << 16 + (V_BFE_I32_e64 $src, (i32 0), (i32 1)) >; def : GCNPat < (i16 (DivergentSextInreg<i8> i16:$src)), - (V_BFE_I32_e64 $src, (i32 0), (i32 8)) // 0 | 8 << 16 + (V_BFE_I32_e64 $src, (i32 0), (i32 8)) +>; + +def : GCNPat< + (i32 (DivergentSextInreg<i8> i32:$src)), + (V_BFE_I32_e64 i32:$src, (i32 0), (i32 8)) +>; + +def : GCNPat < + (i32 (DivergentSextInreg<i16> i32:$src)), + (V_BFE_I32_e64 $src, (i32 0), (i32 16)) >; def : GCNPat < @@ -2010,14 +2086,14 @@ def : GCNPat < def : GCNPat < (i64 (DivergentSextInreg<i8> i64:$src)), (REG_SEQUENCE VReg_64, - (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)/* 0 | 8 << 16 */), sub0, + (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)), sub0, (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8))), sub1) >; def : GCNPat < (i64 (DivergentSextInreg<i16> i64:$src)), (REG_SEQUENCE VReg_64, - (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)/* 0 | 16 << 16 */), sub0, + (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)), sub0, (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16))), sub1) >; @@ -2053,12 +2129,18 @@ def : ZExt_i64_i1_Pat<anyext>; // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that // REG_SEQUENCE patterns don't support instructions with multiple outputs. def : GCNPat < - (i64 (sext i32:$src)), + (i64 (UniformUnaryFrag<sext> i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1) >; def : GCNPat < + (i64 (DivergentUnaryFrag<sext> i32:$src)), + (REG_SEQUENCE VReg_64, $src, sub0, + (i32 (COPY_TO_REGCLASS (V_ASHRREV_I32_e64 (i32 31), $src), VGPR_32)), sub1) +>; + +def : GCNPat < (i64 (sext i1:$src)), (REG_SEQUENCE VReg_64, (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), @@ -2235,6 +2317,30 @@ def : GCNPat < // the src is lowered. e.g. fptrunc + fma may be lowered to a // v_fma_mix* instruction which does not zero, or may not. def : GCNPat< + (i32 (DivergentUnaryFrag<abs> i32:$src)), + (V_MAX_I32_e64 (V_SUB_CO_U32_e32 (i32 0), $src), $src)>; + +let AddedComplexity = 1 in { +def : GCNPat< + (i32 (DivergentUnaryFrag<abs> i32:$src)), + (V_MAX_I32_e64 (V_SUB_U32_e32 (i32 0), $src), $src)>{ + let SubtargetPredicate = HasAddNoCarryInsts; +} +} // AddedComplexity = 1 + +def : GCNPat< + (i32 (DivergentUnaryFrag<zext> i16:$src)), + (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src) +>; + +def : GCNPat< + (i64 (DivergentUnaryFrag<zext> i16:$src)), + (REG_SEQUENCE VReg_64, + (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src), sub0, + (S_MOV_B32 (i32 0)), sub1) +>; + +def : GCNPat< (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), (COPY VSrc_b16:$src)>; @@ -2269,6 +2375,34 @@ def : GCNPat < (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) >; +def IMMBitSelConst : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N), + MVT::i32); +}]>; + +// Matching separate SRL and TRUNC instructions +// with dependent operands (SRL dest is source of TRUNC) +// generates three instructions. However, by using bit shifts, +// the V_LSHRREV_B32_e64 result can be directly used in the +// operand of the V_AND_B32_e64 instruction: +// (trunc i32 (srl i32 $a, i32 $b)) -> +// v_and_b32_e64 $a, (1 << $b), $a +// v_cmp_ne_u32_e64 $a, 0, $a + +// Handle the VALU case. 
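The rewrite described above is easy to sanity-check: extracting bit $b by shift-then-truncate agrees with masking by the IMMBitSelConst constant (1 << $b) and comparing against zero. A small standalone C++ check:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t B = 0; B < 32; ++B) {
    const uint32_t Mask = UINT32_C(1) << B; // what IMMBitSelConst produces
    for (uint32_t A : {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu, Mask, ~Mask}) {
      bool ShiftThenTrunc = ((A >> B) & 1u) != 0; // the three-instruction form
      bool MaskThenCompare = (A & Mask) != 0;     // the v_and + v_cmp_ne form
      assert(ShiftThenTrunc == MaskThenCompare);
    }
  }
  return 0;
}

The VALU pattern follows; the scalar case is handled after it.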
+def : GCNPat < + (i1 (DivergentUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))), + (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 (IMMBitSelConst $b)), $a), + (i32 0)) +>; + +// Handle the scalar case. +def : GCNPat < + (i1 (UniformUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))), + (S_CMP_LG_U32 (S_AND_B32 (i32 (IMMBitSelConst $b)), $a), + (i32 0)) +>; + def : GCNPat < (i1 (DivergentUnaryFrag<trunc> i64:$a)), (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), @@ -2350,6 +2484,11 @@ def : GCNPat < } +def : GCNPat< + (i64 (DivergentUnaryFrag<bitreverse> i64:$a)), + (REG_SEQUENCE VReg_64, + (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1))), sub0, + (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>; // Prefer selecting to max when legal, but using mul is always valid. let AddedComplexity = -5 in { @@ -2508,12 +2647,12 @@ def : GCNPat < >; def : GCNPat < - (v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))), + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 undef))), (COPY_TO_REGCLASS SReg_32:$src0, SReg_32) >; def : GCNPat < - (v2i16 (build_vector (i16 VGPR_32:$src0), (i16 undef))), + (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 undef))), (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32) >; @@ -2597,6 +2736,15 @@ def : GCNPat < >; } // End SubtargetPredicate = HasVOP3PInsts +// With multiple uses of the shift, this will duplicate the shift and +// increase register pressure. +let SubtargetPredicate = isGFX11Plus in +def : GCNPat < + (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), (i16 SReg_32:$src1))), + (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1)) +>; + + def : GCNPat < (v2f16 (scalar_to_vector f16:$src0)), (COPY $src0) @@ -2678,18 +2826,18 @@ def : GCNPat < // an inline immediate than -c. // TODO: Also do for 64-bit. def : GCNPat< - (add i32:$src0, (i32 NegSubInlineConst32:$src1)), + (UniformBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)), (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1) >; def : GCNPat< - (add i32:$src0, (i32 NegSubInlineConst32:$src1)), + (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)), (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { let SubtargetPredicate = HasAddNoCarryInsts; } def : GCNPat< - (add i32:$src0, (i32 NegSubInlineConst32:$src1)), + (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)), (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { let SubtargetPredicate = NotHasAddNoCarryInsts; } @@ -2703,20 +2851,21 @@ def : GCNPat< (S_MOV_B32 SReg_32:$src) >; -multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { +multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> { def : GCNPat < - (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), + (vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), (BFM $a, $b) >; def : GCNPat < - (vt (add (vt (shl 1, vt:$a)), -1)), - (BFM $a, (MOV (i32 0))) + (vt (ADD (vt (shl 1, vt:$a)), -1)), + (BFM $a, (i32 0)) >; } -defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; -// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>; +defm : BFMPatterns <i32, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B32>; +// FIXME: defm : BFMPatterns <i64, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B64>; +defm : BFMPatterns <i32, DivergentBinFrag<shl>, DivergentBinFrag<add>, V_BFM_B32_e64>; // Bitfield extract patterns @@ -3007,6 +3156,19 @@ def G_AMDGPU_CLAMP : AMDGPUGenericInstruction { let hasSideEffects = 0; } +// Integer multiply-add: arg0 * arg1 + arg2. 
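As a reference model for the unsigned flavor of this operation (the full operand list is spelled out in the comment that continues below): a 32 x 32 multiply accumulated into a 64-bit addend, producing a 64-bit result and a one-bit carry-out. A minimal C++ sketch, assuming host __int128 support:

#include <cassert>
#include <cstdint>
#include <utility>

// mad_u64_u32(a, b, c) = a * b + c. The carry fits in one bit because
// (2^32 - 1)^2 + (2^64 - 1) < 2^65.
static std::pair<uint64_t, bool> mad_u64_u32(uint32_t A, uint32_t B,
                                             uint64_t C) {
  unsigned __int128 Full = (unsigned __int128)A * B + C;
  return {(uint64_t)Full, (Full >> 64) != 0};
}

int main() {
  assert(mad_u64_u32(2, 3, 10).first == 16); // no wrap, so no carry
  assert(!mad_u64_u32(2, 3, 10).second);
  assert(mad_u64_u32(0xFFFFFFFFu, 0xFFFFFFFFu, ~UINT64_C(0)).second); // wraps
  return 0;
}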
+// +// arg0 and arg1 are 32-bit integers (interpreted as signed or unsigned), +// arg2 is a 64-bit integer. Result is a 64-bit integer and a 1-bit carry-out. +class G_AMDGPU_MAD_64_32 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst, type1:$carry_out); + let InOperandList = (ins type2:$arg0, type2:$arg1, type0:$arg2); + let hasSideEffects = 0; +} + +def G_AMDGPU_MAD_U64_U32 : G_AMDGPU_MAD_64_32; +def G_AMDGPU_MAD_I64_I32 : G_AMDGPU_MAD_64_32; + // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector // operand Expects a MachineMemOperand in addition to explicit // operands. @@ -3130,3 +3292,15 @@ def G_SI_CALL : AMDGPUGenericInstruction { // TODO: Should really base this on the call target let isConvergent = 1; } + +def G_FPTRUNC_ROUND_UPWARD : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$vdst); + let InOperandList = (ins type1:$src0); + let hasSideEffects = 0; +} + +def G_FPTRUNC_ROUND_DOWNWARD : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$vdst); + let InOperandList = (ins type1:$src0); + let hasSideEffects = 0; +} diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index 4fa8ec711134..47095ae22027 100644 --- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -72,16 +72,22 @@ static void generateEndPgm(MachineBasicBlock &MBB, bool IsPS = F.getCallingConv() == CallingConv::AMDGPU_PS; // Check if hardware has been configured to expect color or depth exports. - bool HasExports = - AMDGPU::getHasColorExport(F) || AMDGPU::getHasDepthExport(F); + bool HasColorExports = AMDGPU::getHasColorExport(F); + bool HasDepthExports = AMDGPU::getHasDepthExport(F); + bool HasExports = HasColorExports || HasDepthExports; // Prior to GFX10, hardware always expects at least one export for PS. bool MustExport = !AMDGPU::isGFX10Plus(TII->getSubtarget()); if (IsPS && (HasExports || MustExport)) { // Generate "null export" if hardware is expecting PS to export. + const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>(); + int Target = + ST.hasNullExportTarget() + ? AMDGPU::Exp::ET_NULL + : (HasColorExports ? AMDGPU::Exp::ET_MRT0 : AMDGPU::Exp::ET_MRTZ); BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE)) - .addImm(AMDGPU::Exp::ET_NULL) + .addImm(Target) .addReg(AMDGPU::VGPR0, RegState::Undef) .addReg(AMDGPU::VGPR0, RegState::Undef) .addReg(AMDGPU::VGPR0, RegState::Undef) diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 44bdbe37dec0..6d4e1d2c898b 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -79,6 +79,13 @@ enum InstClassEnum { MIMG, TBUFFER_LOAD, TBUFFER_STORE, + GLOBAL_LOAD_SADDR, + GLOBAL_STORE_SADDR, + FLAT_LOAD, + FLAT_STORE, + GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of + GLOBAL_STORE // any CombineInfo, they are only ever returned by + // getCommonInstClass. }; struct AddressRegs { @@ -86,6 +93,7 @@ struct AddressRegs { bool SBase = false; bool SRsrc = false; bool SOffset = false; + bool SAddr = false; bool VAddr = false; bool Addr = false; bool SSamp = false; @@ -160,6 +168,11 @@ class SILoadStoreOptimizer : public MachineFunctionPass { } void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); + + // Compare by pointer order. + bool operator<(const CombineInfo& Other) const { + return (InstClass == MIMG) ? 
DMask < Other.DMask : Offset < Other.Offset; + } }; struct BaseRegisters { @@ -185,6 +198,9 @@ private: AliasAnalysis *AA = nullptr; bool OptimizeAgain; + bool canSwapInstructions(const DenseSet<Register> &ARegDefs, + const DenseSet<Register> &ARegUses, + const MachineInstr &A, const MachineInstr &B) const; static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII, const CombineInfo &Paired); @@ -199,38 +215,43 @@ private: const CombineInfo &Paired); const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; - bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, - SmallVectorImpl<MachineInstr *> &InstsToMove); + CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); unsigned read2Opcode(unsigned EltSize) const; unsigned read2ST64Opcode(unsigned EltSize) const; - MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, - CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator + mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); unsigned write2Opcode(unsigned EltSize) const; unsigned write2ST64Opcode(unsigned EltSize) const; MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); + MachineBasicBlock::iterator + mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); + MachineBasicBlock::iterator + mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); void updateBaseAndOffset(MachineInstr &I, Register NewBase, int32_t NewOffset) const; @@ -252,6 +273,12 @@ private: MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, std::list<std::list<CombineInfo>> &MergeableInsts) const; + static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, + const CombineInfo &Paired); + + static InstClassEnum getCommonInstClass(const CombineInfo &CI, + const CombineInfo &Paired); + public: static char ID; @@ -298,10 +325,35 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { switch (Opc) { case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case 
AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_STORE_DWORD: return 1; case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX2: return 2; + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX3: + return 3; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX4: + case AMDGPU::FLAT_STORE_DWORDX4: return 4; case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return 8; @@ -386,11 +438,40 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64: case AMDGPU::DS_WRITE_B64_gfx9: return DS_WRITE; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + return FLAT_LOAD; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + return GLOBAL_LOAD_SADDR; + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX4: + return FLAT_STORE; + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + return GLOBAL_STORE_SADDR; } } /// Determines instruction subclass from opcode. Only instructions -/// of the same subclass can be merged together. +/// of the same subclass can be merged together. The merged instruction may have +/// a different subclass but must have the same class. 
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { switch (Opc) { default: @@ -418,9 +499,55 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + return AMDGPU::FLAT_LOAD_DWORD; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX4: + return AMDGPU::FLAT_STORE_DWORD; + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + return AMDGPU::GLOBAL_STORE_DWORD_SADDR; } } +// GLOBAL loads and stores are classified as FLAT initially. If both combined +// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE. +// If either or both instructions are non-segment-specific FLAT, the resulting +// combined operation will be FLAT, potentially promoting one of the GLOBAL +// operations to FLAT. +// For other instructions, return the original unmodified class. +InstClassEnum +SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI, + const CombineInfo &Paired) { + assert(CI.InstClass == Paired.InstClass); + + if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) && + SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I)) + return (CI.InstClass == FLAT_STORE) ?
GLOBAL_STORE : GLOBAL_LOAD; + + return CI.InstClass; +} + static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { AddressRegs Result; @@ -480,6 +607,34 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64_gfx9: Result.Addr = true; return Result; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + Result.SAddr = true; + LLVM_FALLTHROUGH; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX4: + Result.VAddr = true; + return Result; } } @@ -551,6 +706,9 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, if (Regs.SOffset) AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); + if (Regs.SAddr) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); if (Regs.VAddr) AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); @@ -579,92 +737,58 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() { return new SILoadStoreOptimizer(); } -static void moveInstsAfter(MachineBasicBlock::iterator I, - ArrayRef<MachineInstr *> InstsToMove) { - MachineBasicBlock *MBB = I->getParent(); - ++I; - for (MachineInstr *MI : InstsToMove) { - MI->removeFromParent(); - MBB->insert(I, MI); - } -} - static void addDefsUsesToList(const MachineInstr &MI, DenseSet<Register> &RegDefs, - DenseSet<Register> &PhysRegUses) { - for (const MachineOperand &Op : MI.operands()) { - if (Op.isReg()) { - if (Op.isDef()) - RegDefs.insert(Op.getReg()); - else if (Op.readsReg() && Op.getReg().isPhysical()) - PhysRegUses.insert(Op.getReg()); - } + DenseSet<Register> &RegUses) { + for (const auto &Op : MI.operands()) { + if (!Op.isReg()) + continue; + if (Op.isDef()) + RegDefs.insert(Op.getReg()); + if (Op.readsReg()) + RegUses.insert(Op.getReg()); } } -static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, - MachineBasicBlock::iterator B, - AliasAnalysis *AA) { - // RAW or WAR - cannot reorder - // WAW - cannot reorder - // RAR - safe to reorder - return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true); -} - -// Add MI and its defs to the lists if MI reads one of the defs that are -// already in the list. Returns true in that case. -static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs, - DenseSet<Register> &PhysRegUses, - SmallVectorImpl<MachineInstr *> &Insts) { - for (MachineOperand &Use : MI.operands()) { - // If one of the defs is read, then there is a use of Def between I and the - // instruction that I will potentially be merged with. We will need to move - // this instruction after the merged instructions. 
- // - // Similarly, if there is a def which is read by an instruction that is to - // be moved for merging, then we need to move the def-instruction as well. - // This can only happen for physical registers such as M0; virtual - // registers are in SSA form. - if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) || - (Use.isDef() && RegDefs.count(Use.getReg())) || - (Use.isDef() && Use.getReg().isPhysical() && - PhysRegUses.count(Use.getReg())))) { - Insts.push_back(&MI); - addDefsUsesToList(MI, RegDefs, PhysRegUses); - return true; - } - } - - return false; -} - -static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, - ArrayRef<MachineInstr *> InstsToMove, - AliasAnalysis *AA) { - assert(MemOp.mayLoadOrStore()); - - for (MachineInstr *InstToMove : InstsToMove) { - if (!InstToMove->mayLoadOrStore()) +bool SILoadStoreOptimizer::canSwapInstructions( + const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses, + const MachineInstr &A, const MachineInstr &B) const { + if (A.mayLoadOrStore() && B.mayLoadOrStore() && + (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true)) + return false; + for (const auto &BOp : B.operands()) { + if (!BOp.isReg()) continue; - if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA)) + if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg())) + return false; + if (BOp.isDef() && ARegUses.contains(BOp.getReg())) return false; } return true; } -// This function assumes that \p A and \p B have are identical except for -// size and offset, and they reference adjacent memory. -static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, - const MachineMemOperand *A, - const MachineMemOperand *B) { - unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); - unsigned Size = A->getSize() + B->getSize(); - // This function adds the offset parameter to the existing offset for A, - // so we pass 0 here as the offset and then manually set it to the correct - // value after the call. - MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); - MMO->setOffset(MinOffset); - return MMO; +// Given that \p CI and \p Paired are adjacent memory operations produce a new +// MMO for the combined operation with a new access size. +MachineMemOperand * +SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI, + const CombineInfo &Paired) { + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); + + unsigned Size = MMOa->getSize() + MMOb->getSize(); + + // A base pointer for the combined operation is the same as the leading + // operation's pointer. + if (Paired < CI) + std::swap(MMOa, MMOb); + + MachinePointerInfo PtrInfo(MMOa->getPointerInfo()); + // If merging FLAT and GLOBAL set address space to FLAT. 
+ if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) + PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS; + + MachineFunction *MF = CI.I->getMF(); + return MF->getMachineMemOperand(MMOa, PtrInfo, Size); } bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, @@ -787,8 +911,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { return (EltOffset0 + CI.Width == EltOffset1 || EltOffset1 + Paired.Width == EltOffset0) && - CI.CPol == Paired.CPol && - (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol); + CI.CPol == Paired.CPol; } // If the offset in elements doesn't fit in 8-bits, we might be able to use @@ -889,111 +1012,59 @@ SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { return nullptr; } -/// This function assumes that CI comes before Paired in a basic block. -bool SILoadStoreOptimizer::checkAndPrepareMerge( - CombineInfo &CI, CombineInfo &Paired, - SmallVectorImpl<MachineInstr *> &InstsToMove) { +/// This function assumes that CI comes before Paired in a basic block. Return +/// an insertion point for the merged instruction or nullptr on failure. +SILoadStoreOptimizer::CombineInfo * +SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, + CombineInfo &Paired) { + // If another instruction has already been merged into CI, it may now be a + // type that we can't do any further merging into. + if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) + return nullptr; + assert(CI.InstClass == Paired.InstClass); + + if (getInstSubclass(CI.I->getOpcode(), *TII) != + getInstSubclass(Paired.I->getOpcode(), *TII)) + return nullptr; // Check both offsets (or masks for MIMG) can be combined and fit in the // reduced range. - if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired)) - return false; - - if (CI.InstClass != MIMG && - (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))) - return false; - - const unsigned Opc = CI.I->getOpcode(); - const InstClassEnum InstClass = getInstClass(Opc, *TII); - - if (InstClass == UNKNOWN) { - return false; + if (CI.InstClass == MIMG) { + if (!dmasksCanBeCombined(CI, *TII, Paired)) + return nullptr; + } else { + if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) + return nullptr; } - const unsigned InstSubclass = getInstSubclass(Opc, *TII); - - DenseSet<Register> RegDefsToMove; - DenseSet<Register> PhysRegUsesToMove; - addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); - - MachineBasicBlock::iterator E = std::next(Paired.I); - MachineBasicBlock::iterator MBBI = std::next(CI.I); - MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); - for (; MBBI != E; ++MBBI) { - - if (MBBI == MBBE) { - // CombineInfo::Order is a hint on the instruction ordering within the - // basic block. This hint suggests that CI precedes Paired, which is - // true most of the time. However, moveInstsAfter() processing a - // previous list may have changed this order in a situation when it - // moves an instruction which exists in some other merge list. - // In this case it must be dependent. - return false; - } - if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) || - (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) { - // This is not a matching instruction, but we can keep looking as - // long as one of these conditions are met: - // 1. It is safe to move I down past MBBI. - // 2. It is safe to move MBBI down past the instruction that I will - // be merged into. 
- - if (MBBI->mayLoadOrStore() && - (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || - !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) { - // We fail condition #1, but we may still be able to satisfy condition - // #2. Add this instruction to the move list and then we will check - // if condition #2 holds once we have selected the matching instruction. - InstsToMove.push_back(&*MBBI); - addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove); - continue; - } - - // When we match I with another DS instruction we will be moving I down - // to the location of the matched instruction any uses of I will need to - // be moved down as well. - addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, - InstsToMove); - continue; + DenseSet<Register> RegDefs; + DenseSet<Register> RegUses; + CombineInfo *Where; + if (CI.I->mayLoad()) { + // Try to hoist Paired up to CI. + addDefsUsesToList(*Paired.I, RegDefs, RegUses); + for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { + if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) + return nullptr; } - - // Handle a case like - // DS_WRITE_B32 addr, v, idx0 - // w = DS_READ_B32 addr, idx0 - // DS_WRITE_B32 addr, f(w), idx1 - // where the DS_READ_B32 ends up in InstsToMove and therefore prevents - // merging of the two writes. - if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, - InstsToMove)) - continue; - - if (&*MBBI == &*Paired.I) { - // We need to go through the list of instructions that we plan to - // move and make sure they are all safe to move down past the merged - // instruction. - if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) { - - // Call offsetsCanBeCombined with modify = true so that the offsets are - // correct for the new instruction. This should return true, because - // this function should only be called on CombineInfo objects that - // have already been confirmed to be mergeable. - if (CI.InstClass != MIMG) - offsetsCanBeCombined(CI, *STM, Paired, true); - return true; - } - return false; + Where = &CI; + } else { + // Try to sink CI down to Paired. + addDefsUsesToList(*CI.I, RegDefs, RegUses); + for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { + if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) + return nullptr; } - - // We've found a load/store that we couldn't merge for some reason. - // We could potentially keep looking, but we'd need to make sure that - // it was safe to move I and also all the instruction in InstsToMove - // down past this instruction. - // check if we can move I across MBBI and if we can move all I's users - if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || - !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) - break; + Where = &Paired; } - return false; + + // Call offsetsCanBeCombined with modify = true so that the offsets are + // correct for the new instruction. This should return true, because + // this function should only be called on CombineInfo objects that + // have already been confirmed to be mergeable. 
+ if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) + offsetsCanBeCombined(CI, *STM, Paired, true); + return Where; } unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { @@ -1012,7 +1083,7 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird @@ -1051,13 +1122,13 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, unsigned BaseRegFlags = 0; if (CI.BaseOff) { Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) + TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) .addReg(ImmReg) .addReg(AddrReg->getReg(), 0, BaseSubReg) .addImm(0); // clamp bit @@ -1065,7 +1136,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, } MachineInstrBuilder Read2 = - BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) + BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 @@ -1077,14 +1148,12 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); // Copy to the old destination registers. - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1109,9 +1178,9 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { : AMDGPU::DS_WRITE2ST64_B64_gfx9; } -MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. 
We want to be @@ -1145,13 +1214,13 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, unsigned BaseRegFlags = 0; if (CI.BaseOff) { Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) + TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) .addReg(ImmReg) .addReg(AddrReg->getReg(), 0, BaseSubReg) .addImm(0); // clamp bit @@ -1159,7 +1228,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, } MachineInstrBuilder Write2 = - BuildMI(*MBB, Paired.I, DL, Write2Desc) + BuildMI(*MBB, InsertBefore, DL, Write2Desc) .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr .add(*Data0) // data0 .add(*Data1) // data1 @@ -1168,8 +1237,6 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, .addImm(0) // gds .cloneMergedMemRefs({&*CI.I, &*Paired.I}); - moveInstsAfter(Write2, InstsToMove); - CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1179,7 +1246,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1191,7 +1258,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, unsigned DMaskIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { if (I == DMaskIdx) MIB.addImm(MergedDMask); @@ -1204,10 +1271,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, // will return true if this is the case. assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - - MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); unsigned SubRegIdx0, SubRegIdx1; std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); @@ -1217,14 +1281,12 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. 
.addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1233,7 +1295,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1248,15 +1310,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( // will return true if this is the case. assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = - BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) - .addImm(MergedOffset) // offset - .addImm(CI.CPol) // cpol - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) + .addImm(MergedOffset) // offset + .addImm(CI.CPol) // cpol + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1267,14 +1326,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1283,7 +1340,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1295,7 +1352,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1307,9 +1364,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( // will return true if this is the case. 
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1317,7 +1371,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1328,14 +1382,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1344,7 +1396,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1356,7 +1408,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1371,9 +1423,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( // will return true if this is the case. 
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1382,8 +1431,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand( - combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1394,14 +1442,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1410,7 +1456,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1427,13 +1473,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) .add(*Src0) .addImm(SubRegIdx0) .add(*Src1) .addImm(SubRegIdx1); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1449,9 +1495,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( // will return true if this is the case. 
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1460,10 +1503,92 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand( - combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); + + CI.I->eraseFromParent(); + Paired.I->eraseFromParent(); + return New; +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); - moveInstsAfter(MIB, InstsToMove); + const unsigned Opcode = getNewOpcode(CI, Paired); + + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); + Register DestReg = MRI->createVirtualRegister(SuperRC); + + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); + + if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) + MIB.add(*SAddr); + + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) + .addImm(std::min(CI.Offset, Paired.Offset)) + .addImm(CI.CPol) + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); + + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); + + // Copy to the old destination registers. + const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); + const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); + const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); + + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest0) // Copy to same destination including flags and sub reg. + .addReg(DestReg, 0, SubRegIdx0); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); + + CI.I->eraseFromParent(); + Paired.I->eraseFromParent(); + return New; +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); + + const unsigned Opcode = getNewOpcode(CI, Paired); + + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); + + // Copy to the new source register. 
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); + Register SrcReg = MRI->createVirtualRegister(SuperRC); + + const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); + const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); + + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + .add(*Src0) + .addImm(SubRegIdx0) + .add(*Src1) + .addImm(SubRegIdx1); + + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) + .addReg(SrcReg, RegState::Kill); + + if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) + MIB.add(*SAddr); + + MachineInstr *New = + MIB.addImm(std::min(CI.Offset, Paired.Offset)) + .addImm(CI.CPol) + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1474,7 +1599,7 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired) { const unsigned Width = CI.Width + Paired.Width; - switch (CI.InstClass) { + switch (getCommonInstClass(CI, Paired)) { default: assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); // FIXME: Handle d16 correctly @@ -1498,6 +1623,72 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, case 8: return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; } + case GLOBAL_LOAD: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_LOAD_DWORDX2; + case 3: + return AMDGPU::GLOBAL_LOAD_DWORDX3; + case 4: + return AMDGPU::GLOBAL_LOAD_DWORDX4; + } + case GLOBAL_LOAD_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; + case 3: + return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; + case 4: + return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; + } + case GLOBAL_STORE: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_STORE_DWORDX2; + case 3: + return AMDGPU::GLOBAL_STORE_DWORDX3; + case 4: + return AMDGPU::GLOBAL_STORE_DWORDX4; + } + case GLOBAL_STORE_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; + case 3: + return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; + case 4: + return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; + } + case FLAT_LOAD: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::FLAT_LOAD_DWORDX2; + case 3: + return AMDGPU::FLAT_LOAD_DWORDX3; + case 4: + return AMDGPU::FLAT_LOAD_DWORDX4; + } + case FLAT_STORE: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::FLAT_STORE_DWORDX2; + case 3: + return AMDGPU::FLAT_STORE_DWORDX3; + case 4: + return AMDGPU::FLAT_STORE_DWORDX4; + } case MIMG: assert((countPopulation(CI.DMask | Paired.DMask) == Width) && "No overlaps"); @@ -1508,15 +1699,9 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, std::pair<unsigned, unsigned> SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { - bool ReverseOrder; - if (CI.InstClass == MIMG) { - assert( - (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && - "No overlaps"); - ReverseOrder = CI.DMask > Paired.DMask; - } else { - ReverseOrder = CI.Offset > Paired.Offset; - } + assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) == + CI.Width + Paired.Width)) && + "No overlaps"); unsigned Idx0; unsigned Idx1; @@ -1532,7 +1717,7 @@ SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, assert(CI.Width >= 1 && CI.Width <= 4); assert(Paired.Width >= 1 && Paired.Width <= 
4); - if (ReverseOrder) { + if (Paired < CI) { Idx1 = Idxs[0][Paired.Width - 1]; Idx0 = Idxs[Paired.Width][CI.Width - 1]; } else { @@ -1569,7 +1754,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1586,13 +1771,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) .add(*Src0) .addImm(SubRegIdx0) .add(*Src1) .addImm(SubRegIdx1); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1606,9 +1791,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( // will return true if this is the case. assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1616,9 +1798,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); - - moveInstsAfter(MIB, InstsToMove); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1846,7 +2026,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192 // has 13bit distance from &a + 4096. The heuristic considers &a + 8192 // as the new-base(anchor) because of the maximum distance which can - // accomodate more intermediate bases presumeably. + // accommodate more intermediate bases presumably. // // Step3: move (&a + 8192) above load1. Compute and promote offsets from // (&a + 8192) for load1, load2, load4. 
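The anchor choice described in Step2 is easier to see with concrete numbers. The following is a minimal standalone sketch of the idea behind the heuristic, not the pass's actual code: it assumes the signed 13-bit immediate range of [-4096, 4095] used for GFX9 global accesses, and the names fitsSigned13 and pickAnchor are invented for illustration.

#include <cstdint>
#include <vector>

// True if a byte distance fits the assumed signed 13-bit offset field.
static bool fitsSigned13(int64_t Dist) { return Dist >= -4096 && Dist <= 4095; }

// Pick the candidate offset (relative to the common base &a) that can reach
// the most other candidates with an immediate offset; every access it reaches
// can then reuse the anchor's base register instead of materializing its own.
// Offsets is assumed non-empty.
static int64_t pickAnchor(const std::vector<int64_t> &Offsets) {
  int64_t Anchor = Offsets.front();
  size_t BestReach = 0;
  for (int64_t Cand : Offsets) {
    size_t Reach = 0;
    for (int64_t O : Offsets)
      if (fitsSigned13(O - Cand))
        ++Reach;
    // On a tie, prefer the farthest candidate, mirroring the "maximum
    // distance" rule in the comment above.
    if (Reach > BestReach || (Reach == BestReach && Cand > Anchor)) {
      Anchor = Cand;
      BestReach = Reach;
    }
  }
  return Anchor;
}

For offsets {4096, 6144, 8192}, both 6144 and 8192 reach all three candidates and the farthest-candidate tie-break selects 8192; anchoring at 4096 would miss 8192, since a distance of +4096 exceeds the +4095 maximum.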
@@ -2098,8 +2278,8 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( CombineInfo &CI = *First; CombineInfo &Paired = *Second; - SmallVector<MachineInstr *, 8> InstsToMove; - if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) { + CombineInfo *Where = checkAndPrepareMerge(CI, Paired); + if (!Where) { ++I; continue; } @@ -2108,66 +2288,56 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); + MachineBasicBlock::iterator NewMI; switch (CI.InstClass) { default: llvm_unreachable("unknown InstClass"); break; - case DS_READ: { - MachineBasicBlock::iterator NewMI = - mergeRead2Pair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); + case DS_READ: + NewMI = mergeRead2Pair(CI, Paired, Where->I); break; - } - case DS_WRITE: { - MachineBasicBlock::iterator NewMI = - mergeWrite2Pair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); + case DS_WRITE: + NewMI = mergeWrite2Pair(CI, Paired, Where->I); break; - } - case S_BUFFER_LOAD_IMM: { - MachineBasicBlock::iterator NewMI = - mergeSBufferLoadImmPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 8; + case S_BUFFER_LOAD_IMM: + NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 8; break; - } - case BUFFER_LOAD: { - MachineBasicBlock::iterator NewMI = - mergeBufferLoadPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case BUFFER_LOAD: + NewMI = mergeBufferLoadPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case BUFFER_STORE: { - MachineBasicBlock::iterator NewMI = - mergeBufferStorePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case BUFFER_STORE: + NewMI = mergeBufferStorePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case MIMG: { - MachineBasicBlock::iterator NewMI = - mergeImagePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case MIMG: + NewMI = mergeImagePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case TBUFFER_LOAD: { - MachineBasicBlock::iterator NewMI = - mergeTBufferLoadPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case TBUFFER_LOAD: + NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case TBUFFER_STORE: { - MachineBasicBlock::iterator NewMI = - mergeTBufferStorePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case TBUFFER_STORE: + NewMI = mergeTBufferStorePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; + break; + case FLAT_LOAD: + case GLOBAL_LOAD: + case GLOBAL_LOAD_SADDR: + NewMI = mergeFlatLoadPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; + break; + case FLAT_STORE: + case GLOBAL_STORE: + case GLOBAL_STORE_SADDR: + NewMI = mergeFlatStorePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; } - } - CI.Order = Paired.Order; + CI.setMI(NewMI, *this); + CI.Order = Where->Order; if (I == Second) I = Next; diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index e1018bdfde46..607383ab8cde 
100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -509,8 +509,35 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec) .addReg(Exec) .add(MI.getOperand(0)); - if (LV) - LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *NewMI); + if (LV) { + LV->replaceKillInstruction(DataReg, MI, *NewMI); + + if (SplitBB != &MBB) { + // Track the set of registers defined in the split block so we don't + // accidentally add the original block to AliveBlocks. + DenseSet<Register> SplitDefs; + for (MachineInstr &X : *SplitBB) { + for (MachineOperand &Op : X.operands()) { + if (Op.isReg() && Op.isDef() && Op.getReg().isVirtual()) + SplitDefs.insert(Op.getReg()); + } + } + + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + Register Reg = Register::index2VirtReg(i); + LiveVariables::VarInfo &VI = LV->getVarInfo(Reg); + + if (VI.AliveBlocks.test(MBB.getNumber())) + VI.AliveBlocks.set(SplitBB->getNumber()); + else { + for (MachineInstr *Kill : VI.Kills) { + if (Kill->getParent() == SplitBB && !SplitDefs.contains(Reg)) + VI.AliveBlocks.set(MBB.getNumber()); + } + } + } + } + } LoweredEndCf.insert(NewMI); @@ -540,7 +567,7 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, return; // Make sure we do not modify exec between def and use. - // A copy with implcitly defined exec inserted earlier is an exclusion, it + // A copy with implicitly defined exec inserted earlier is an exclusion, it // does not really modify exec. for (auto I = Def->getIterator(); I != MI.getIterator(); ++I) if (I->modifiesRegister(AMDGPU::EXEC, TRI) && @@ -573,14 +600,14 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) { else return; Register Reg = MI.getOperand(OpToReplace).getReg(); - MI.RemoveOperand(OpToReplace); + MI.removeOperand(OpToReplace); MI.addOperand(Ops[UniqueOpndIdx]); if (MRI->use_empty(Reg)) MRI->getUniqueVRegDef(Reg)->eraseFromParent(); } void SILowerControlFlow::optimizeEndCf() { - // If the only instruction immediately following this END_CF is an another + // If the only instruction immediately following this END_CF is another // END_CF in the only successor we can avoid emitting exec mask restore here. 
if (!EnableOptimizeEndCf) return; @@ -865,6 +892,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { } } + bool Changed = false; MachineFunction::iterator NextBB; for (MachineFunction::iterator BI = MF.begin(); BI != MF.end(); BI = NextBB) { @@ -886,6 +914,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::SI_LOOP: case AMDGPU::SI_END_CF: SplitMBB = process(MI); + Changed = true; break; // FIXME: find a better place for this @@ -894,6 +923,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { lowerInitExec(MBB, MI); if (LIS) LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); + Changed = true; break; default: @@ -913,5 +943,5 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { LoweredIf.clear(); KillBlocks.clear(); - return true; + return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 672266f0c11e..5fb545b50228 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -79,9 +79,9 @@ public: } private: - void lowerCopiesFromI1(); - void lowerPhis(); - void lowerCopiesToI1(); + bool lowerCopiesFromI1(); + bool lowerPhis(); + bool lowerCopiesToI1(); bool isConstantLaneMask(Register Reg, bool &Val) const; void buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, @@ -473,15 +473,17 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { OrN2Op = AMDGPU::S_ORN2_B64; } - lowerCopiesFromI1(); - lowerPhis(); - lowerCopiesToI1(); + bool Changed = false; + Changed |= lowerCopiesFromI1(); + Changed |= lowerPhis(); + Changed |= lowerCopiesToI1(); + assert(Changed || ConstrainRegs.empty()); for (unsigned Reg : ConstrainRegs) MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); ConstrainRegs.clear(); - return true; + return Changed; } #ifndef NDEBUG @@ -493,7 +495,8 @@ static bool isVRegCompatibleReg(const SIRegisterInfo &TRI, } #endif -void SILowerI1Copies::lowerCopiesFromI1() { +bool SILowerI1Copies::lowerCopiesFromI1() { + bool Changed = false; SmallVector<MachineInstr *, 4> DeadCopies; for (MachineBasicBlock &MBB : *MF) { @@ -509,6 +512,8 @@ void SILowerI1Copies::lowerCopiesFromI1() { if (isLaneMaskReg(DstReg) || isVreg1(DstReg)) continue; + Changed = true; + // Copy into a 32-bit vector register. 
LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI); DebugLoc DL = MI.getDebugLoc(); @@ -530,9 +535,10 @@ void SILowerI1Copies::lowerCopiesFromI1() { MI->eraseFromParent(); DeadCopies.clear(); } + return Changed; } -void SILowerI1Copies::lowerPhis() { +bool SILowerI1Copies::lowerPhis() { MachineSSAUpdater SSAUpdater(*MF); LoopFinder LF(*DT, *PDT); PhiIncomingAnalysis PIA(*PDT); @@ -550,6 +556,8 @@ void SILowerI1Copies::lowerPhis() { Vreg1Phis.push_back(&MI); } } + if (Vreg1Phis.empty()) + return false; MachineBasicBlock *PrevMBB = nullptr; for (MachineInstr *MI : Vreg1Phis) { @@ -662,9 +670,11 @@ void SILowerI1Copies::lowerPhis() { IncomingRegs.clear(); IncomingUpdated.clear(); } + return true; } -void SILowerI1Copies::lowerCopiesToI1() { +bool SILowerI1Copies::lowerCopiesToI1() { + bool Changed = false; MachineSSAUpdater SSAUpdater(*MF); LoopFinder LF(*DT, *PDT); SmallVector<MachineInstr *, 4> DeadCopies; @@ -681,6 +691,8 @@ void SILowerI1Copies::lowerCopiesToI1() { if (!isVreg1(DstReg)) continue; + Changed = true; + if (MRI->use_empty(DstReg)) { DeadCopies.push_back(&MI); continue; @@ -731,6 +743,7 @@ void SILowerI1Copies::lowerCopiesToI1() { MI->eraseFromParent(); DeadCopies.clear(); } + return Changed; } bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const { diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 0fbdbef6fcce..dd881ec42d53 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -20,6 +20,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/InitializePasses.h" @@ -79,6 +80,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *RI = ST.getRegisterInfo(); MachineBasicBlock::iterator I = SaveBlock.begin(); if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) { @@ -89,8 +92,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, MCRegister Reg = CS.getReg(); MachineInstrSpan MIS(I, &SaveBlock); - const TargetRegisterClass *RC = - TRI->getMinimalPhysRegClass(Reg, MVT::i32); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass( + Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); // If this value was already livein, we probably have a direct use of the // incoming register value, so don't kill at the spill point. This happens @@ -119,7 +122,8 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *RI = ST.getRegisterInfo(); // Restore all registers immediately before the return and any // terminators that precede it. 
MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator(); @@ -128,8 +132,8 @@ if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) { for (const CalleeSavedInfo &CI : reverse(CSI)) { Register Reg = CI.getReg(); - const TargetRegisterClass *RC = - TRI->getMinimalPhysRegClass(Reg, MVT::i32); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass( + Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32); TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI); assert(I != RestoreBlock.begin() && @@ -321,7 +325,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // free frame index ids by the later pass(es) like "stack slot coloring" // which in turn could mess-up with the book keeping of "frame index to VGPR // lane". - FuncInfo->removeDeadFrameIndices(MFI); + FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false); MadeChange = true; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index cca8565c9ff9..0504c59ebd9e 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -31,6 +31,9 @@ using namespace llvm; SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), + BufferPSV(static_cast<const AMDGPUTargetMachine &>(MF.getTarget())), + ImagePSV(static_cast<const AMDGPUTargetMachine &>(MF.getTarget())), + GWSResourcePSV(static_cast<const AMDGPUTargetMachine &>(MF.getTarget())), PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), @@ -48,8 +51,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ImplicitBufferPtr(false), ImplicitArgPtr(false), GITPtrHigh(0xffffffff), - HighBitsOf32BitAddress(0), - GDSSize(0) { + HighBitsOf32BitAddress(0) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const Function &F = MF.getFunction(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); @@ -74,6 +76,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) PSInputAddr = AMDGPU::getInitialPSInputAddr(F); } + MayNeedAGPRs = ST.hasMAIInsts(); + if (!isEntryFunction()) { if (CC != CallingConv::AMDGPU_Gfx) ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; @@ -97,6 +101,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ImplicitArgPtr = false; MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(), MaxKernArgAlign); + + if (ST.hasGFX90AInsts() && + ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() && + !mayUseAGPRs(MF)) + MayNeedAGPRs = false; // We will select all MAI with VGPR operands. } bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); @@ -177,9 +186,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (!S.empty()) S.consumeInteger(0, HighBitsOf32BitAddress); - S = F.getFnAttribute("amdgpu-gds-size").getValueAsString(); - if (!S.empty()) - S.consumeInteger(0, GDSSize); + // On GFX908, in order to guarantee copying between AGPRs, we need a scratch + // VGPR available at all times. For now, reserve the highest available VGPR. After + // RA, shift it to the lowest available unused VGPR if one exists.
+ if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) { + VGPRForAGPRCopy = + AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1); + } +} + +MachineFunctionInfo *SIMachineFunctionInfo::clone( + BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) + const { + return DestMF.cloneInfo<SIMachineFunctionInfo>(*this); } void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) { @@ -265,7 +285,7 @@ bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF, /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, int FI) { - std::vector<SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI]; + std::vector<SIRegisterInfo::SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI]; // This has already been allocated. if (!SpillLanes.empty()) @@ -320,7 +340,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, SpillFI)); - // Add this register as live-in to all blocks to avoid machine verifer + // Add this register as live-in to all blocks to avoid machine verifier // complaining about use of an undefined physical register. for (MachineBasicBlock &BB : MF) BB.addLiveIn(LaneVGPR); @@ -328,7 +348,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, LaneVGPR = SpillVGPRs.back().VGPR; } - SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex)); + SpillLanes.push_back(SIRegisterInfo::SpilledReg(LaneVGPR, VGPRIndex)); } return true; @@ -402,7 +422,8 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF, return Spill.FullyAllocated; } -void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { +bool SIMachineFunctionInfo::removeDeadFrameIndices( + MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) { // Remove dead frame indices from function frame, however keep FP & BP since // spills for them haven't been inserted yet. And also make sure to remove the // frame indices from `SGPRToVGPRSpills` data structure, otherwise, it could @@ -415,17 +436,42 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { } } - // All other SPGRs must be allocated on the default stack, so reset the stack - // ID. - for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e; - ++i) - if (i != FramePointerSaveIndex && i != BasePointerSaveIndex) - MFI.setStackID(i, TargetStackID::Default); + bool HaveSGPRToMemory = false; + + if (ResetSGPRSpillStackIDs) { + // All other SGPRs must be allocated on the default stack, so reset the + // stack ID.
+ for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e; + ++i) { + if (i != FramePointerSaveIndex && i != BasePointerSaveIndex) { + if (MFI.getStackID(i) == TargetStackID::SGPRSpill) { + MFI.setStackID(i, TargetStackID::Default); + HaveSGPRToMemory = true; + } + } + } + } for (auto &R : VGPRToAGPRSpills) { if (R.second.IsDead) MFI.RemoveStackObject(R.first); } + + return HaveSGPRToMemory; +} + +void SIMachineFunctionInfo::allocateWWMReservedSpillSlots( + MachineFrameInfo &MFI, const SIRegisterInfo &TRI) { + assert(WWMReservedFrameIndexes.empty()); + + WWMReservedFrameIndexes.resize(WWMReservedRegs.size()); + + int I = 0; + for (Register VGPR : WWMReservedRegs) { + const TargetRegisterClass *RC = TRI.getPhysRegClass(VGPR); + WWMReservedFrameIndexes[I++] = MFI.CreateSpillStackObject( + TRI.getSpillSize(*RC), TRI.getSpillAlign(*RC)); + } } int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI, @@ -539,6 +585,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( const llvm::MachineFunction &MF) : ExplicitKernArgSize(MFI.getExplicitKernArgSize()), MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()), + GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()), NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()), MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), @@ -549,7 +596,14 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)), StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)), + BytesInStackArgArea(MFI.getBytesInStackArgArea()), + ReturnsVoid(MFI.returnsVoid()), ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), Mode(MFI.getMode()) { + for (Register Reg : MFI.WWMReservedRegs) + WWMReservedRegs.push_back(regToString(Reg, TRI)); + + if (MFI.getVGPRForAGPRCopy()) + VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI); auto SFI = MFI.getOptionalScavengeFI(); if (SFI) ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo()); @@ -563,8 +617,9 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) { ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize; - MaxKernArgAlign = assumeAligned(YamlMFI.MaxKernArgAlign); + MaxKernArgAlign = YamlMFI.MaxKernArgAlign; LDSSize = YamlMFI.LDSSize; + GDSSize = YamlMFI.GDSSize; DynLDSAlign = YamlMFI.DynLDSAlign; HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress; Occupancy = YamlMFI.Occupancy; @@ -574,6 +629,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( WaveLimiter = YamlMFI.WaveLimiter; HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs; HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs; + BytesInStackArgArea = YamlMFI.BytesInStackArgArea; + ReturnsVoid = YamlMFI.ReturnsVoid; if (YamlMFI.ScavengeFI) { auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo()); @@ -595,10 +652,47 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( return false; } +bool SIMachineFunctionInfo::mayUseAGPRs(const MachineFunction &MF) const { + for (const BasicBlock &BB : MF.getFunction()) { + for (const Instruction &I : BB) { + const auto *CB = dyn_cast<CallBase>(&I); + if (!CB) + continue; + + if (CB->isInlineAsm()) { + const InlineAsm *IA = dyn_cast<InlineAsm>(CB->getCalledOperand()); + for (const auto &CI : IA->ParseConstraints()) { + for (StringRef Code : CI.Codes) { + 
Code.consume_front("{"); + if (Code.startswith("a")) + return true; + } + } + continue; + } + + const Function *Callee = + dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts()); + if (!Callee) + return true; + + if (Callee->getIntrinsicID() == Intrinsic::not_intrinsic) + return true; + } + } + + return false; +} + bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const { if (UsesAGPRs) return *UsesAGPRs; + if (!mayNeedAGPRs()) { + UsesAGPRs = false; + return false; + } + if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) || MF.getFrameInfo().hasCalls()) { UsesAGPRs = true; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 8e821274bb77..bebb13cbf09f 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -15,9 +15,10 @@ #include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUMachineFunction.h" +#include "AMDGPUTargetMachine.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" -#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Support/raw_ostream.h" @@ -39,8 +40,8 @@ public: }; protected: - AMDGPUPseudoSourceValue(unsigned Kind, const TargetInstrInfo &TII) - : PseudoSourceValue(Kind, TII) {} + AMDGPUPseudoSourceValue(unsigned Kind, const AMDGPUTargetMachine &TM) + : PseudoSourceValue(Kind, TM) {} public: bool isConstant(const MachineFrameInfo *) const override { @@ -60,8 +61,8 @@ public: class AMDGPUBufferPseudoSourceValue final : public AMDGPUPseudoSourceValue { public: - explicit AMDGPUBufferPseudoSourceValue(const TargetInstrInfo &TII) - : AMDGPUPseudoSourceValue(PSVBuffer, TII) {} + explicit AMDGPUBufferPseudoSourceValue(const AMDGPUTargetMachine &TM) + : AMDGPUPseudoSourceValue(PSVBuffer, TM) {} static bool classof(const PseudoSourceValue *V) { return V->kind() == PSVBuffer; @@ -73,8 +74,8 @@ public: class AMDGPUImagePseudoSourceValue final : public AMDGPUPseudoSourceValue { public: // TODO: Is the img rsrc useful? - explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) - : AMDGPUPseudoSourceValue(PSVImage, TII) {} + explicit AMDGPUImagePseudoSourceValue(const AMDGPUTargetMachine &TM) + : AMDGPUPseudoSourceValue(PSVImage, TM) {} static bool classof(const PseudoSourceValue *V) { return V->kind() == PSVImage; @@ -85,8 +86,8 @@ public: class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue { public: - explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII) - : AMDGPUPseudoSourceValue(GWSResource, TII) {} + explicit AMDGPUGWSResourcePseudoSourceValue(const AMDGPUTargetMachine &TM) + : AMDGPUPseudoSourceValue(GWSResource, TM) {} static bool classof(const PseudoSourceValue *V) { return V->kind() == GWSResource; @@ -269,8 +270,9 @@ template <> struct MappingTraits<SIMode> { struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { uint64_t ExplicitKernArgSize = 0; - unsigned MaxKernArgAlign = 0; - unsigned LDSSize = 0; + Align MaxKernArgAlign; + uint32_t LDSSize = 0; + uint32_t GDSSize = 0; Align DynLDSAlign; bool IsEntryFunction = false; bool NoSignedZerosFPMath = false; @@ -283,13 +285,19 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { // TODO: 10 may be a better default since it's the maximum. 
unsigned Occupancy = 0; + SmallVector<StringValue> WWMReservedRegs; + StringValue ScratchRSrcReg = "$private_rsrc_reg"; StringValue FrameOffsetReg = "$fp_reg"; StringValue StackPtrOffsetReg = "$sp_reg"; + unsigned BytesInStackArgArea = 0; + bool ReturnsVoid = true; + Optional<SIArgumentInfo> ArgInfo; SIMode Mode; Optional<FrameIndex> ScavengeFI; + StringValue VGPRForAGPRCopy; SIMachineFunctionInfo() = default; SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &, @@ -304,8 +312,9 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { static void mapping(IO &YamlIO, SIMachineFunctionInfo &MFI) { YamlIO.mapOptional("explicitKernArgSize", MFI.ExplicitKernArgSize, UINT64_C(0)); - YamlIO.mapOptional("maxKernArgAlign", MFI.MaxKernArgAlign, 0u); + YamlIO.mapOptional("maxKernArgAlign", MFI.MaxKernArgAlign); YamlIO.mapOptional("ldsSize", MFI.LDSSize, 0u); + YamlIO.mapOptional("gdsSize", MFI.GDSSize, 0u); YamlIO.mapOptional("dynLDSAlign", MFI.DynLDSAlign, Align()); YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false); YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false); @@ -319,12 +328,17 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { StringValue("$fp_reg")); YamlIO.mapOptional("stackPtrOffsetReg", MFI.StackPtrOffsetReg, StringValue("$sp_reg")); + YamlIO.mapOptional("bytesInStackArgArea", MFI.BytesInStackArgArea, 0u); + YamlIO.mapOptional("returnsVoid", MFI.ReturnsVoid, true); YamlIO.mapOptional("argumentInfo", MFI.ArgInfo); YamlIO.mapOptional("mode", MFI.Mode, SIMode()); YamlIO.mapOptional("highBitsOf32BitAddress", MFI.HighBitsOf32BitAddress, 0u); YamlIO.mapOptional("occupancy", MFI.Occupancy, 0); + YamlIO.mapOptional("wwmReservedRegs", MFI.WWMReservedRegs); YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI); + YamlIO.mapOptional("vgprForAGPRCopy", MFI.VGPRForAGPRCopy, + StringValue()); // Don't print out when it's empty. } }; @@ -335,8 +349,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { class SIMachineFunctionInfo final : public AMDGPUMachineFunction { friend class GCNTargetMachine; - Register TIDReg = AMDGPU::NoRegister; - // Registers that may be reserved for spilling purposes. These may be the same // as the input registers. Register ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG; @@ -377,12 +389,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { // unit. Minimum - first, maximum - second. std::pair<unsigned, unsigned> WavesPerEU = {0, 0}; - std::unique_ptr<const AMDGPUBufferPseudoSourceValue> BufferPSV; - std::unique_ptr<const AMDGPUImagePseudoSourceValue> ImagePSV; - std::unique_ptr<const AMDGPUGWSResourcePseudoSourceValue> GWSResourcePSV; + const AMDGPUBufferPseudoSourceValue BufferPSV; + const AMDGPUImagePseudoSourceValue ImagePSV; + const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV; private: - unsigned LDSWaveSpillSize = 0; unsigned NumUserSGPRs = 0; unsigned NumSystemSGPRs = 0; @@ -422,13 +433,14 @@ private: // user arguments. This is an offset from the KernargSegmentPtr. bool ImplicitArgPtr : 1; + bool MayNeedAGPRs : 1; + // The hard-wired high half of the address of the global information table // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since // current hardware only allows a 16 bit value. unsigned GITPtrHigh; unsigned HighBitsOf32BitAddress; - unsigned GDSSize; // Current recorded maximum possible occupancy. 
unsigned Occupancy; @@ -440,17 +452,6 @@ private: MCPhysReg getNextSystemSGPR() const; public: - struct SpilledReg { - Register VGPR; - int Lane = -1; - - SpilledReg() = default; - SpilledReg(Register R, int L) : VGPR (R), Lane (L) {} - - bool hasLane() { return Lane != -1;} - bool hasReg() { return VGPR != 0;} - }; - struct SGPRSpillVGPR { // VGPR used for SGPR spills Register VGPR; @@ -468,14 +469,28 @@ public: bool IsDead = false; }; - // Map WWM VGPR to a stack slot that is used to save/restore it in the - // prolog/epilog. - MapVector<Register, Optional<int>> WWMReservedRegs; + // Track VGPRs reserved for WWM. + SmallSetVector<Register, 8> WWMReservedRegs; + + /// Track stack slots used for save/restore of reserved WWM VGPRs in the + /// prolog/epilog. + + /// FIXME: This is temporary state only needed in PrologEpilogInserter, and + /// doesn't really belong here. It does not require serialization + SmallVector<int, 8> WWMReservedFrameIndexes; + + void allocateWWMReservedSpillSlots(MachineFrameInfo &MFI, + const SIRegisterInfo &TRI); + + auto wwmAllocation() const { + assert(WWMReservedRegs.size() == WWMReservedFrameIndexes.size()); + return zip(WWMReservedRegs, WWMReservedFrameIndexes); + } private: // Track VGPR + wave index for each subregister of the SGPR spilled to // frameindex key. - DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills; + DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> SGPRToVGPRSpills; unsigned NumVGPRSpillLanes = 0; SmallVector<SGPRSpillVGPR, 2> SpillVGPRs; @@ -491,6 +506,18 @@ private: // frame, so save it here and add it to the RegScavenger later. Optional<int> ScavengeFI; +private: + Register VGPRForAGPRCopy; + +public: + Register getVGPRForAGPRCopy() const { + return VGPRForAGPRCopy; + } + + void setVGPRForAGPRCopy(Register NewVGPRForAGPRCopy) { + VGPRForAGPRCopy = NewVGPRForAGPRCopy; + } + public: // FIXME /// If this is set, an SGPR used for save/restore of the register used for the /// frame pointer. @@ -506,31 +533,32 @@ public: // FIXME public: SIMachineFunctionInfo(const MachineFunction &MF); + SIMachineFunctionInfo(const SIMachineFunctionInfo &MFI) = default; + + MachineFunctionInfo * + clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, + const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB) + const override; bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange); - void reserveWWMRegister(Register Reg, Optional<int> FI) { - WWMReservedRegs.insert(std::make_pair(Reg, FI)); + void reserveWWMRegister(Register Reg) { + WWMReservedRegs.insert(Reg); } - ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const { + ArrayRef<SIRegisterInfo::SpilledReg> + getSGPRToVGPRSpills(int FrameIndex) const { auto I = SGPRToVGPRSpills.find(FrameIndex); - return (I == SGPRToVGPRSpills.end()) ? - ArrayRef<SpilledReg>() : makeArrayRef(I->second); + return (I == SGPRToVGPRSpills.end()) + ? 
ArrayRef<SIRegisterInfo::SpilledReg>() + : makeArrayRef(I->second); } ArrayRef<SGPRSpillVGPR> getSGPRSpillVGPRs() const { return SpillVGPRs; } - void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) { - SpillVGPRs[Index].VGPR = NewVGPR; - SpillVGPRs[Index].FI = newFI; - } - - bool removeVGPRForSGPRSpill(Register ReservedVGPR, MachineFunction &MF); - ArrayRef<MCPhysReg> getAGPRSpillVGPRs() const { return SpillAGPR; } @@ -555,15 +583,15 @@ public: unsigned NumLane) const; bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); - void removeDeadFrameIndices(MachineFrameInfo &MFI); + + /// If \p ResetSGPRSpillStackIDs is true, reset the stack ID from sgpr-spill + /// to the default stack. + bool removeDeadFrameIndices(MachineFrameInfo &MFI, + bool ResetSGPRSpillStackIDs); int getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI); Optional<int> getOptionalScavengeFI() const { return ScavengeFI; } - bool hasCalculatedTID() const { return TIDReg != 0; }; - Register getTIDReg() const { return TIDReg; }; - void setTIDReg(Register Reg) { TIDReg = Reg; } - unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } @@ -581,6 +609,13 @@ public: Register addFlatScratchInit(const SIRegisterInfo &TRI); Register addImplicitBufferPtr(const SIRegisterInfo &TRI); + /// Increment user SGPRs used for padding the argument list only. + Register addReservedUserSGPR() { + Register Next = getNextUserSGPR(); + ++NumUserSGPRs; + return Next; + } + // Add system SGPRs. Register addWorkGroupIDX() { ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR()); @@ -722,10 +757,6 @@ public: return HighBitsOf32BitAddress; } - unsigned getGDSSize() const { - return GDSSize; - } - unsigned getNumUserSGPRs() const { return NumUserSGPRs; } @@ -903,31 +934,19 @@ public: llvm_unreachable("unexpected dimension"); } - unsigned getLDSWaveSpillSize() const { - return LDSWaveSpillSize; + const AMDGPUBufferPseudoSourceValue * + getBufferPSV(const AMDGPUTargetMachine &TM) { + return &BufferPSV; } - const AMDGPUBufferPseudoSourceValue *getBufferPSV(const SIInstrInfo &TII) { - if (!BufferPSV) - BufferPSV = std::make_unique<AMDGPUBufferPseudoSourceValue>(TII); - - return BufferPSV.get(); - } - - const AMDGPUImagePseudoSourceValue *getImagePSV(const SIInstrInfo &TII) { - if (!ImagePSV) - ImagePSV = std::make_unique<AMDGPUImagePseudoSourceValue>(TII); - - return ImagePSV.get(); + const AMDGPUImagePseudoSourceValue * + getImagePSV(const AMDGPUTargetMachine &TM) { + return &ImagePSV; } - const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) { - if (!GWSResourcePSV) { - GWSResourcePSV = - std::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII); - } - - return GWSResourcePSV.get(); + const AMDGPUGWSResourcePseudoSourceValue * + getGWSPSV(const AMDGPUTargetMachine &TM) { + return &GWSResourcePSV; } unsigned getOccupancy() const { @@ -953,6 +972,14 @@ public: limitOccupancy(MF); } + bool mayNeedAGPRs() const { + return MayNeedAGPRs; + } + + // \returns true if a function has a use of AGPRs via inline asm or + // has a call which may use it. + bool mayUseAGPRs(const MachineFunction &MF) const; + // \returns true if a function needs or may need AGPRs. 
bool usesAGPRs(const MachineFunction &MF) const; }; diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 81db66a98ddf..e426e938b856 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -64,7 +64,7 @@ using namespace llvm; // First the instructions are put into blocks. // We want the blocks help control register usage and hide high latencies // later. To help control register usage, we typically want all local -// computations, when for example you create a result that can be comsummed +// computations, when for example you create a result that can be consumed // right away, to be contained in a block. Block inputs and outputs would // typically be important results that are needed in several locations of // the shader. Since we do want blocks to help hide high latencies, we want @@ -90,8 +90,8 @@ using namespace llvm; // Increasing the number of active wavefronts helps hide the former, but it // doesn't solve the latter, thus why even if wavefront count is high, we have // to try have as many instructions hiding high latencies as possible. -// The OpenCL doc says for example latency of 400 cycles for a global mem access, -// which is hidden by 10 instructions if the wavefront count is 10. +// The OpenCL doc says for example latency of 400 cycles for a global mem +// access, which is hidden by 10 instructions if the wavefront count is 10. // Some figures taken from AMD docs: // Both texture and constant L1 caches are 4-way associative with 64 bytes @@ -353,7 +353,7 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // able to correctly handle 5 vs 6, 2 vs 3. // (Note: This is not sufficient for RPTracker to not do mistakes for case 4) // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7 - // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7 + // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7 // The use of findDefBetween removes the case 4. for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { Register Reg = RegMaskPair.RegUnit; @@ -402,7 +402,7 @@ void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock, nodeScheduled(SU); } - // TODO: compute InternalAdditionnalPressure. + // TODO: compute InternalAdditionalPressure. InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size()); // Check everything is right. @@ -696,7 +696,7 @@ void SIScheduleBlockCreator::colorHighLatenciesGroups() { bool HasSubGraph; std::vector<int> SubGraph; // By construction (topological order), if SU and - // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary + // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessary // in the parent graph of SU. #ifndef NDEBUG SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], @@ -1123,36 +1123,26 @@ void SIScheduleBlockCreator::colorExports() { for (unsigned SUNum : DAG->TopDownIndex2SU) { const SUnit &SU = DAG->SUnits[SUNum]; if (SIInstrInfo::isEXP(*SU.getInstr())) { - // Check the EXP can be added to the group safely, - // ie without needing any other instruction. - // The EXP is allowed to depend on other EXP - // (they will be in the same group). - for (unsigned j : ExpGroup) { - bool HasSubGraph; - std::vector<int> SubGraph; - // By construction (topological order), if SU and - // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary - // in the parent graph of SU. 
-#ifndef NDEBUG - SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], - HasSubGraph); - assert(!HasSubGraph); -#endif - SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, - HasSubGraph); - if (!HasSubGraph) - continue; // No dependencies between each other + // SU is an export instruction. Check whether one of its successor + // dependencies is a non-export, in which case we skip export grouping. + for (const SDep &SuccDep : SU.Succs) { + const SUnit *SuccSU = SuccDep.getSUnit(); + if (SuccDep.isWeak() || SuccSU->NodeNum >= DAG->SUnits.size()) { + // Ignore these dependencies. + continue; + } + assert(SuccSU->isInstr() && + "SUnit unexpectedly not representing an instruction!"); - // SubGraph contains all the instructions required - // between EXP SUnits[j] and EXP SU. - for (unsigned k : SubGraph) { - if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr())) - // Other instructions than EXP would be required in the group. - // Abort the groupping. - return; + if (!SIInstrInfo::isEXP(*SuccSU->getInstr())) { + // A non-export depends on us. Skip export grouping. + // Note that this is a bit pessimistic: We could still group all other + // exports that are not depended on by non-exports, directly or + // indirectly. Simply skipping this particular export but grouping all + // others would not account for indirect dependencies. + return; } } - ExpGroup.push_back(SUNum); } } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index fff4f6729c99..8a66213931ff 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -19,6 +19,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/TargetParser.h" @@ -63,7 +64,7 @@ enum class SIAtomicScope { }; /// The distinct address spaces supported by the AMDGPU target for -/// atomic memory operation. Can be ORed toether. +/// atomic memory operation. Can be ORed together. enum class SIAtomicAddrSpace { NONE = 0u, GLOBAL = 1u << 0, @@ -459,6 +460,56 @@ public: Position Pos) const override; }; +class SIGfx940CacheControl : public SIGfx90ACacheControl { +protected: + + /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. + bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit(MI, AMDGPU::CPol::SC0); + } + + /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. + bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit(MI, AMDGPU::CPol::SC1); + } + + /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. 
+ bool enableNTBit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit(MI, AMDGPU::CPol::NT); + } + +public: + + SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, + bool IsNonTemporal) const override; + + bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, Position Pos) const override; + + bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, + Position Pos) const override; +}; + class SIGfx10CacheControl : public SIGfx7CacheControl { protected: @@ -494,6 +545,20 @@ public: Position Pos) const override; }; +class SIGfx11CacheControl : public SIGfx10CacheControl { +public: + SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {} + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, + bool IsNonTemporal) const override; +}; + class SIMemoryLegalizer final : public MachineFunctionPass { private: @@ -649,7 +714,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( return None; } - SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID(); + SSID = *IsSyncScopeInclusion ? 
SSID : MMO->getSyncScopeID(); Ordering = getMergedAtomicOrdering(Ordering, OpOrdering); assert(MMO->getFailureOrdering() != AtomicOrdering::Release && MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); @@ -668,7 +733,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( return None; } std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = - ScopeOrNone.getValue(); + *ScopeOrNone; if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) || ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) { @@ -730,7 +795,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo( SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; bool IsCrossAddressSpaceOrdering = false; std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = - ScopeOrNone.getValue(); + *ScopeOrNone; if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { @@ -775,13 +840,17 @@ bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, /* static */ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); + if (ST.hasGFX940Insts()) + return std::make_unique<SIGfx940CacheControl>(ST); if (ST.hasGFX90AInsts()) return std::make_unique<SIGfx90ACacheControl>(ST); if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) return std::make_unique<SIGfx6CacheControl>(ST); if (Generation < AMDGPUSubtarget::GFX10) return std::make_unique<SIGfx7CacheControl>(ST); - return std::make_unique<SIGfx10CacheControl>(ST); + if (Generation < AMDGPUSubtarget::GFX11) + return std::make_unique<SIGfx10CacheControl>(ST); + return std::make_unique<SIGfx11CacheControl>(ST); } bool SIGfx6CacheControl::enableLoadCacheBypass( @@ -943,7 +1012,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: // The LDS keeps all memory operations in order for - // the same wavesfront. + // the same wavefront. break; default: llvm_unreachable("Unsupported synchronization scope"); @@ -1360,7 +1429,9 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, // to initiate writeback of any dirty cache lines of earlier writes by the // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the // writeback has completed. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) + // Set SC bits to indicate system scope. + .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT // vmcnt(0)" needed by the "BUFFER_WBL2". Changed = true; @@ -1386,6 +1457,308 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, return Changed; } +bool SIGfx940CacheControl::enableLoadCacheBypass( + const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && !MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Set SC bits to indicate system scope. + Changed |= enableSC0Bit(MI); + Changed |= enableSC1Bit(MI); + break; + case SIAtomicScope::AGENT: + // Set SC bits to indicate agent scope. 
+      Changed |= enableSC1Bit(MI);
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In threadgroup split mode the waves of a work-group can be executing on
+      // different CUs. Therefore need to bypass the L1 which is per CU.
+      // Otherwise in non-threadgroup split mode all waves of a work-group are
+      // on the same CU, and so the L1 does not need to be bypassed. Setting SC
+      // bits to indicate work-group scope will do this automatically.
+      Changed |= enableSC0Bit(MI);
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // Leave SC bits unset to indicate wavefront scope.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory caches
+  /// to be bypassed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  return Changed;
+}
+
+bool SIGfx940CacheControl::enableStoreCacheBypass(
+    const MachineBasicBlock::iterator &MI,
+    SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
+  assert(!MI->mayLoad() && MI->mayStore());
+  bool Changed = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+      // Set SC bits to indicate system scope.
+      Changed |= enableSC0Bit(MI);
+      Changed |= enableSC1Bit(MI);
+      break;
+    case SIAtomicScope::AGENT:
+      // Set SC bits to indicate agent scope.
+      Changed |= enableSC1Bit(MI);
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // Set SC bits to indicate workgroup scope.
+      Changed |= enableSC0Bit(MI);
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // Leave SC bits unset to indicate wavefront scope.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory caches
+  /// to be bypassed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  return Changed;
+}
+
+bool SIGfx940CacheControl::enableRMWCacheBypass(
+    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+    SIAtomicAddrSpace AddrSpace) const {
+  assert(MI->mayLoad() && MI->mayStore());
+  bool Changed = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+      // Set SC1 bit to indicate system scope.
+      Changed |= enableSC1Bit(MI);
+      break;
+    case SIAtomicScope::AGENT:
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // RMW atomic operations implicitly bypass the L1 cache and only use SC1
+      // to indicate system or agent scope. The SC0 bit is used to indicate if
+      // they are return or no-return. Leave SC1 bit unset to indicate agent
+      // scope.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  return Changed;
+}
+
+bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
+    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+    bool IsVolatile, bool IsNonTemporal) const {
+  // Only handle load and store, not atomic read-modify-write instructions. The
+  // latter use glc to indicate if the atomic returns a result and so must not
+  // be used for cache control.
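+  //
+  // (Recap for readability; the switch statements above remain authoritative.)
+  // The SC-bit scope encoding used by the load/store bypass helpers above is:
+  //   SYSTEM                  -> SC0 | SC1
+  //   AGENT                   -> SC1
+  //   WORKGROUP               -> SC0
+  //   WAVEFRONT/SINGLETHREAD  -> no SC bits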
+  assert(MI->mayLoad() ^ MI->mayStore());
+
+  // Only update load and store, not LLVM IR atomic read-modify-write
+  // instructions. The latter are always marked as volatile, so we cannot
+  // sensibly handle them here without pessimizing all atomics. They also do
+  // not support the nontemporal attribute.
+  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+  bool Changed = false;
+
+  if (IsVolatile) {
+    // Set SC bits to indicate system scope.
+    Changed |= enableSC0Bit(MI);
+    Changed |= enableSC1Bit(MI);
+
+    // Ensure operation has completed at system scope to cause all volatile
+    // operations to be visible outside the program in a global order. Do not
+    // request cross address space as only the global address space can be
+    // observable outside the program, so no need to cause a waitcnt for LDS
+    // address space operations.
+    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+                          Position::AFTER);
+
+    return Changed;
+  }
+
+  if (IsNonTemporal) {
+    Changed |= enableNTBit(MI);
+    return Changed;
+  }
+
+  return Changed;
+}
+
+bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+                                         SIAtomicScope Scope,
+                                         SIAtomicAddrSpace AddrSpace,
+                                         Position Pos) const {
+  if (!InsertCacheInv)
+    return false;
+
+  bool Changed = false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+      // Ensures that following loads will not see stale remote VMEM data or
+      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
+      // CC will never be stale due to the local memory probes.
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+          // Set SC bits to indicate system scope.
+          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+      // hardware does not reorder memory operations by the same wave with
+      // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
+      // remove any cache lines of earlier writes by the same wave and ensures
+      // later reads by the same wave will refetch the cache lines.
+      Changed = true;
+      break;
+    case SIAtomicScope::AGENT:
+      // Ensures that following loads will not see stale remote data or local
+      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
+      // due to the memory probes.
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+          // Set SC bits to indicate agent scope.
+          .addImm(AMDGPU::CPol::SC1);
+      // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+      // does not reorder memory operations with respect to a preceding buffer
+      // invalidate. The invalidate is guaranteed to remove any cache lines of
+      // earlier writes and ensures later reads will refetch the cache lines.
+      Changed = true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In threadgroup split mode the waves of a work-group can be executing on
+      // different CUs. Therefore need to invalidate the L1 which is per CU.
+      // Otherwise in non-threadgroup split mode all waves of a work-group are
+      // on the same CU, and so the L1 does not need to be invalidated.
+      if (ST.isTgSplitEnabled()) {
+        // Ensures L1 is invalidated if in threadgroup split mode. In
+        // non-threadgroup split mode it is a NOP, but there is no point
+        // generating it in that case if we know we are not in that mode.
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+            // Set SC bits to indicate work-group scope.
+            .addImm(AMDGPU::CPol::SC0);
+        // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+        // does not reorder memory operations with respect to a preceding buffer
+        // invalidate. The invalidate is guaranteed to remove any cache lines of
+        // earlier writes and ensures later reads will refetch the cache lines.
+        Changed = true;
+      }
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // Could generate "BUFFER_INV" but it would do nothing as there are no
+      // caches to invalidate.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory cache
+  /// to be flushed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  return Changed;
+}
+
+bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
+                                         SIAtomicScope Scope,
+                                         SIAtomicAddrSpace AddrSpace,
+                                         bool IsCrossAddrSpaceOrdering,
+                                         Position Pos) const {
+  bool Changed = false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
+      // hardware does not reorder memory operations by the same wave with
+      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
+      // to initiate writeback of any dirty cache lines of earlier writes by the
+      // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
+      // writeback has completed.
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+          // Set SC bits to indicate system scope.
+          .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+      // SIAtomicScope::SYSTEM, the following insertWait will generate the
+      // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
+      Changed = true;
+      break;
+    case SIAtomicScope::AGENT:
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+          // Set SC bits to indicate agent scope.
+          .addImm(AMDGPU::CPol::SC1);
+
+      // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+      // SIAtomicScope::AGENT, the following insertWait will generate the
+      // required "S_WAITCNT vmcnt(0)".
+      Changed = true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // Do not generate "BUFFER_WBL2" as there are no caches it would
+      // write back, and it would require an otherwise unnecessary
+      // "S_WAITCNT vmcnt(0)".
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  // Insert the S_WAITCNT needed by any "BUFFER_WBL2" above, as well as any
+  // other required S_WAITCNT.
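+  // For illustration only (sketch): a system-scope release is therefore
+  // expected to expand to roughly
+  //   buffer_wbl2 sc0 sc1
+  //   s_waitcnt vmcnt(0)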
+  Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+                        IsCrossAddrSpaceOrdering, Pos);
+
+  return Changed;
+}
+
 bool SIGfx10CacheControl::enableLoadCacheBypass(
     const MachineBasicBlock::iterator &MI,
     SIAtomicScope Scope,
@@ -1547,7 +1920,7 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     case SIAtomicScope::WAVEFRONT:
     case SIAtomicScope::SINGLETHREAD:
       // The LDS keeps all memory operations in order for
-      // the same wavesfront.
+      // the same wavefront.
       break;
     default:
       llvm_unreachable("Unsupported synchronization scope");
@@ -1655,6 +2028,101 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
   return Changed;
 }
 
+bool SIGfx11CacheControl::enableLoadCacheBypass(
+    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+    SIAtomicAddrSpace AddrSpace) const {
+  assert(MI->mayLoad() && !MI->mayStore());
+  bool Changed = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      // Set the L0 and L1 cache policies to MISS_EVICT.
+      // Note: there is no L2 cache coherent bypass control at the ISA level.
+      Changed |= enableGLCBit(MI);
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In WGP mode the waves of a work-group can be executing on either CU of
+      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
+      // CU mode all waves of a work-group are on the same CU, and so the L0
+      // does not need to be bypassed.
+      if (!ST.isCuModeEnabled())
+        Changed |= enableGLCBit(MI);
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to bypass.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory caches
+  /// to be bypassed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  return Changed;
+}
+
+bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
+    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+    bool IsVolatile, bool IsNonTemporal) const {
+
+  // Only handle load and store, not atomic read-modify-write instructions. The
+  // latter use glc to indicate if the atomic returns a result and so must not
+  // be used for cache control.
+  assert(MI->mayLoad() ^ MI->mayStore());
+
+  // Only update load and store, not LLVM IR atomic read-modify-write
+  // instructions. The latter are always marked as volatile, so we cannot
+  // sensibly handle them here without pessimizing all atomics. They also do
+  // not support the nontemporal attribute.
+  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+  bool Changed = false;
+
+  if (IsVolatile) {
+    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
+    // and MISS_LRU for store instructions.
+    // Note: there is no L2 cache coherent bypass control at the ISA level.
+    if (Op == SIMemOp::LOAD)
+      Changed |= enableGLCBit(MI);
+
+    // Set MALL NOALLOC for load and store instructions.
+    Changed |= enableDLCBit(MI);
+
+    // Ensure operation has completed at system scope to cause all volatile
+    // operations to be visible outside the program in a global order. Do not
+    // request cross address space as only the global address space can be
+    // observable outside the program, so no need to cause a waitcnt for LDS
+    // address space operations.
+    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+                          Position::AFTER);
+    return Changed;
+  }
+
+  if (IsNonTemporal) {
+    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
+    // and L2 cache policy to STREAM.
+    // For stores setting both GLC and SLC configures L0 and L1 cache policy
+    // to MISS_EVICT and the L2 cache policy to STREAM.
+    if (Op == SIMemOp::STORE)
+      Changed |= enableGLCBit(MI);
+    Changed |= enableSLCBit(MI);
+
+    // Set MALL NOALLOC for load and store instructions.
+    Changed |= enableDLCBit(MI);
+    return Changed;
+  }
+
+  return Changed;
+}
+
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
   if (AtomicPseudoMIs.empty())
     return false;
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index 24a8879b5684..a5816e2e8c73 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -17,6 +17,7 @@
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
 #include <queue>
 
 #define DEBUG_TYPE "si-mode-register"
@@ -162,7 +163,9 @@ FunctionPass *llvm::createSIModeRegisterPass() { return new SIModeRegister(); }
 // double precision setting.
 Status SIModeRegister::getInstructionMode(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
-  if (TII->usesFPDPRounding(MI)) {
+  if (TII->usesFPDPRounding(MI) ||
+      MI.getOpcode() == AMDGPU::FPTRUNC_UPWARD_PSEUDO ||
+      MI.getOpcode() == AMDGPU::FPTRUNC_DOWNWARD_PSEUDO) {
     switch (MI.getOpcode()) {
     case AMDGPU::V_INTERP_P1LL_F16:
     case AMDGPU::V_INTERP_P1LV_F16:
@@ -170,6 +173,18 @@ Status SIModeRegister::getInstructionMode(MachineInstr &MI,
       // f16 interpolation instructions need double precision round to zero
       return Status(FP_ROUND_MODE_DP(3),
                     FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO));
+    case AMDGPU::FPTRUNC_UPWARD_PSEUDO: {
+      // Replacing the pseudo by a real instruction
+      MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e32));
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_INF));
+    }
+    case AMDGPU::FPTRUNC_DOWNWARD_PSEUDO: {
+      // Replacing the pseudo by a real instruction
+      MI.setDesc(TII->get(AMDGPU::V_CVT_F16_F32_e32));
+      return Status(FP_ROUND_MODE_DP(3),
+                    FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEGINF));
+    }
     default:
       return DefaultStatus;
     }
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index b9c839fe28ba..5215397d5936 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -9,6 +9,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/InitializePasses.h"
 
@@ -292,6 +293,210 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
   return false;
 }
 
+// Backwards-iterate from Origin (for n=MaxInstructions iterations) until either
+// the beginning of the BB is reached or Pred evaluates to true - which can be
+// an arbitrary condition based on the current MachineInstr, for instance a
+// target instruction. Breaks prematurely by returning nullptr if one of the
+// registers given in NonModifiableRegs is modified by the current instruction.
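+// A hypothetical use, purely for illustration (the names are invented): find a
+// preceding instruction that defines Reg without crossing a write to EXEC:
+//   MachineInstr *Def = findInstrBackwards(
+//       MI, [&](MachineInstr *I) { return I->definesRegister(Reg, TRI); },
+//       {Exec}, TRI);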
+static MachineInstr *
+findInstrBackwards(MachineInstr &Origin,
+                   std::function<bool(MachineInstr *)> Pred,
+                   ArrayRef<MCRegister> NonModifiableRegs,
+                   const SIRegisterInfo *TRI, unsigned MaxInstructions = 20) {
+  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
+                                      E = Origin.getParent()->rend();
+  unsigned CurrentIteration = 0;
+
+  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
+    if (A->isDebugInstr())
+      continue;
+
+    if (Pred(&*A))
+      return &*A;
+
+    for (MCRegister Reg : NonModifiableRegs) {
+      if (A->modifiesRegister(Reg, TRI))
+        return nullptr;
+    }
+
+    ++CurrentIteration;
+  }
+
+  return nullptr;
+}
+
+
+// Determine if a register Reg is not re-defined and still in use
+// in the range (Stop..Start].
+// It does so by calculating liveness backwards from the end of the BB until
+// either Stop or the beginning of the BB is reached.
+// After liveness is calculated, we can determine if Reg is still in use and
+// not defined in between the instructions.
+static bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
+                                   MCRegister Reg, const SIRegisterInfo *TRI,
+                                   MachineRegisterInfo &MRI,
+                                   bool useLiveOuts = false,
+                                   bool ignoreStart = false) {
+  LivePhysRegs LR(*TRI);
+  if (useLiveOuts)
+    LR.addLiveOuts(*Stop.getParent());
+
+  MachineBasicBlock::reverse_iterator A(Start);
+  MachineBasicBlock::reverse_iterator E(Stop);
+
+  if (ignoreStart)
+    ++A;
+
+  for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
+    LR.stepBackward(*A);
+  }
+
+  return !LR.available(MRI, Reg);
+}
+
+// Determine if a register Reg is not re-defined and still in use
+// in the range (Stop..BB.end].
+static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
+                                 const SIRegisterInfo *TRI,
+                                 MachineRegisterInfo &MRI) {
+  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, TRI,
+                                MRI, true);
+}
+
+// Tries to find an opportunity to optimize a v_cmp ..., s_and_saveexec
+// sequence by looking at an instance of an s_and_saveexec instruction.
+// Returns a pointer to the v_cmp instruction if it is safe to replace the
+// sequence (see the conditions in the function body). This is after register
+// allocation, so some checks on operand dependencies need to be considered.
+static MachineInstr *findPossibleVCMPVCMPXOptimization(
+    MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
+    const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
+
+  MachineInstr *VCmp = nullptr;
+
+  Register SaveExecDest = SaveExec.getOperand(0).getReg();
+  if (!TRI->isSGPRReg(MRI, SaveExecDest))
+    return nullptr;
+
+  MachineOperand *SaveExecSrc0 =
+      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
+  if (!SaveExecSrc0->isReg())
+    return nullptr;
+
+  // Try to find the last v_cmp instruction that defs the saveexec input
+  // operand without any write to Exec or the saveexec input operand in
+  // between.
+  VCmp = findInstrBackwards(
+      SaveExec,
+      [&](MachineInstr *Check) {
+        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+      },
+      {Exec, SaveExecSrc0->getReg()}, TRI);
+
+  if (!VCmp)
+    return nullptr;
+
+  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+  assert(VCmpDest && "Should have an sdst operand!");
+
+  // Check if any of the v_cmp source operands is written by the saveexec.
+  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+  if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
+      SaveExec.modifiesRegister(Src0->getReg(), TRI))
+    return nullptr;
+
+  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+  if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
+      SaveExec.modifiesRegister(Src1->getReg(), TRI))
+    return nullptr;
+
+  // Don't do the transformation if the destination operand is included in
+  // its MBB live-outs, meaning it's used in any of its successors, leading
+  // to incorrect code if the v_cmp and therefore the def of
+  // the dest operand is removed.
+  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+    return nullptr;
+
+  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
+  // s_and_saveexec, skip the optimization.
+  if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), TRI, MRI,
+                             false, true) ||
+      isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI))
+    return nullptr;
+
+  // Try to determine if there is a write to any of the VCmp
+  // operands between the v_cmp and the s_and_saveexec.
+  // If yes, additional VGPR spilling might need to be inserted. In this case,
+  // it's not worth replacing the instruction sequence.
+  SmallVector<MCRegister, 2> NonDefRegs;
+  if (Src0->isReg())
+    NonDefRegs.push_back(Src0->getReg());
+
+  if (Src1->isReg())
+    NonDefRegs.push_back(Src1->getReg());
+
+  if (!findInstrBackwards(
+          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
+          NonDefRegs, TRI))
+    return nullptr;
+
+  return VCmp;
+}
+
+// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
+// operands extracted from a v_cmp ..., s_and_saveexec pattern.
+static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+                                         MachineInstr &VCmp, MCRegister Exec,
+                                         const SIInstrInfo *TII,
+                                         const SIRegisterInfo *TRI,
+                                         MachineRegisterInfo &MRI) {
+  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
+
+  if (NewOpcode == -1)
+    return false;
+
+  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
+  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
+
+  Register MoveDest = SaveExecInstr.getOperand(0).getReg();
+
+  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
+  if (!SaveExecInstr.uses().empty()) {
+    bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
+    unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
+            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
+        .addReg(Exec);
+  }
+
+  // Omit dst as V_CMPX is implicitly writing to EXEC.
+  // Add dummy src and clamp modifiers, if needed.
+  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
+                         VCmp.getDebugLoc(), TII->get(NewOpcode));
+
+  auto TryAddImmediateValueFromNamedOperand =
+      [&](unsigned OperandName) -> void {
+    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
+      Builder.addImm(Mod->getImm());
+  };
+
+  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
+  Builder.add(*Src0);
+
+  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
+  Builder.add(*Src1);
+
+  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);
+
+  // The kill flags may no longer be correct.
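+  // Moving the compare next to the saveexec can extend the live ranges of its
+  // source operands past their previous last uses, so any stale kill flags
+  // must be dropped; clearing them is always conservative.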
+ if (Src0->isReg()) + MRI.clearKillFlags(Src0->getReg()); + if (Src1->isReg()) + MRI.clearKillFlags(Src1->getReg()); + + return true; +} + bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -299,6 +504,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); + MachineRegisterInfo *MRI = &MF.getRegInfo(); MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; // Optimize sequences emitted for control flow lowering. They are originally @@ -312,6 +518,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { // x = s_<op>_saveexec_b64 y // + bool Changed = false; for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB); MachineBasicBlock::reverse_iterator E = MBB.rend(); @@ -351,6 +558,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n'); CopyToExecInst->eraseFromParent(); + Changed = true; } continue; @@ -456,8 +664,49 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister, *TRI); } + + Changed = true; } - return true; + // After all s_op_saveexec instructions are inserted, + // replace (on GFX10.3 and later) + // v_cmp_* SGPR, IMM, VGPR + // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR + // with + // s_mov_b32 EXEC_SGPR_DEST, exec_lo + // v_cmpx_* IMM, VGPR + // to reduce pipeline stalls. + if (ST.hasGFX10_3Insts()) { + DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping; + const unsigned AndSaveExecOpcode = + ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; + + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + // Record relevant v_cmp / s_and_saveexec instruction pairs for + // replacement. 
+ if (MI.getOpcode() != AndSaveExecOpcode) + continue; + + if (MachineInstr *VCmp = + findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI)) + SaveExecVCmpMapping[&MI] = VCmp; + } + } + + for (const auto &Entry : SaveExecVCmpMapping) { + MachineInstr *SaveExecInstr = Entry.getFirst(); + MachineInstr *VCmpInstr = Entry.getSecond(); + + if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII, + TRI, *MRI)) { + SaveExecInstr->eraseFromParent(); + VCmpInstr->eraseFromParent(); + + Changed = true; + } + } + } + return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 5f89f3826683..e5e65a8dbbf1 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -39,7 +39,7 @@ private: MCRegister CondReg; MCRegister ExecReg; - Register optimizeVcndVcmpPair(MachineBasicBlock &MBB); + bool optimizeVcndVcmpPair(MachineBasicBlock &MBB); bool optimizeElseBranch(MachineBasicBlock &MBB); public: @@ -90,8 +90,8 @@ static bool isDefBetween(const LiveRange &LR, SlotIndex AndIdx, static bool isDefBetween(const SIRegisterInfo &TRI, LiveIntervals *LIS, Register Reg, const MachineInstr &Sel, const MachineInstr &And) { - SlotIndex AndIdx = LIS->getInstructionIndex(And); - SlotIndex SelIdx = LIS->getInstructionIndex(Sel); + SlotIndex AndIdx = LIS->getInstructionIndex(And).getRegSlot(); + SlotIndex SelIdx = LIS->getInstructionIndex(Sel).getRegSlot(); if (Reg.isVirtual()) return isDefBetween(LIS->getInterval(Reg), AndIdx, SelIdx); @@ -119,21 +119,20 @@ static bool isDefBetween(const SIRegisterInfo &TRI, // required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive // lanes. // -// Returns %cc register on success. -Register -SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { +// Returns true on success. 
+bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) { unsigned Opc = MI.getOpcode(); return Opc == AMDGPU::S_CBRANCH_VCCZ || Opc == AMDGPU::S_CBRANCH_VCCNZ; }); if (I == MBB.terminators().end()) - return Register(); + return false; auto *And = TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister, *I, *MRI, LIS); if (!And || And->getOpcode() != AndOpc || !And->getOperand(1).isReg() || !And->getOperand(2).isReg()) - return Register(); + return false; MachineOperand *AndCC = &And->getOperand(1); Register CmpReg = AndCC->getReg(); @@ -143,49 +142,49 @@ SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { CmpReg = AndCC->getReg(); CmpSubReg = AndCC->getSubReg(); } else if (And->getOperand(2).getReg() != Register(ExecReg)) { - return Register(); + return false; } auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, *MRI, LIS); if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 || Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) || Cmp->getParent() != And->getParent()) - return Register(); + return false; MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0); MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1); if (Op1->isImm() && Op2->isReg()) std::swap(Op1, Op2); if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1) - return Register(); + return false; Register SelReg = Op1->getReg(); auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, *MRI, LIS); if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64) - return Register(); + return false; if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) || TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers)) - return Register(); + return false; Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0); Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1); MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2); if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() || Op1->getImm() != 0 || Op2->getImm() != 1) - return Register(); + return false; Register CCReg = CC->getReg(); // If there was a def between the select and the and, we would need to move it // to fold this. if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And)) - return Register(); + return false; + // TODO: Guard against implicit def operands? LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t' << *And); - LIS->RemoveMachineInstrFromMaps(*And); MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc), And->getOperand(0).getReg()) @@ -196,34 +195,92 @@ SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { MachineOperand &Andn2SCC = Andn2->getOperand(3); assert(Andn2SCC.getReg() == AMDGPU::SCC); Andn2SCC.setIsDead(AndSCC.isDead()); + + SlotIndex AndIdx = LIS->ReplaceMachineInstrInMaps(*And, *Andn2); And->eraseFromParent(); - LIS->InsertMachineInstrInMaps(*Andn2); LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n'); + SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp); + SlotIndex SelIdx = LIS->getInstructionIndex(*Sel); + + LiveInterval *CmpLI = + CmpReg.isVirtual() ? &LIS->getInterval(CmpReg) : nullptr; + LiveInterval *SelLI = + SelReg.isVirtual() ? &LIS->getInterval(SelReg) : nullptr; + + // Update live intervals for CCReg before potentially removing CmpReg/SelReg, + // and their associated liveness information. 
+  if (CCReg.isVirtual()) {
+    // Note: this ignores that SelLI might have multiple internal values
+    // or splits and simply extends the live range to cover all cases
+    // where the result of the v_cndmask_b32 was live (e.g. loops).
+    // This could yield worse register allocation in rare edge cases.
+    SlotIndex EndIdx = AndIdx.getRegSlot();
+    if (SelLI && SelLI->endIndex() > EndIdx && SelLI->endIndex().isBlock())
+      EndIdx = SelLI->endIndex();
+
+    LiveInterval &CCLI = LIS->getInterval(CCReg);
+    auto CCQ = CCLI.Query(SelIdx.getRegSlot());
+    if (CCQ.valueIn()) {
+      CCLI.addSegment(LiveRange::Segment(SelIdx.getRegSlot(),
+                                         EndIdx, CCQ.valueIn()));
+    }
+
+    if (CC->getSubReg()) {
+      LaneBitmask Mask = TRI->getSubRegIndexLaneMask(CC->getSubReg());
+      BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+      CCLI.refineSubRanges(
+          Allocator, Mask,
+          [=](LiveInterval::SubRange &SR) {
+            auto CCQS = SR.Query(SelIdx.getRegSlot());
+            if (CCQS.valueIn()) {
+              SR.addSegment(LiveRange::Segment(
+                  SelIdx.getRegSlot(), EndIdx, CCQS.valueIn()));
+            }
+          },
+          *LIS->getSlotIndexes(), *TRI);
+      CCLI.removeEmptySubRanges();
+
+      SmallVector<LiveInterval *> SplitLIs;
+      LIS->splitSeparateComponents(CCLI, SplitLIs);
+    }
+  } else
+    LIS->removeAllRegUnitsForPhysReg(CCReg);
+
   // Try to remove the compare. The cmp value must not be used between the
   // cmp and the s_and_b64 if it is VCC, and must simply be unused if it is
   // any other register.
-  if ((CmpReg.isVirtual() && MRI->use_nodbg_empty(CmpReg)) ||
+  if ((CmpReg.isVirtual() && CmpLI && CmpLI->Query(AndIdx.getRegSlot()).isKill()) ||
      (CmpReg == Register(CondReg) &&
       std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
                    [&](const MachineInstr &MI) {
                      return MI.readsRegister(CondReg, TRI); }))) {
    LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
-
+    if (CmpLI)
+      LIS->removeVRegDefAt(*CmpLI, CmpIdx.getRegSlot());
    LIS->RemoveMachineInstrFromMaps(*Cmp);
    Cmp->eraseFromParent();

    // Try to remove v_cndmask_b32.
-    if (SelReg.isVirtual() && MRI->use_nodbg_empty(SelReg)) {
-      LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
+    if (SelLI) {
+      bool CanRemoveSel = SelLI->Query(CmpIdx.getRegSlot()).isKill();
+      if (!CanRemoveSel) {
+        // Try to shrink the live interval and check for dead def instead.
+        LIS->shrinkToUses(SelLI, nullptr);
+        CanRemoveSel = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
+      }
+      if (CanRemoveSel) {
+        LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
-        LIS->RemoveMachineInstrFromMaps(*Sel);
-        Sel->eraseFromParent();
+        LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
+        LIS->RemoveMachineInstrFromMaps(*Sel);
+        Sel->eraseFromParent();
+      }
    }
  }

-  return CCReg;
+  return true;
}

// Optimize sequence
@@ -330,8 +387,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
      Changed = true;
    }

-    if (Register Reg = optimizeVcndVcmpPair(MBB)) {
-      RecalcRegs.insert(Reg);
+    if (optimizeVcndVcmpPair(MBB)) {
      RecalcRegs.insert(AMDGPU::VCC_LO);
      RecalcRegs.insert(AMDGPU::VCC_HI);
      RecalcRegs.insert(AMDGPU::SCC);
@@ -402,7 +458,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
  }

  // If the only user of a logical operation is move to exec, fold it now
-  // to prevent forming of saveexec.
I.e.: // // %0:sreg_64 = COPY $exec // %1:sreg_64 = S_AND_B64 %0:sreg_64, %2:sreg_64 diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index e13e33ed5457..2ae3157bab49 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -112,8 +112,10 @@ public: SmallVectorImpl<Register> &CandidateRegs) const; void collectWaterfallCandidateRegisters( - MachineBasicBlock *Loop, - SmallSetVector<Register, 16> &CandidateRegs) const; + MachineBasicBlock *LoopHeader, MachineBasicBlock *LoopEnd, + SmallSetVector<Register, 16> &CandidateRegs, + SmallSetVector<MachineBasicBlock *, 2> &Blocks, + SmallVectorImpl<MachineInstr *> &Instructions) const; void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB, SmallVectorImpl<MachineInstr *> &Uses) const; @@ -131,7 +133,10 @@ public: MachineBasicBlock *Flow, MachineBasicBlock *Endif, SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const; - void optimizeWaterfallLiveRange(Register Reg, MachineBasicBlock *If) const; + void optimizeWaterfallLiveRange( + Register Reg, MachineBasicBlock *LoopHeader, + SmallSetVector<MachineBasicBlock *, 2> &LoopBlocks, + SmallVectorImpl<MachineInstr *> &Instructions) const; SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {} @@ -323,12 +328,34 @@ void SIOptimizeVGPRLiveRange::collectCandidateRegisters( /// Collect the registers used in the waterfall loop block that are defined /// before. void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters( - MachineBasicBlock *Loop, - SmallSetVector<Register, 16> &CandidateRegs) const { + MachineBasicBlock *LoopHeader, MachineBasicBlock *LoopEnd, + SmallSetVector<Register, 16> &CandidateRegs, + SmallSetVector<MachineBasicBlock *, 2> &Blocks, + SmallVectorImpl<MachineInstr *> &Instructions) const { - for (auto &MI : Loop->instrs()) { - if (MI.isDebugInstr()) - continue; + // Collect loop instructions, potentially spanning multiple blocks + auto *MBB = LoopHeader; + for (;;) { + Blocks.insert(MBB); + for (auto &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + Instructions.push_back(&MI); + } + if (MBB == LoopEnd) + break; + + if ((MBB != LoopHeader && MBB->pred_size() != 1) || + (MBB == LoopHeader && MBB->pred_size() != 2) || MBB->succ_size() != 1) { + LLVM_DEBUG(dbgs() << "Unexpected edges in CFG, ignoring loop\n"); + return; + } + + MBB = *MBB->succ_begin(); + } + + for (auto *I : Instructions) { + auto &MI = *I; for (auto &MO : MI.operands()) { if (!MO.isReg() || !MO.getReg() || MO.isDef()) @@ -340,16 +367,17 @@ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters( continue; if (MO.readsReg()) { - const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent(); + MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent(); // Make sure the value is defined before the LOOP block - if (DefMBB != Loop && !CandidateRegs.contains(MOReg)) { + if (!Blocks.contains(DefMBB) && !CandidateRegs.contains(MOReg)) { // If the variable is used after the loop, the register coalescer will // merge the newly created register and remove the phi node again. // Just do nothing in that case. 
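The block walk at the top of collectWaterfallCandidateRegisters is the load-bearing assumption behind the new multi-block support: a waterfall loop is expected to be a straight single-successor chain from the header down to the block containing SI_WATERFALL_LOOP. Roughly, as a hedged sketch (collectLoopChain is a hypothetical stand-in for the inline loop above):

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
using namespace llvm;

// Hypothetical helper mirroring the walk above: the header must have exactly
// two predecessors (loop entry plus the back edge) and every interior block
// exactly one, each with a single successor; any other shape is rejected.
static bool collectLoopChain(MachineBasicBlock *Header,
                             MachineBasicBlock *End,
                             SmallVectorImpl<MachineBasicBlock *> &Blocks) {
  for (MachineBasicBlock *MBB = Header;;) {
    Blocks.push_back(MBB);
    if (MBB == End)
      return true;
    if ((MBB != Header && MBB->pred_size() != 1) ||
        (MBB == Header && MBB->pred_size() != 2) || MBB->succ_size() != 1)
      return false; // Unexpected CFG; the caller ignores the loop.
    MBB = *MBB->succ_begin();
  }
}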
LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(MOReg); bool IsUsed = false; - for (auto *Succ : Loop->successors()) { - if (Succ != Loop && OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) { + for (auto *Succ : LoopEnd->successors()) { + if (!Blocks.contains(Succ) && + OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) { IsUsed = true; break; } @@ -513,7 +541,9 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange( } void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange( - Register Reg, MachineBasicBlock *Loop) const { + Register Reg, MachineBasicBlock *LoopHeader, + SmallSetVector<MachineBasicBlock *, 2> &Blocks, + SmallVectorImpl<MachineInstr *> &Instructions) const { // Insert a new PHI, marking the value from the last loop iteration undef. LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n'); const auto *RC = MRI->getRegClass(Reg); @@ -525,15 +555,16 @@ void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange( for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) { auto *UseMI = O.getParent(); auto *UseBlock = UseMI->getParent(); - // Replace uses in Loop block - if (UseBlock == Loop) + // Replace uses in Loop blocks + if (Blocks.contains(UseBlock)) O.setReg(NewReg); } - MachineInstrBuilder PHI = BuildMI(*Loop, Loop->getFirstNonPHI(), DebugLoc(), - TII->get(TargetOpcode::PHI), NewReg); - for (auto *Pred : Loop->predecessors()) { - if (Pred == Loop) + MachineInstrBuilder PHI = + BuildMI(*LoopHeader, LoopHeader->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), NewReg); + for (auto *Pred : LoopHeader->predecessors()) { + if (Blocks.contains(Pred)) PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred); else PHI.addReg(Reg).addMBB(Pred); @@ -542,21 +573,36 @@ void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange( LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg); LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); - // collectWaterfallCandidateRegisters only collects registers that are dead - // after the loop. So we know that the old reg is not live throughout the - // whole block anymore. - OldVarInfo.AliveBlocks.reset(Loop->getNumber()); - - // Mark the last use as kill - for (auto &MI : reverse(Loop->instrs())) { - if (MI.readsRegister(NewReg, TRI)) { - MI.addRegisterKilled(NewReg, TRI); - NewVarInfo.Kills.push_back(&MI); + // Find last use and mark as kill + MachineInstr *Kill = nullptr; + for (auto *MI : reverse(Instructions)) { + if (MI->readsRegister(NewReg, TRI)) { + MI->addRegisterKilled(NewReg, TRI); + NewVarInfo.Kills.push_back(MI); + Kill = MI; break; } } - assert(!NewVarInfo.Kills.empty() && - "Failed to find last usage of register in loop"); + assert(Kill && "Failed to find last usage of register in loop"); + + MachineBasicBlock *KillBlock = Kill->getParent(); + bool PostKillBlock = false; + for (auto *Block : Blocks) { + auto BBNum = Block->getNumber(); + + // collectWaterfallCandidateRegisters only collects registers that are dead + // after the loop. So we know that the old reg is no longer live throughout + // the waterfall loop. + OldVarInfo.AliveBlocks.reset(BBNum); + + // The new register is live up to (and including) the block that kills it. + PostKillBlock |= (Block == KillBlock); + if (PostKillBlock) { + NewVarInfo.AliveBlocks.reset(BBNum); + } else if (Block != LoopHeader) { + NewVarInfo.AliveBlocks.set(BBNum); + } + } } char SIOptimizeVGPRLiveRange::ID = 0; @@ -601,6 +647,10 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) { if (!Endif) continue; + // Skip unexpected control flow. 
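For the LiveVariables bookkeeping in optimizeWaterfallLiveRange above: a virtual register counts as live-through a block only if that block's number is set in VarInfo::AliveBlocks, and its final reader must both carry a kill flag and be listed in VarInfo::Kills, so the pass keeps the two in sync by hand. A minimal sketch of the kill side, assuming LV already tracks Reg (markKilledIn is hypothetical):

#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Hypothetical helper: record KillMI as the last use of Reg. The operand
// kill flag and the VarInfo::Kills entry must agree, or later consumers of
// the LiveVariables analysis can be misled.
static void markKilledIn(LiveVariables &LV, Register Reg, MachineInstr &KillMI,
                         const TargetRegisterInfo *TRI) {
  KillMI.addRegisterKilled(Reg, TRI);
  LV.getVarInfo(Reg).Kills.push_back(&KillMI);
}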
+ if (!MDT->dominates(&MBB, IfTarget) || !MDT->dominates(IfTarget, Endif)) + continue; + SmallSetVector<MachineBasicBlock *, 16> ElseBlocks; SmallVector<Register> CandidateRegs; @@ -620,15 +670,22 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) { for (auto Reg : CandidateRegs) optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks); } else if (MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP) { + auto *LoopHeader = MI.getOperand(0).getMBB(); + auto *LoopEnd = &MBB; + LLVM_DEBUG(dbgs() << "Checking Waterfall loop: " - << printMBBReference(MBB) << '\n'); + << printMBBReference(*LoopHeader) << '\n'); SmallSetVector<Register, 16> CandidateRegs; - collectWaterfallCandidateRegisters(&MBB, CandidateRegs); + SmallVector<MachineInstr *, 16> Instructions; + SmallSetVector<MachineBasicBlock *, 2> Blocks; + + collectWaterfallCandidateRegisters(LoopHeader, LoopEnd, CandidateRegs, + Blocks, Instructions); MadeChange |= !CandidateRegs.empty(); // Now we are safe to optimize. for (auto Reg : CandidateRegs) - optimizeWaterfallLiveRange(Reg, &MBB); + optimizeWaterfallLiveRange(Reg, LoopHeader, Blocks, Instructions); } } } diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index da41a5e2478a..e768a2f3e1a5 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -316,7 +316,7 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, } if (Abs || Neg) { assert(!Sext && - "Float and integer src modifiers can't be set simulteniously"); + "Float and integer src modifiers can't be set simultaneously"); Mods |= Abs ? SISrcMods::ABS : 0u; Mods ^= Neg ? SISrcMods::NEG : 0u; } else if (Sext) { @@ -1131,16 +1131,16 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, bool Converted = false; for (auto &Operand : SDWAOperands) { LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand); - // There should be no intesection between SDWA operands and potential MIs + // There should be no intersection between SDWA operands and potential MIs // e.g.: // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0 // v_add_u32 v3, v4, v2 // - // In that example it is possible that we would fold 2nd instruction into 3rd - // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was - // already destroyed). So if SDWAOperand is also a potential MI then do not - // apply it. + // In that example it is possible that we would fold 2nd instruction into + // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that + // was already destroyed). So if SDWAOperand is also a potential MI then do + // not apply it. 
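The ordering hazard described in the SIPeepholeSDWA comment above reduces to one rule: never apply an SDWA operand whose parent instruction is itself still queued for conversion, because applying another fold first may erase that parent. A minimal restatement, assuming a candidate map shaped like the pass's PotentialMatches (isSafeToFold and the mapped value type are illustrative):

#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Hypothetical guard: Parent is the instruction an SDWA operand was matched
// from; folding is only safe if Parent is not itself a conversion candidate
// that a later iteration would rewrite (and thereby destroy).
static bool isSafeToFold(MachineInstr *Parent,
                         const DenseMap<MachineInstr *, unsigned> &Candidates) {
  return Candidates.count(Parent) == 0;
}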
if (PotentialMatches.count(Operand->getParentInst()) == 0) Converted |= Operand->convertToSDWA(*SDWAInst, TII); } diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index c2e2875ed6bf..4fab13bb44b1 100644 --- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -18,7 +18,10 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" #include "llvm/InitializePasses.h" using namespace llvm; @@ -85,9 +88,6 @@ FunctionPass *llvm::createSIPreAllocateWWMRegsPass() { } bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) { - if (!MO.isReg()) - return false; - Register Reg = MO.getReg(); if (Reg.isPhysical()) return false; @@ -111,7 +111,6 @@ bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) { } llvm_unreachable("physreg not found for WWM expression"); - return false; } void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { @@ -142,7 +141,6 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { } SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - MachineFrameInfo &FrameInfo = MF.getFrameInfo(); for (unsigned Reg : RegsToRewrite) { LIS->removeInterval(Reg); @@ -150,18 +148,7 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { const Register PhysReg = VRM->getPhys(Reg); assert(PhysReg != 0); - // Check if PhysReg is already reserved - if (!MFI->WWMReservedRegs.count(PhysReg)) { - Optional<int> FI; - if (!MFI->isEntryFunction()) { - // Create a stack object for a possible spill in the function prologue. - // Note: Non-CSR VGPR also need this as we may overwrite inactive lanes. - const TargetRegisterClass *RC = TRI->getPhysRegClass(PhysReg); - FI = FrameInfo.CreateSpillStackObject(TRI->getSpillSize(*RC), - TRI->getSpillAlign(*RC)); - } - MFI->reserveWWMRegister(PhysReg, FI); - } + MFI->reserveWWMRegister(PhysReg); } RegsToRewrite.clear(); diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index b0e45dd3e3e3..8d33b8a1fd4b 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -74,6 +74,15 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const { // We end up with this pattern sometimes after basic block placement. // It happens while combining a block which assigns -1 or 0 to a saved mask // and another block which consumes that saved mask and then a branch. + // + // While searching this also performs the following substitution: + // vcc = V_CMP + // vcc = S_AND exec, vcc + // S_CBRANCH_VCC[N]Z + // => + // vcc = V_CMP + // S_CBRANCH_VCC[N]Z + bool Changed = false; MachineBasicBlock &MBB = *MI.getParent(); const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>(); @@ -121,19 +130,32 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const { SReg = Op2.getReg(); auto M = std::next(A); bool ReadsSreg = false; + bool ModifiesExec = false; for (; M != E; ++M) { if (M->definesRegister(SReg, TRI)) break; if (M->modifiesRegister(SReg, TRI)) return Changed; ReadsSreg |= M->readsRegister(SReg, TRI); + ModifiesExec |= M->modifiesRegister(ExecReg, TRI); + } + if (M == E) + return Changed; + // If SReg is VCC and SReg definition is a VALU comparison. 
+ // This means S_AND with EXEC is not required. + // Erase the S_AND and return. + // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS + if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec && + TII->isVOPC(*M)) { + A->eraseFromParent(); + return true; } - if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() || + if (!M->isMoveImmediate() || !M->getOperand(1).isImm() || (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0)) return Changed; MaskValue = M->getOperand(1).getImm(); // First if sreg is only used in the AND instruction fold the immediate - // into into the AND. + // into the AND. if (!ReadsSreg && Op2.isKill()) { A->getOperand(2).ChangeToImmediate(MaskValue); M->eraseFromParent(); @@ -213,7 +235,7 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const { TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ)); } - MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI)); + MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI)); MI.addImplicitDefUseOperands(*MBB.getParent()); return true; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 21aed4ececb5..ad1455ed20fd 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -19,7 +19,9 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" using namespace llvm; @@ -182,6 +184,16 @@ struct SGPRSpillBuilder { TmpVGPRLive = true; } + if (TmpVGPRLive) { + // We need to inform the scavenger that this index is already in use until + // we're done with the custom emergency spill. + RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR); + } + + // We may end up recursively calling the scavenger, and don't want to re-use + // the same register. + RS->setRegUsed(TmpVGPR); + // Try to scavenge SGPRs to save exec assert(!SavedExecReg && "Exec is already saved, refuse to save again"); const TargetRegisterClass &RC = @@ -202,6 +214,12 @@ struct SGPRSpillBuilder { // Spill needed lanes TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); } else { + // The modify and restore of exec clobber SCC, which we would have to save + // and restore. FIXME: We probably would need to reserve a register for + // this. + if (RS->isRegUsed(AMDGPU::SCC)) + MI->emitError("unhandled SGPR spill to memory"); + // Spill active lanes if (TmpVGPRLive) TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false, @@ -251,6 +269,12 @@ struct SGPRSpillBuilder { if (TmpVGPRLive) TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true); } + + // Inform the scavenger where we're releasing our custom scavenged register. + if (TmpVGPRLive) { + MachineBasicBlock::iterator RestorePt = std::prev(MI); + RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt); + } } // Write TmpVGPR to memory or read TmpVGPR from memory. @@ -265,6 +289,12 @@ struct SGPRSpillBuilder { // Spill needed lanes TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); } else { + // The modify and restore of exec clobber SCC, which we would have to save + // and restore. FIXME: We probably would need to reserve a register for + // this. 
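The SGPRSpillBuilder prologue above pairs two scavenger calls whenever a VGPR is custom-scavenged for the emergency slot, and both halves matter: the index assignment keeps a nested scavenge from spilling into the slot already in use, and the used-marking keeps it from handing back the same VGPR. A compact restatement (claimScavengeSlot is a hypothetical helper):

#include "llvm/CodeGen/RegisterScavenging.h"
using namespace llvm;

// Hypothetical helper: publish that FrameIndex currently backs Reg, so a
// recursive scavenge neither reuses the emergency slot nor returns Reg
// itself while the custom spill is still in flight.
static void claimScavengeSlot(RegScavenger &RS, int FrameIndex, Register Reg) {
  RS.assignRegToScavengingIndex(FrameIndex, Reg);
  RS.setRegUsed(Reg);
}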
+ if (RS->isRegUsed(AMDGPU::SCC)) + MI->emitError("unhandled SGPR spill to memory"); + // Spill active lanes TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad, /*IsKill*/ false); @@ -329,7 +359,7 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) static auto InitializeSubRegFromChannelTableOnce = [this]() { for (auto &Row : SubRegFromChannelTable) Row.fill(AMDGPU::NoSubRegister); - for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) { + for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) { unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32; unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32; assert(Width < SubRegFromChannelTableWidthMap.size()); @@ -364,13 +394,11 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( case CallingConv::C: case CallingConv::Fast: case CallingConv::Cold: - return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts() - ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList - : CSR_AMDGPU_HighRegs_SaveList; + return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList + : CSR_AMDGPU_SaveList; case CallingConv::AMDGPU_Gfx: - return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts() - ? CSR_AMDGPU_SI_Gfx_With_AGPRs_SaveList - : CSR_AMDGPU_SI_Gfx_SaveList; + return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList + : CSR_AMDGPU_SI_Gfx_SaveList; default: { // Dummy to not crash RegisterClassInfo. static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; @@ -390,13 +418,11 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, case CallingConv::C: case CallingConv::Fast: case CallingConv::Cold: - return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts() - ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask - : CSR_AMDGPU_HighRegs_RegMask; + return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask + : CSR_AMDGPU_RegMask; case CallingConv::AMDGPU_Gfx: - return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts() - ? CSR_AMDGPU_SI_Gfx_With_AGPRs_RegMask - : CSR_AMDGPU_SI_Gfx_RegMask; + return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask + : CSR_AMDGPU_SI_Gfx_RegMask; default: return nullptr; } @@ -413,8 +439,7 @@ SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, // equivalent AV class. If used one, the verifier will crash after // RegBankSelect in the GISel flow. The aligned regclasses are not fully given // until Instruction selection. 
- if (MF.getSubtarget<GCNSubtarget>().hasMAIInsts() && - (isVGPRClass(RC) || isAGPRClass(RC))) { + if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) { if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass) return &AMDGPU::AV_32RegClass; if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass) @@ -463,8 +488,7 @@ SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, } Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const SIFrameLowering *TFI = - MF.getSubtarget<GCNSubtarget>().getFrameLowering(); + const SIFrameLowering *TFI = ST.getFrameLowering(); const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); // During ISel lowering we always reserve the stack pointer in entry // functions, but never actually want to reference it when accessing our own @@ -487,19 +511,19 @@ bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { - return CSR_AMDGPU_AllVGPRs_RegMask; + return AMDGPU_AllVGPRs_RegMask; } const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const { - return CSR_AMDGPU_AllAGPRs_RegMask; + return AMDGPU_AllAGPRs_RegMask; } const uint32_t *SIRegisterInfo::getAllVectorRegMask() const { - return CSR_AMDGPU_AllVectorRegs_RegMask; + return AMDGPU_AllVectorRegs_RegMask; } const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { - return CSR_AMDGPU_AllAllocatableSRegs_RegMask; + return AMDGPU_AllAllocatableSRegs_RegMask; } unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel, @@ -522,6 +546,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::MODE); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // Reserve special purpose registers. + // // EXEC_LO and EXEC_HI could be allocated and used as regular register, but // this seems likely to result in bugs, so I'm marking them as reserved. reserveRegisterTuples(Reserved, AMDGPU::EXEC); @@ -563,7 +591,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); // Reserve null register - it shall never be allocated - reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL); + reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64); // Disallow vcc_hi allocation in wave32. It may be allocated but most likely // will result in bugs. @@ -572,6 +600,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AMDGPU::VCC_HI); } + // Reserve SGPRs. + // unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) { @@ -579,39 +609,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, Reg); } - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); - unsigned MaxNumAGPRs = MaxNumVGPRs; - unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); - - if (ST.hasGFX90AInsts()) { - // In an entry function without calls and AGPRs used it is possible to use - // the whole register budget for VGPRs. - - // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and - // split register file accordingly. 
- if (MFI->usesAGPRs(MF)) { - MaxNumVGPRs /= 2; - MaxNumAGPRs = MaxNumVGPRs; - } else { - if (MaxNumVGPRs > TotalNumVGPRs) { - MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs; - MaxNumVGPRs = TotalNumVGPRs; - } else - MaxNumAGPRs = 0; - } - } - - for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { - unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); - reserveRegisterTuples(Reserved, Reg); - } - - for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) { - unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); - reserveRegisterTuples(Reserved, Reg); - } - for (auto Reg : AMDGPU::SReg_32RegClass) { Reserved.set(getSubReg(Reg, AMDGPU::hi16)); Register Low = getSubReg(Reg, AMDGPU::lo16); @@ -620,22 +617,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(Low); } - for (auto Reg : AMDGPU::AGPR_32RegClass) { - Reserved.set(getSubReg(Reg, AMDGPU::hi16)); - } - - // Reserve all the rest AGPRs if there are no instructions to use it. - if (!ST.hasMAIInsts()) { - for (unsigned i = 0; i < MaxNumVGPRs; ++i) { - unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); - reserveRegisterTuples(Reserved, Reg); - } - } - Register ScratchRSrcReg = MFI->getScratchRSrcReg(); if (ScratchRSrcReg != AMDGPU::NoRegister) { - // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need - // to spill. + // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we + // need to spill. // TODO: May need to reserve a VGPR if doing LDS spilling. reserveRegisterTuples(Reserved, ScratchRSrcReg); } @@ -644,7 +629,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // which is detected after the function is lowered. If we aren't really going // to need SP, don't bother reserving it. MCRegister StackPtrReg = MFI->getStackPtrOffsetReg(); - if (StackPtrReg) { reserveRegisterTuples(Reserved, StackPtrReg); assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); @@ -662,20 +646,63 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); } - for (auto Reg : MFI->WWMReservedRegs) { - reserveRegisterTuples(Reserved, Reg.first); + // Reserve VGPRs/AGPRs. + // + unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); + unsigned MaxNumAGPRs = MaxNumVGPRs; + unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); + + // Reserve all the AGPRs if there are no instructions to use it. + if (!ST.hasMAIInsts()) { + for (unsigned i = 0; i < MaxNumAGPRs; ++i) { + unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); + } } - // Reserve VGPRs used for SGPR spilling. - // Note we treat freezeReservedRegs unusually because we run register - // allocation in two phases. It's OK to re-freeze with new registers for the - // second run. -#if 0 - for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) { - for (auto &SpilledVGPR : SpilledFI.second) - reserveRegisterTuples(Reserved, SpilledVGPR.VGPR); + for (auto Reg : AMDGPU::AGPR_32RegClass) { + Reserved.set(getSubReg(Reg, AMDGPU::hi16)); + } + + // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, + // a wave may have up to 512 total vector registers combining together both + // VGPRs and AGPRs. Hence, in an entry function without calls and without + // AGPRs used within it, it is possible to use the whole vector register + // budget for VGPRs. + // + // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split + // register file accordingly. 
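The reservation code just below implements the split this comment describes. As a worked restatement (splitVectorRegBudget is a hypothetical free function; UsesAGPRs stands in for MFI->usesAGPRs(MF), and the inputs for ST.getMaxNumVGPRs(MF) and the size of the VGPR_32 class):

#include <utility>

// Returns {MaxNumVGPRs, MaxNumAGPRs} under the GFX90A combined-file rules:
// a function that touches AGPRs splits the budget evenly; one that does not
// gets everything as VGPRs, with any surplus beyond the addressable VGPR
// count spilling over into the AGPR file.
static std::pair<unsigned, unsigned>
splitVectorRegBudget(unsigned MaxNumVGPRs, unsigned TotalNumVGPRs,
                     bool UsesAGPRs) {
  unsigned MaxNumAGPRs = MaxNumVGPRs;
  if (UsesAGPRs) {
    MaxNumVGPRs /= 2;
    MaxNumAGPRs = MaxNumVGPRs;
  } else if (MaxNumVGPRs > TotalNumVGPRs) {
    MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
    MaxNumVGPRs = TotalNumVGPRs;
  } else {
    MaxNumAGPRs = 0;
  }
  return {MaxNumVGPRs, MaxNumAGPRs};
}

For instance, an entry function granted 512 vector registers that never uses AGPRs yields {256, 256}: the full 256 addressable VGPRs, with the surplus modeled as AGPRs.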
+ if (ST.hasGFX90AInsts()) { + if (MFI->usesAGPRs(MF)) { + MaxNumVGPRs /= 2; + MaxNumAGPRs = MaxNumVGPRs; + } else { + if (MaxNumVGPRs > TotalNumVGPRs) { + MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs; + MaxNumVGPRs = TotalNumVGPRs; + } else + MaxNumAGPRs = 0; + } } -#endif + + for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { + unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); + } + + for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) { + unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); + } + + // On GFX908, in order to guarantee copying between AGPRs, we need a scratch + // VGPR available at all times. + if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) { + reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy()); + } + + for (Register Reg : MFI->WWMReservedRegs) + reserveRegisterTuples(Reserved, Reg); // FIXME: Stop using reserved registers for this. for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) @@ -690,6 +717,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } +bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF, + MCRegister PhysReg) const { + return !MF.getRegInfo().isReserved(PhysReg); +} + bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); // On entry, the base address is 0, so it can't possibly need any more @@ -1010,6 +1042,8 @@ static int getOffsetMUBUFStore(unsigned Opc) { return AMDGPU::BUFFER_STORE_SHORT_OFFSET; case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; + case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN: + return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET; case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: @@ -1035,6 +1069,8 @@ static int getOffsetMUBUFLoad(unsigned Opc) { return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET; case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN: return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN: + return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET; case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: @@ -1054,6 +1090,64 @@ static int getOffsetMUBUFLoad(unsigned Opc) { } } +static int getOffenMUBUFStore(unsigned Opc) { + switch (Opc) { + case AMDGPU::BUFFER_STORE_DWORD_OFFSET: + return AMDGPU::BUFFER_STORE_DWORD_OFFEN; + case AMDGPU::BUFFER_STORE_BYTE_OFFSET: + return AMDGPU::BUFFER_STORE_BYTE_OFFEN; + case AMDGPU::BUFFER_STORE_SHORT_OFFSET: + return AMDGPU::BUFFER_STORE_SHORT_OFFEN; + case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: + return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; + case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET: + return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN; + case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET: + return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; + case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET: + return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN; + case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET: + return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN; + default: + return -1; + } +} + +static int getOffenMUBUFLoad(unsigned Opc) { + switch (Opc) { + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: + return AMDGPU::BUFFER_LOAD_DWORD_OFFEN; + case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET: + return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN; + case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET: + return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN; + case AMDGPU::BUFFER_LOAD_USHORT_OFFSET: 
+ return AMDGPU::BUFFER_LOAD_USHORT_OFFEN; + case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET: + return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN; + case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET: + return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; + case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET: + return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN; + case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET: + return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN; + case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET: + return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN; + case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET: + return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN; + case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET: + return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN; + case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET: + return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN; + case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET: + return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN; + case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET: + return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN; + default: + return -1; + } +} + static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -1139,8 +1233,9 @@ static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, unsigned LoadStoreOp, unsigned EltSize) { bool IsStore = TII->get(LoadStoreOp).mayStore(); + bool HasVAddr = AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) != -1; bool UseST = - AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 && + !HasVAddr && AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0; switch (EltSize) { @@ -1164,7 +1259,9 @@ static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, llvm_unreachable("Unexpected spill load/store size!"); } - if (UseST) + if (HasVAddr) + LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp); + else if (UseST) LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); return LoadStoreOp; @@ -1186,6 +1283,7 @@ void SIRegisterInfo::buildSpillLoadStore( bool IsStore = Desc->mayStore(); bool IsFlat = TII->isFLATScratch(LoadStoreOp); + bool CanClobberSCC = false; bool Scavenged = false; MCRegister SOffset = ScratchOffsetReg; @@ -1202,6 +1300,8 @@ void SIRegisterInfo::buildSpillLoadStore( unsigned RemSize = RegWidth - Size; unsigned NumRemSubRegs = RemSize ? 1 : 0; int64_t Offset = InstOffset + MFI.getObjectOffset(Index); + int64_t MaterializedOffset = Offset; + int64_t MaxOffset = Offset + Size + RemSize - EltSize; int64_t ScratchOffsetRegDelta = 0; @@ -1216,6 +1316,42 @@ void SIRegisterInfo::buildSpillLoadStore( assert((IsFlat || ((Offset % EltSize) == 0)) && "unexpected VGPR spill offset"); + // Track a VGPR to use for a constant offset we need to materialize. + Register TmpOffsetVGPR; + + // Track a VGPR to use as an intermediate value. + Register TmpIntermediateVGPR; + bool UseVGPROffset = false; + + // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate + // combination. + auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR, + int64_t VOffset) { + // We are using a VGPR offset + if (IsFlat && SGPRBase) { + // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free + // SGPR, so perform the add as vector. + // We don't need a base SGPR in the kernel. 
+ + if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR) + .addReg(SGPRBase) + .addImm(VOffset) + .addImm(0); // clamp + } else { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(SGPRBase); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR) + .addImm(VOffset) + .addReg(TmpOffsetVGPR); + } + } else { + assert(TmpOffsetVGPR); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addImm(VOffset); + } + }; + bool IsOffsetLegal = IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch) @@ -1223,17 +1359,17 @@ void SIRegisterInfo::buildSpillLoadStore( if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) { SOffset = MCRegister(); - // We currently only support spilling VGPRs to EltSize boundaries, meaning - // we can simplify the adjustment of Offset here to just scale with - // WavefrontSize. - if (!IsFlat) - Offset *= ST.getWavefrontSize(); - // We don't have access to the register scavenger if this function is called // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case. + // TODO: Clobbering SCC is not necessary for scratch instructions in the + // entry. if (RS) { SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); + + // Piggy back on the liveness scan we just did see if SCC is dead. + CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC); } else if (LiveRegs) { + CanClobberSCC = !LiveRegs->contains(AMDGPU::SCC); for (MCRegister Reg : AMDGPU::SGPR_32RegClass) { if (LiveRegs->available(MF->getRegInfo(), Reg)) { SOffset = Reg; @@ -1242,7 +1378,26 @@ void SIRegisterInfo::buildSpillLoadStore( } } + if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC) + SOffset = Register(); + if (!SOffset) { + UseVGPROffset = true; + + if (RS) { + TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); + } else { + assert(LiveRegs); + for (MCRegister Reg : AMDGPU::VGPR_32RegClass) { + if (LiveRegs->available(MF->getRegInfo(), Reg)) { + TmpOffsetVGPR = Reg; + break; + } + } + } + + assert(TmpOffsetVGPR); + } else if (!SOffset && CanClobberSCC) { // There are no free SGPRs, and since we are in the process of spilling // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true // on SI/CI and on VI it is true until we implement spilling using scalar @@ -1250,6 +1405,9 @@ void SIRegisterInfo::buildSpillLoadStore( // add the offset directly to the ScratchOffset or StackPtrOffset // register, and then subtract the offset after the spill to return the // register to it's original value. + + // TODO: If we don't have to do an emergency stack slot spill, converting + // to use the VGPR offset is fewer instructions. if (!ScratchOffsetReg) ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg(); SOffset = ScratchOffsetReg; @@ -1258,12 +1416,22 @@ void SIRegisterInfo::buildSpillLoadStore( Scavenged = true; } - if (!SOffset) + // We currently only support spilling VGPRs to EltSize boundaries, meaning + // we can simplify the adjustment of Offset here to just scale with + // WavefrontSize. 
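A worked example of the WavefrontSize scaling noted above, with illustrative numbers: MUBUF scratch accesses are swizzled per lane, so frame-object offsets are per-lane values, while the SGPR soffset is a byte offset into the unswizzled backing store. unswizzledOffset is a hypothetical helper; the multiply is exactly what the statement after this comment performs:

#include <cstdint>

// A frame object at swizzled offset 16 in a wave64 function sits at byte
// 16 * 64 == 1024 of the scratch backing store; that product is the value
// the scavenged SGPR offset register must hold for a MUBUF access.
static int64_t unswizzledOffset(int64_t SwizzledOffset, unsigned WaveSize) {
  return SwizzledOffset * WaveSize;
}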
+ if (!IsFlat && !UseVGPROffset) + Offset *= ST.getWavefrontSize(); + + if (!UseVGPROffset && !SOffset) report_fatal_error("could not scavenge SGPR to spill in entry function"); - if (ScratchOffsetReg == AMDGPU::NoRegister) { + if (UseVGPROffset) { + // We are using a VGPR offset + MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset); + } else if (ScratchOffsetReg == AMDGPU::NoRegister) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset); } else { + assert(Offset != 0); auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) .addReg(ScratchOffsetReg) .addImm(Offset); @@ -1277,13 +1445,16 @@ void SIRegisterInfo::buildSpillLoadStore( assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 && "Unexpected vaddr for flat scratch with a FI operand"); - assert(ST.hasFlatScratchSTMode()); - LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); + if (UseVGPROffset) { + LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp); + } else { + assert(ST.hasFlatScratchSTMode()); + LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); + } + Desc = &TII->get(LoadStoreOp); } - Register TmpReg; - for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e; ++i, RegOffset += EltSize) { if (i == NumSubRegs) { @@ -1292,6 +1463,22 @@ void SIRegisterInfo::buildSpillLoadStore( } Desc = &TII->get(LoadStoreOp); + if (!IsFlat && UseVGPROffset) { + int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp) + : getOffenMUBUFLoad(LoadStoreOp); + Desc = &TII->get(NewLoadStoreOp); + } + + if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) { + // If we are spilling an AGPR beyond the range of the memory instruction + // offset and need to use a VGPR offset, we ideally have at least 2 + // scratch VGPRs. If we don't have a second free VGPR without spilling, + // recycle the VGPR used for the offset which requires resetting after + // each subregister. + + MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset); + } + unsigned NumRegs = EltSize / 4; Register SubReg = e == 1 ? ValueReg @@ -1300,7 +1487,8 @@ void SIRegisterInfo::buildSpillLoadStore( unsigned SOffsetRegState = 0; unsigned SrcDstRegState = getDefRegState(!IsStore); - if (i + 1 == e) { + const bool IsLastSubReg = i + 1 == e; + if (IsLastSubReg) { SOffsetRegState |= getKillRegState(Scavenged); // The last implicit use carries the "Kill" flag. 
SrcDstRegState |= getKillRegState(IsKill); @@ -1363,21 +1551,26 @@ void SIRegisterInfo::buildSpillLoadStore( if (IsAGPR) { assert(EltSize == 4); - if (!TmpReg) { - assert(RS && "Needs to have RegScavenger to spill an AGPR!"); - // FIXME: change to scavengeRegisterBackwards() - TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); - RS->setRegUsed(TmpReg); + if (!TmpIntermediateVGPR) { + TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy(); + assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR)); } if (IsStore) { auto AccRead = BuildMI(MBB, MI, DL, - TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg) + TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), + TmpIntermediateVGPR) .addReg(SubReg, getKillRegState(IsKill)); if (NeedSuperRegDef) AccRead.addReg(ValueReg, RegState::ImplicitDefine); AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse); } - SubReg = TmpReg; + SubReg = TmpIntermediateVGPR; + } else if (UseVGPROffset) { + // FIXME: change to scavengeRegisterBackwards() + if (!TmpOffsetVGPR) { + TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); + RS->setRegUsed(TmpOffsetVGPR); + } } MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); @@ -1388,12 +1581,26 @@ void SIRegisterInfo::buildSpillLoadStore( auto MIB = BuildMI(MBB, MI, DL, *Desc) .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)); + + if (UseVGPROffset) { + // For an AGPR spill, we reuse the same temp VGPR for the offset and the + // intermediate accvgpr_write. + MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR)); + } + if (!IsFlat) MIB.addReg(FuncInfo->getScratchRSrcReg()); if (SOffset == AMDGPU::NoRegister) { - if (!IsFlat) - MIB.addImm(0); + if (!IsFlat) { + if (UseVGPROffset && ScratchOffsetReg) { + assert(!FuncInfo->isEntryFunction()); + MIB.addReg(ScratchOffsetReg); + } else { + assert(FuncInfo->isEntryFunction()); + MIB.addImm(0); + } + } } else { MIB.addReg(SOffset, SOffsetRegState); } @@ -1407,10 +1614,10 @@ void SIRegisterInfo::buildSpillLoadStore( if (!IsAGPR && NeedSuperRegDef) MIB.addReg(ValueReg, RegState::ImplicitDefine); - if (!IsStore && TmpReg != AMDGPU::NoRegister) { + if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) { MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), FinalReg) - .addReg(TmpReg, RegState::Kill); + .addReg(TmpIntermediateVGPR, RegState::Kill); MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); } @@ -1466,8 +1673,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, bool OnlyToVGPR) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = - SB.MFI.getSGPRToVGPRSpills(Index); + ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRToVGPRSpills(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; @@ -1485,7 +1691,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, SB.NumSubRegs == 1 ? 
SB.SuperReg : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); - SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; + SpilledReg Spill = VGPRSpills[i]; bool UseKill = SB.IsKill && i == SB.NumSubRegs - 1; @@ -1586,8 +1792,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, bool OnlyToVGPR) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = - SB.MFI.getSGPRToVGPRSpills(Index); + ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRToVGPRSpills(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; @@ -1599,7 +1804,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, ? SB.SuperReg : Register(getSubReg(SB.SuperReg, SB.SplitParts[i])); - SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; + SpilledReg Spill = VGPRSpills[i]; auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32), SubReg) .addReg(Spill.VGPR) @@ -1937,18 +2142,23 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, Offset = 0; } - assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) && - "Unexpected vaddr for flat scratch with a FI operand"); - - // On GFX10 we have ST mode to use no registers for an address. - // Otherwise we need to materialize 0 into an SGPR. - if (!Offset && ST.hasFlatScratchSTMode()) { + if (!Offset) { unsigned Opc = MI->getOpcode(); - unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc); - MI->RemoveOperand( - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr)); - MI->setDesc(TII->get(NewOpc)); - return; + int NewOpc = -1; + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) != -1) { + NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc); + } else if (ST.hasFlatScratchSTMode()) { + // On GFX10 we have ST mode to use no registers for an address. + // Otherwise we need to materialize 0 into an SGPR. + NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc); + } + + if (NewOpc != -1) { + MI->removeOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr)); + MI->setDesc(TII->get(NewOpc)); + return; + } } } @@ -2026,57 +2236,78 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (!IsMUBUF && !MFI->isEntryFunction()) { // Convert to a swizzled stack address by scaling by the wave size. - // // In an entry function/kernel the offset is already swizzled. - - bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; - Register ResultReg = - IsCopy ? MI->getOperand(0).getReg() - : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); + bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); + bool LiveSCC = RS->isRegUsed(AMDGPU::SCC); + const TargetRegisterClass *RC = IsSALU && !LiveSCC + ? &AMDGPU::SReg_32RegClass + : &AMDGPU::VGPR_32RegClass; + bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 || + MI->getOpcode() == AMDGPU::V_MOV_B32_e64; + Register ResultReg = IsCopy ? MI->getOperand(0).getReg() + : RS->scavengeRegister(RC, MI, 0); int64_t Offset = FrameInfo.getObjectOffset(Index); if (Offset == 0) { + unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 + : AMDGPU::V_LSHRREV_B32_e64; // XXX - This never happens because of emergency scavenging slot at 0? 
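The IsSALU/LiveSCC plumbing added in this hunk boils down to a single opcode choice: scalar shifts define SCC, so S_LSHR_B32 is only usable when the frame index feeds an SGPR and SCC is free to clobber; every other case shifts on the VALU, with V_READFIRSTLANE_B32 recovering an SGPR result when one is still required. Condensed as a hedged sketch (pickShiftOpcode is hypothetical; the opcode enums are the ones used above):

// Scalar shift only when the result must be scalar *and* SCC may be
// clobbered; otherwise do the work in a VGPR.
static unsigned pickShiftOpcode(bool IsSALU, bool LiveSCC) {
  return (IsSALU && !LiveSCC) ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
}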
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) - .addImm(ST.getWavefrontSizeLog2()) - .addReg(FrameReg); + auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg) + .addImm(ST.getWavefrontSizeLog2()) + .addReg(FrameReg); + if (IsSALU && !LiveSCC) + Shift.getInstr()->getOperand(3).setIsDead( + true); // Mark SCC as dead. + if (IsSALU && LiveSCC) { + Register NewDest = + RS->scavengeRegister(&AMDGPU::SReg_32RegClass, Shift, 0); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + NewDest) + .addReg(ResultReg); + ResultReg = NewDest; + } } else { - if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { - // Reuse ResultReg in intermediate step. - Register ScaledReg = ResultReg; + MachineInstrBuilder MIB; + if (!IsSALU) { + if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) != + nullptr) { + // Reuse ResultReg in intermediate step. + Register ScaledReg = ResultReg; - BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), - ScaledReg) - .addImm(ST.getWavefrontSizeLog2()) - .addReg(FrameReg); + BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), + ScaledReg) + .addImm(ST.getWavefrontSizeLog2()) + .addReg(FrameReg); - const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; + const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; - // TODO: Fold if use instruction is another add of a constant. - if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { - // FIXME: This can fail - MIB.addImm(Offset); - MIB.addReg(ScaledReg, RegState::Kill); - if (!IsVOP2) - MIB.addImm(0); // clamp bit - } else { - assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && - "Need to reuse carry out register"); + // TODO: Fold if use instruction is another add of a constant. + if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { + // FIXME: This can fail + MIB.addImm(Offset); + MIB.addReg(ScaledReg, RegState::Kill); + if (!IsVOP2) + MIB.addImm(0); // clamp bit + } else { + assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 && + "Need to reuse carry out register"); - // Use scavenged unused carry out as offset register. - Register ConstOffsetReg; - if (!isWave32) - ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); - else - ConstOffsetReg = MIB.getReg(1); + // Use scavenged unused carry out as offset register. + Register ConstOffsetReg; + if (!isWave32) + ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); + else + ConstOffsetReg = MIB.getReg(1); - BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) - .addImm(Offset); - MIB.addReg(ConstOffsetReg, RegState::Kill); - MIB.addReg(ScaledReg, RegState::Kill); - MIB.addImm(0); // clamp bit + BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) + .addImm(Offset); + MIB.addReg(ConstOffsetReg, RegState::Kill); + MIB.addReg(ScaledReg, RegState::Kill); + MIB.addImm(0); // clamp bit + } } - } else { + } + if (!MIB || IsSALU) { // We have to produce a carry out, and there isn't a free SGPR pair // for it. We can keep the whole computation on the SALU to avoid // clobbering an additional register at the cost of an extra mov. @@ -2084,7 +2315,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // We may have 1 free scratch SGPR even though a carry out is // unavailable. Only one additional mov is needed. 
Register TmpScaledReg = - RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); + RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) @@ -2093,14 +2324,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) .addReg(ScaledReg, RegState::Kill) .addImm(Offset); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) - .addReg(ScaledReg, RegState::Kill); + if (!IsSALU) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg) + .addReg(ScaledReg, RegState::Kill); + else + ResultReg = ScaledReg; // If there were truly no free SGPRs, we need to undo everything. if (!TmpScaledReg.isValid()) { BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg) - .addReg(ScaledReg, RegState::Kill) - .addImm(-Offset); + .addReg(ScaledReg, RegState::Kill) + .addImm(-Offset); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) .addReg(FrameReg) .addImm(ST.getWavefrontSizeLog2()); @@ -2665,8 +2899,7 @@ MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const const TargetRegisterClass * SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, - const RegisterBank &RB, - const MachineRegisterInfo &MRI) const { + const RegisterBank &RB) const { switch (RB.getID()) { case AMDGPU::VGPRRegBankID: return getVGPRClassForBitWidth(std::max(32u, Size)); @@ -2688,7 +2921,7 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const { const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>()) - return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI); + return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB); if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>()) return getAllocatableClass(RC); @@ -2808,9 +3041,29 @@ bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const { return true; } +const TargetRegisterClass * +SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const { + if (!RC || !ST.needsAlignedVGPRs()) + return RC; + + unsigned Size = getRegSizeInBits(*RC); + if (Size <= 32) + return RC; + + if (isVGPRClass(RC)) + return getAlignedVGPRClassForBitWidth(Size); + if (isAGPRClass(RC)) + return getAlignedAGPRClassForBitWidth(Size); + if (isVectorSuperClass(RC)) + return getAlignedVectorSuperClassForBitWidth(Size); + + return RC; +} + bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { switch (PhysReg) { case AMDGPU::SGPR_NULL: + case AMDGPU::SGPR_NULL64: case AMDGPU::SRC_SHARED_BASE: case AMDGPU::SRC_PRIVATE_BASE: case AMDGPU::SRC_SHARED_LIMIT: diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index f1fe0a1d9329..9bfbc253410b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -51,6 +51,17 @@ private: public: SIRegisterInfo(const GCNSubtarget &ST); + struct SpilledReg { + Register VGPR; + int Lane = -1; + + SpilledReg() = default; + SpilledReg(Register R, int L) : VGPR(R), Lane(L) {} + + bool hasLane() { return Lane != -1; } + bool hasReg() { return VGPR != 0; } + }; + /// \returns the sub reg enum value for the given \p Channel /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1); @@ -64,6 +75,8 @@ public: MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; BitVector getReservedRegs(const MachineFunction &MF) const override; + bool isAsmClobberable(const MachineFunction &MF, + MCRegister PhysReg) const override; const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const; @@ -304,15 +317,11 @@ public: MCRegister getReturnAddressReg(const MachineFunction &MF) const; const TargetRegisterClass * - getRegClassForSizeOnBank(unsigned Size, - const RegisterBank &Bank, - const MachineRegisterInfo &MRI) const; + getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const; const TargetRegisterClass * - getRegClassForTypeOnBank(LLT Ty, - const RegisterBank &Bank, - const MachineRegisterInfo &MRI) const { - return getRegClassForSizeOnBank(Ty.getSizeInBits(), Bank, MRI); + getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const { + return getRegClassForSizeOnBank(Ty.getSizeInBits(), Bank); } const TargetRegisterClass * @@ -377,6 +386,11 @@ public: // the subtarget. bool isProperlyAlignedRC(const TargetRegisterClass &RC) const; + // Given \p RC returns corresponding aligned register class if required + // by the subtarget. + const TargetRegisterClass * + getProperlyAlignedRC(const TargetRegisterClass *RC) const; + /// Return all SGPR128 which satisfy the waves per execution unit requirement /// of the subtarget. ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index eb9452f4b85e..ffe8dce79816 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -97,7 +97,7 @@ class RegSeqNames<int last_reg, int stride, int size, string prefix, []); } -// Generates list of dags for register tupless. +// Generates list of dags for register tuples. class RegSeqDags<RegisterClass RC, int last_reg, int stride, int size, int start = 0> { dag trunc_rc = (trunc RC, @@ -189,7 +189,7 @@ def PC_REG : SIReg<"pc", 0>, DwarfRegNum<[16, 16]> { def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; - let HWEncoding = 106; + let HWEncoding = VCC_LO.HWEncoding; } defm EXEC_LO : SIRegLoHi16<"exec_lo", 126>, DwarfRegNum<[1, 1]>; @@ -198,7 +198,7 @@ defm EXEC_HI : SIRegLoHi16<"exec_hi", 127>; def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, DwarfRegNum<[17, 1]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; - let HWEncoding = 126; + let HWEncoding = EXEC_LO.HWEncoding; } // 32-bit real registers, for MC only. @@ -211,8 +211,23 @@ defm SRC_SCC : SIRegLoHi16<"src_scc", 253>; // Should never be emitted. def SCC : SIReg<"scc">; -defm M0 : SIRegLoHi16 <"m0", 124>; -defm SGPR_NULL : SIRegLoHi16 <"null", 125>; +// Encoding changes between subtarget generations. +// See also Utils/AMDGPUBaseInfo.cpp MAP_REG2REG. 
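A note on the dual definitions that follow: the m0 and null encodings trade places on GFX11 (124 and 125 swap), so each register gets a pre-GFX11 def, a GFX11+ def, and a generic def with encoding 0. Resolving the generic register to the per-generation one happens in C++ once the subtarget is known, conceptually along these lines (resolveM0 is a hypothetical condensation; the real mapping is the macro-generated MAP_REG2REG table in AMDGPUBaseInfo.cpp that the comment cites):

// Illustrative only: pick the per-generation encoding for the generic M0.
static unsigned resolveM0(bool IsGFX11Plus) {
  return IsGFX11Plus ? AMDGPU::M0_gfx11plus : AMDGPU::M0_gfxpre11;
}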
+defm M0_gfxpre11 : SIRegLoHi16 <"m0", 124>; +defm M0_gfx11plus : SIRegLoHi16 <"m0", 125>; +defm M0 : SIRegLoHi16 <"m0", 0>; + +defm SGPR_NULL_gfxpre11 : SIRegLoHi16 <"null", 125>; +defm SGPR_NULL_gfx11plus : SIRegLoHi16 <"null", 124>; +defm SGPR_NULL : SIRegLoHi16 <"null", 0>; +defm SGPR_NULL_HI : SIRegLoHi16 <"", 0>; + +def SGPR_NULL64 : + RegisterWithSubRegs<"null", [SGPR_NULL, SGPR_NULL_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = SGPR_NULL.HWEncoding; +} defm SRC_SHARED_BASE : SIRegLoHi16<"src_shared_base", 235>; defm SRC_SHARED_LIMIT : SIRegLoHi16<"src_shared_limit", 236>; @@ -237,7 +252,7 @@ def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; - let HWEncoding = 104; + let HWEncoding = XNACK_MASK_LO.HWEncoding; } // Trap handler registers @@ -247,7 +262,7 @@ defm TBA_HI : SIRegLoHi16<"tba_hi", 109>; def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; - let HWEncoding = 108; + let HWEncoding = TBA_LO.HWEncoding; } defm TMA_LO : SIRegLoHi16<"tma_lo", 110>; @@ -256,7 +271,7 @@ defm TMA_HI : SIRegLoHi16<"tma_hi", 111>; def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; - let HWEncoding = 110; + let HWEncoding = TMA_LO.HWEncoding; } foreach Index = 0...15 in { @@ -635,16 +650,16 @@ let GeneratePressureSet = 0, HasSGPR = 1 in { // See comments in SIInstructions.td for more info. def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, - SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, - SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID, + SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, + SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID, SRC_VCCZ, SRC_EXECZ, SRC_SCC)> { let AllocationPriority = 10; } def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16, - XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, TTMP_LO16, TMA_LO_LO16, - TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO16, + XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, SGPR_NULL_HI_LO16, TTMP_LO16, + TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO16, SRC_SHARED_LIMIT_LO16, SRC_PRIVATE_BASE_LO16, SRC_PRIVATE_LIMIT_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16)> { let Size = 16; @@ -701,23 +716,6 @@ def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], let HasSGPR = 1; } -// CCR (call clobbered registers) SGPR 64-bit registers -def CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, - (add (trunc SGPR_64, 16))> { - let CopyCost = SGPR_64.CopyCost; - let AllocationPriority = SGPR_64.AllocationPriority; - let HasSGPR = 1; -} - -// Call clobbered 64-bit SGPRs for AMDGPU_Gfx CC -def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, - (add (trunc (shl SGPR_64, 15), 1), // s[30:31] - (trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[s62:63] - let CopyCost = SGPR_64.CopyCost; - let AllocationPriority = SGPR_64.AllocationPriority; - let HasSGPR = 1; -} - def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, 
v4f16], 32, (add TTMP_64Regs)> { let isAllocatable = 0; @@ -725,7 +723,7 @@ def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, } def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, - (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { + (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, TTMP_64, TBA, TMA)> { let CopyCost = 1; let AllocationPriority = 13; let HasSGPR = 1; @@ -788,7 +786,7 @@ defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128R defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; -defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64], SGPR_256Regs, TTMP_256Regs>; +defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>; defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>; defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; @@ -829,7 +827,7 @@ defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>; defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>; -defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>; +defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>; defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>; defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; @@ -856,21 +854,12 @@ defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024) } // End GeneratePressureSet = 0 -// This is not a real register. This is just to have a register to add -// to VReg_1 that does not alias any real register that would -// introduce inferred register classes. -def ARTIFICIAL_VGPR : SIReg <"invalid vgpr", 0> { - let isArtificial = 1; -} - let GeneratePressureSet = 0 in { -// FIXME: Should specify an empty set for this. No register should -// ever be allocated using VReg_1. This is a hack for SelectionDAG -// that should always be lowered by SILowerI1Copies. TableGen crashes -// on an empty register set, but also sorts register classes based on -// the number of registerss in them. Add only one register so this is +// No register should ever be allocated using VReg_1. This is a hack for +// SelectionDAG that should always be lowered by SILowerI1Copies. TableGen +// sorts register classes based on the number of registers in them so this is // sorted to the end and not preferred over VGPR_32. 
-def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add ARTIFICIAL_VGPR)> { +def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add)> { let Size = 1; let HasVGPR = 1; } @@ -913,11 +902,11 @@ defm AV_64 : AVRegClass<2, VReg_64.RegTypes, (add VGPR_64), (add AGPR_64)>; defm AV_96 : AVRegClass<3, VReg_96.RegTypes, (add VGPR_96), (add AGPR_96)>; defm AV_128 : AVRegClass<4, VReg_128.RegTypes, (add VGPR_128), (add AGPR_128)>; defm AV_160 : AVRegClass<5, VReg_160.RegTypes, (add VGPR_160), (add AGPR_160)>; -defm AV_192 : AVRegClass<6, VReg_160.RegTypes, (add VGPR_192), (add AGPR_192)>; -defm AV_224 : AVRegClass<7, VReg_160.RegTypes, (add VGPR_224), (add AGPR_224)>; -defm AV_256 : AVRegClass<8, VReg_160.RegTypes, (add VGPR_256), (add AGPR_256)>; -defm AV_512 : AVRegClass<16, VReg_160.RegTypes, (add VGPR_512), (add AGPR_512)>; -defm AV_1024 : AVRegClass<32, VReg_160.RegTypes, (add VGPR_1024), (add AGPR_1024)>; +defm AV_192 : AVRegClass<6, VReg_192.RegTypes, (add VGPR_192), (add AGPR_192)>; +defm AV_224 : AVRegClass<7, VReg_224.RegTypes, (add VGPR_224), (add AGPR_224)>; +defm AV_256 : AVRegClass<8, VReg_256.RegTypes, (add VGPR_256), (add AGPR_256)>; +defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>; +defm AV_1024 : AVRegClass<32, VReg_1024.RegTypes, (add VGPR_1024), (add AGPR_1024)>; //===----------------------------------------------------------------------===// // Register operands @@ -1087,6 +1076,27 @@ def VRegSrc_32 : RegisterOperand<VGPR_32> { let DecoderMethod = "DecodeVS_32RegisterClass"; } +def VRegSrc_64 : RegisterOperand<VReg_64> { + let DecoderMethod = "decodeOperand_VReg_64"; +} + +def VRegSrc_128 : RegisterOperand<VReg_128> { + let DecoderMethod = "decodeOperand_VReg_128"; +} + +def VRegSrc_256 : RegisterOperand<VReg_256> { + let DecoderMethod = "decodeOperand_VReg_256"; +} + +//===----------------------------------------------------------------------===// +// VGPRSrc_* +//===----------------------------------------------------------------------===// + +// An 8-bit RegisterOperand wrapper for a VGPR +def VGPRSrc_32 : RegisterOperand<VGPR_32> { + let DecoderMethod = "DecodeVGPR_32RegisterClass"; +} + //===----------------------------------------------------------------------===// // ASrc_* Operands with an AccVGPR //===----------------------------------------------------------------------===// @@ -1116,7 +1126,7 @@ defm VISrc_512 : RegInlineOperandAC<"VReg", "VISrc_512", "_512">; defm VISrc_1024 : RegInlineOperandAC<"VReg", "VISrc_1024", "_1024">; //===----------------------------------------------------------------------===// -// AVSrc_* Operands with an AGPR or VGPR +// AVSrc_*, AVDst_*, AVLdSt_* Operands with an AGPR or VGPR //===----------------------------------------------------------------------===// def AVSrc_32 : RegisterOperand<AV_32> { @@ -1129,6 +1139,21 @@ def AVSrc_64 : RegisterOperand<AV_64> { let EncoderMethod = "getAVOperandEncoding"; } +def AVSrc_128 : RegisterOperand<AV_128> { + let DecoderMethod = "DecodeAV_128RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +def AVDst_128 : RegisterOperand<AV_128> { + let DecoderMethod = "DecodeAVDst_128RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +def AVDst_512 : RegisterOperand<AV_512> { + let DecoderMethod = "DecodeAVDst_512RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + def AVLdSt_32 : RegisterOperand<AV_32> { let DecoderMethod = "DecodeAVLdSt_32RegisterClass"; let EncoderMethod = "getAVOperandEncoding"; diff --git 
a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 18d424a3bc9f..53441b5a4ced 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -59,6 +59,7 @@ def WriteIntMul : SchedWrite; // mAI multipass instructions. def Write2PassMAI : SchedWrite; +def Write4PassMAI : SchedWrite; def Write8PassMAI : SchedWrite; def Write16PassMAI : SchedWrite; def Write4PassDGEMM : SchedWrite; @@ -86,7 +87,9 @@ class SISchedMachineModel : SchedMachineModel { def SIFullSpeedModel : SISchedMachineModel; def SIQuarterSpeedModel : SISchedMachineModel; def SIDPFullSpeedModel : SISchedMachineModel; +def SIDPGFX940FullSpeedModel : SISchedMachineModel; def GFX10SpeedModel : SISchedMachineModel; +def GFX11SpeedModel : SISchedMachineModel; // XXX: Are the resource counts correct? def HWBranch : ProcResource<1> { @@ -156,6 +159,8 @@ multiclass SICommonWriteRes { let ResourceCycles = [2] in def : HWWriteRes<Write2PassMAI, [HWXDL], 2>; + let ResourceCycles = [4] in + def : HWWriteRes<Write4PassMAI, [HWXDL], 4>; let ResourceCycles = [8] in def : HWWriteRes<Write8PassMAI, [HWXDL], 8>; let ResourceCycles = [16] in @@ -244,6 +249,40 @@ def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; } // End SchedModel = SIDPFullSpeedModel +let SchedModel = SIDPGFX940FullSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes<WriteFloatFMA, 1>; +def : HWVALUWriteRes<WriteDouble, 1>; +def : HWVALUWriteRes<WriteDoubleAdd, 1>; +def : HWVALUWriteRes<WriteDoubleCvt, 1>; +def : HWVALUWriteRes<WriteTrans64, 4>; +def : HWVALUWriteRes<WriteIntMul, 1>; +def : HWVALUWriteRes<Write64Bit, 1>; + +def : InstRW<[WriteCopy], (instrs COPY)>; +def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; +def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>; + +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X8X")>; +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X16")>; +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X32")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X[14][FBI]")>; + +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X4XF")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X8")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X16")>; +def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X[124][FBI]")>; + +def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>; +def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; + +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_16X16X")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>; + +} // End SchedModel = SIDPGFX940FullSpeedModel + let SchedModel = GFX10SpeedModel in { // The latency values are 1 / (operations / cycle). 
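The SIDPGFX940FullSpeedModel InstRW entries above encode a pass count per MFMA tile shape, with deeper-K tiles needing fewer XDL passes. A rough C++ reconstruction of that mapping (the helper name and shape parameters are illustrative, and the 32x32x4xf32 form is an exception that its own pattern pins to 8 passes rather than 16):

// Shapes are MxNxK with M == N; IsF64Acc selects the DGEMM (f64) pipes.
unsigned mfmaXdlPasses(unsigned M, unsigned K, bool IsF64Acc) {
  if (IsF64Acc)
    return M == 4 ? 4 : 8;   // Write4PassDGEMM / Write8PassDGEMM
  if (M == 4)
    return 2;                // Write2PassMAI
  if (M == 16)
    return K >= 8 ? 4 : 8;   // Write4PassMAI / Write8PassMAI
  return K >= 8 ? 8 : 16;    // 32x32: Write8PassMAI / Write16PassMAI
}

The same rule covers the V_SMFMAC entries (16x16 at 4 passes, 32x32 at 8, both deep-K), and SICommonWriteRes keeps ResourceCycles equal to the pass count, so an N-pass MFMA also occupies HWXDL for N cycles.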
@@ -273,3 +312,29 @@ def : HWWriteRes<WriteBarrier, [HWBranch], 2000>; def : InstRW<[WriteCopy], (instrs COPY)>; } // End SchedModel = GFX10SpeedModel + +let SchedModel = GFX11SpeedModel in { + +def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>; +def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>; +def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>; +def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>; +def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>; +def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>; +def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 38>; +def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 38>; +def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 38>; +def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>; +def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 40>; + +def : HWWriteRes<WriteBranch, [HWBranch], 32>; +def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>; +def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>; +def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>; +def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>; +def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>; +def : HWWriteRes<WriteBarrier, [HWBranch], 2000>; + +def : InstRW<[WriteCopy], (instrs COPY)>; + +} // End SchedModel = GFX11SpeedModel diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index c8f1daf26de9..05d2dd000162 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -26,15 +26,40 @@ using namespace llvm; namespace { class SIShrinkInstructions : public MachineFunctionPass { + MachineRegisterInfo *MRI; + const GCNSubtarget *ST; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + public: static char ID; - void shrinkMIMG(MachineInstr &MI); - public: SIShrinkInstructions() : MachineFunctionPass(ID) { } + bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const; + bool isKImmOperand(const MachineOperand &Src) const; + bool isKUImmOperand(const MachineOperand &Src) const; + bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const; + bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const; + void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const; + void shrinkScalarCompare(MachineInstr &MI) const; + void shrinkMIMG(MachineInstr &MI) const; + void shrinkMadFma(MachineInstr &MI) const; + bool shrinkScalarLogicOp(MachineInstr &MI) const; + bool tryReplaceDeadSDST(MachineInstr &MI) const; + bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, + Register Reg, unsigned SubReg) const; + bool instReadsReg(const MachineInstr *MI, unsigned Reg, + unsigned SubReg) const; + bool instModifiesReg(const MachineInstr *MI, unsigned Reg, + unsigned SubReg) const; + TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub, + unsigned I) const; + void dropInstructionKeepingImpDefs(MachineInstr &MI) const; + MachineInstr *matchSwap(MachineInstr &MovT) const; + bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { return "SI Shrink Instructions"; } @@ -59,8 +84,8 @@ FunctionPass *llvm::createSIShrinkInstructionsPass() { /// This function checks \p MI for operands defined by a move immediate /// instruction and then folds the literal constant into the instruction if it /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
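To make the behavioral change in this hunk concrete: with the old hasOneUse restriction gone, the literal can be folded into each user, and the defining move is deleted only once its last non-debug use disappears. Illustrative MIR, plus a toy sketch of the bookkeeping (hypothetical types, not LLVM's API):

//   %1 = V_MOV_B32_e32 1234
//   %2 = V_ADD_F32_e32 %1, %0   -->   %2 = V_ADD_F32_e32 1234, %0
struct ImmDef { long Imm; unsigned NonDbgUses; }; // hypothetical stand-in

// Fold the literal into one user; report whether the def is now dead.
bool foldOneUse(ImmDef &Def, long &UserImmSlot) {
  UserImmSlot = Def.Imm;
  return --Def.NonDbgUses == 0; // caller may erase the V_MOV when true
}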
-static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, - MachineRegisterInfo &MRI, bool TryToCommute = true) { +bool SIShrinkInstructions::foldImmediates(MachineInstr &MI, + bool TryToCommute) const { assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); @@ -69,8 +94,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, MachineOperand &Src0 = MI.getOperand(Src0Idx); if (Src0.isReg()) { Register Reg = Src0.getReg(); - if (Reg.isVirtual() && MRI.hasOneUse(Reg)) { - MachineInstr *Def = MRI.getUniqueVRegDef(Reg); + if (Reg.isVirtual()) { + MachineInstr *Def = MRI->getUniqueVRegDef(Reg); if (Def && Def->isMoveImmediate()) { MachineOperand &MovSrc = Def->getOperand(1); bool ConstantFolded = false; @@ -91,8 +116,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, } if (ConstantFolded) { - assert(MRI.use_empty(Reg)); - Def->eraseFromParent(); + if (MRI->use_nodbg_empty(Reg)) + Def->eraseFromParent(); ++NumLiteralConstantsFolded; return true; } @@ -103,7 +128,7 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, // We have failed to fold src0, so commute the instruction and try again. if (TryToCommute && MI.isCommutable()) { if (TII->commuteInstruction(MI)) { - if (foldImmediates(MI, TII, MRI, false)) + if (foldImmediates(MI, false)) return true; // Commute back. @@ -114,21 +139,20 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, return false; } -static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { +bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const { return isInt<16>(Src.getImm()) && !TII->isInlineConstant(*Src.getParent(), Src.getParent()->getOperandNo(&Src)); } -static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { +bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const { return isUInt<16>(Src.getImm()) && !TII->isInlineConstant(*Src.getParent(), Src.getParent()->getOperandNo(&Src)); } -static bool isKImmOrKUImmOperand(const SIInstrInfo *TII, - const MachineOperand &Src, - bool &IsUnsigned) { +bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src, + bool &IsUnsigned) const { if (isInt<16>(Src.getImm())) { IsUnsigned = false; return !TII->isInlineConstant(Src); @@ -144,9 +168,8 @@ static bool isKImmOrKUImmOperand(const SIInstrInfo *TII, /// \returns true if the constant in \p Src should be replaced with a bitreverse /// of an inline immediate. -static bool isReverseInlineImm(const SIInstrInfo *TII, - const MachineOperand &Src, - int32_t &ReverseImm) { +bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src, + int32_t &ReverseImm) const { if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src)) return false; @@ -156,8 +179,9 @@ static bool isReverseInlineImm(const SIInstrInfo *TII, /// Copy implicit register operands from specified instruction to this /// instruction that are not part of the instruction definition. 
-static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF, - const MachineInstr &MI) { +void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI, + MachineInstr &MI) const { + MachineFunction &MF = *MI.getMF(); for (unsigned i = MI.getDesc().getNumOperands() + MI.getDesc().getNumImplicitUses() + MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands(); @@ -168,7 +192,7 @@ static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF, } } -static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { +void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to // get constants on the RHS. if (!MI.getOperand(0).isReg()) @@ -191,7 +215,7 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { // and initially selected to the unsigned versions. if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) { bool HasUImm; - if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) { + if (isKImmOrKUImmOperand(Src1, HasUImm)) { if (!HasUImm) { SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ? AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32; @@ -205,22 +229,30 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { const MCInstrDesc &NewDesc = TII->get(SOPKOpc); - if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) || - (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) { + if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) || + (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) { MI.setDesc(NewDesc); } } // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. -void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { +void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); - if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) + if (!Info) return; - MachineFunction *MF = MI.getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); + uint8_t NewEncoding; + switch (Info->MIMGEncoding) { + case AMDGPU::MIMGEncGfx10NSA: + NewEncoding = AMDGPU::MIMGEncGfx10Default; + break; + case AMDGPU::MIMGEncGfx11NSA: + NewEncoding = AMDGPU::MIMGEncGfx11Default; + break; + default: + return; + } + int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); unsigned NewAddrDwords = Info->VAddrDwords; @@ -246,16 +278,23 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { } unsigned VgprBase = 0; + unsigned NextVgpr = 0; bool IsUndef = true; bool IsKill = NewAddrDwords == Info->VAddrDwords; - for (unsigned i = 0; i < Info->VAddrDwords; ++i) { - const MachineOperand &Op = MI.getOperand(VAddr0Idx + i); - unsigned Vgpr = TRI.getHWRegIndex(Op.getReg()); + for (unsigned Idx = 0; Idx < Info->VAddrOperands; ++Idx) { + const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx); + unsigned Vgpr = TRI->getHWRegIndex(Op.getReg()); + unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32; + assert(Dwords > 0 && "Un-implemented for less than 32 bit regs"); - if (i == 0) { + if (Idx == 0) { VgprBase = Vgpr; - } else if (VgprBase + i != Vgpr) + NextVgpr = Vgpr + Dwords; + } else if (Vgpr == NextVgpr) { + NextVgpr = Vgpr + Dwords; + } else { return; + } if (!Op.isUndef()) IsUndef = false; @@ -288,21 +327,108 @@ void 
SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { } } - unsigned NewOpcode = - AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default, - Info->VDataDwords, NewAddrDwords); + unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding, + Info->VDataDwords, NewAddrDwords); MI.setDesc(TII->get(NewOpcode)); MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase)); MI.getOperand(VAddr0Idx).setIsUndef(IsUndef); MI.getOperand(VAddr0Idx).setIsKill(IsKill); - for (unsigned i = 1; i < Info->VAddrDwords; ++i) - MI.RemoveOperand(VAddr0Idx + 1); + for (int i = 1; i < Info->VAddrOperands; ++i) + MI.removeOperand(VAddr0Idx + 1); if (ToUntie >= 0) { MI.tieOperands( AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata), - ToUntie - (Info->VAddrDwords - 1)); + ToUntie - (Info->VAddrOperands - 1)); + } +} + +// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK. +void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { + if (!ST->hasVOP3Literal()) + return; + + if (TII->hasAnyModifiersSet(MI)) + return; + + const unsigned Opcode = MI.getOpcode(); + MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0); + MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1); + MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2); + unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END; + + bool Swap; + + // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form. + if (Src2.isImm() && !TII->isInlineConstant(Src2)) { + if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg())) + Swap = false; + else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg())) + Swap = true; + else + return; + + switch (Opcode) { + default: + llvm_unreachable("Unexpected mad/fma opcode!"); + case AMDGPU::V_MAD_F32_e64: + NewOpcode = AMDGPU::V_MADAK_F32; + break; + case AMDGPU::V_FMA_F32_e64: + NewOpcode = AMDGPU::V_FMAAK_F32; + break; + case AMDGPU::V_MAD_F16_e64: + NewOpcode = AMDGPU::V_MADAK_F16; + break; + case AMDGPU::V_FMA_F16_e64: + NewOpcode = AMDGPU::V_FMAAK_F16; + break; + } + } + + // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form. + if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) { + if (Src1.isImm() && !TII->isInlineConstant(Src1)) + Swap = false; + else if (Src0.isImm() && !TII->isInlineConstant(Src0)) + Swap = true; + else + return; + + switch (Opcode) { + default: + llvm_unreachable("Unexpected mad/fma opcode!"); + case AMDGPU::V_MAD_F32_e64: + NewOpcode = AMDGPU::V_MADMK_F32; + break; + case AMDGPU::V_FMA_F32_e64: + NewOpcode = AMDGPU::V_FMAMK_F32; + break; + case AMDGPU::V_MAD_F16_e64: + NewOpcode = AMDGPU::V_MADMK_F16; + break; + case AMDGPU::V_FMA_F16_e64: + NewOpcode = AMDGPU::V_FMAMK_F16; + break; + } + } + + if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) + return; + + if (Swap) { + // Swap Src0 and Src1 by building a new instruction. + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode), + MI.getOperand(0).getReg()) + .add(Src1) + .add(Src0) + .add(Src2) + .setMIFlags(MI.getFlags()); + MI.eraseFromParent(); + } else { + TII->removeModOperands(MI); + MI.setDesc(TII->get(NewOpcode)); } } @@ -311,10 +437,7 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { /// If the inverse of the immediate is legal, use ANDN2, ORN2 or /// XNOR (as a ^ b == ~(a ^ ~b)). 
/// \returns true if the caller should continue the machine function iterator -static bool shrinkScalarLogicOp(const GCNSubtarget &ST, - MachineRegisterInfo &MRI, - const SIInstrInfo *TII, - MachineInstr &MI) { +bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); const MachineOperand *Dest = &MI.getOperand(0); MachineOperand *Src0 = &MI.getOperand(1); @@ -323,7 +446,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, MachineOperand *SrcImm = Src1; if (!SrcImm->isImm() || - AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) + AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm())) return false; uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm()); @@ -333,7 +456,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, if (isPowerOf2_32(~Imm)) { NewImm = countTrailingOnes(Imm); Opc = AMDGPU::S_BITSET0_B32; - } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) { NewImm = ~Imm; Opc = AMDGPU::S_ANDN2_B32; } @@ -341,12 +464,12 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, if (isPowerOf2_32(Imm)) { NewImm = countTrailingZeros(Imm); Opc = AMDGPU::S_BITSET1_B32; - } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) { NewImm = ~Imm; Opc = AMDGPU::S_ORN2_B32; } } else if (Opc == AMDGPU::S_XOR_B32) { - if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) { NewImm = ~Imm; Opc = AMDGPU::S_XNOR_B32; } @@ -354,16 +477,10 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, llvm_unreachable("unexpected opcode"); } - if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) && - SrcImm == Src0) { - if (!TII->commuteInstruction(MI, false, 1, 2)) - NewImm = 0; - } - if (NewImm != 0) { if (Dest->getReg().isVirtual() && SrcReg->isReg()) { - MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); - MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); + MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); + MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); return true; } @@ -390,19 +507,19 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, // This is the same as MachineInstr::readsRegister/modifiesRegister except // it takes subregs into account. 
-static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, - Register Reg, unsigned SubReg, - const SIRegisterInfo &TRI) { +bool SIShrinkInstructions::instAccessReg( + iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg, + unsigned SubReg) const { for (const MachineOperand &MO : R) { if (!MO.isReg()) continue; if (Reg.isPhysical() && MO.getReg().isPhysical()) { - if (TRI.regsOverlap(Reg, MO.getReg())) + if (TRI->regsOverlap(Reg, MO.getReg())) return true; } else if (MO.getReg() == Reg && Reg.isVirtual()) { - LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) & - TRI.getSubRegIndexLaneMask(MO.getSubReg()); + LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) & + TRI->getSubRegIndexLaneMask(MO.getSubReg()); if (Overlap.any()) return true; } @@ -410,33 +527,31 @@ static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, return false; } -static bool instReadsReg(const MachineInstr *MI, - unsigned Reg, unsigned SubReg, - const SIRegisterInfo &TRI) { - return instAccessReg(MI->uses(), Reg, SubReg, TRI); +bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg, + unsigned SubReg) const { + return instAccessReg(MI->uses(), Reg, SubReg); } -static bool instModifiesReg(const MachineInstr *MI, - unsigned Reg, unsigned SubReg, - const SIRegisterInfo &TRI) { - return instAccessReg(MI->defs(), Reg, SubReg, TRI); +bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg, + unsigned SubReg) const { + return instAccessReg(MI->defs(), Reg, SubReg); } -static TargetInstrInfo::RegSubRegPair -getSubRegForIndex(Register Reg, unsigned Sub, unsigned I, - const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) { - if (TRI.getRegSizeInBits(Reg, MRI) != 32) { +TargetInstrInfo::RegSubRegPair +SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub, + unsigned I) const { + if (TRI->getRegSizeInBits(Reg, *MRI) != 32) { if (Reg.isPhysical()) { - Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I)); + Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I)); } else { - Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub)); + Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub)); } } return TargetInstrInfo::RegSubRegPair(Reg, Sub); } -static void dropInstructionKeepingImpDefs(MachineInstr &MI, - const SIInstrInfo *TII) { +void SIShrinkInstructions::dropInstructionKeepingImpDefs( + MachineInstr &MI) const { for (unsigned i = MI.getDesc().getNumOperands() + MI.getDesc().getNumImplicitUses() + MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands(); @@ -464,14 +579,13 @@ static void dropInstructionKeepingImpDefs(MachineInstr &MI, // Returns next valid instruction pointer if was able to create v_swap_b32. // // This shall not be done too early not to prevent possible folding which may -// remove matched moves, and this should prefereably be done before RA to +// remove matched moves, and this should preferably be done before RA to // release saved registers and also possibly after RA which can insert copies // too. // -// This is really just a generic peephole that is not a canocical shrinking, +// This is really just a generic peephole that is not a canonical shrinking, // although requirements match the pass placement and it reduces code size too. 
-static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, - const SIInstrInfo *TII) { +MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 || MovT.getOpcode() == AMDGPU::COPY); @@ -486,8 +600,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, unsigned Size = TII->getOpSize(MovT, 0) / 4; - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - if (!TRI.isVGPR(MRI, X)) + if (!TRI->isVGPR(*MRI, X)) return nullptr; if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0)) @@ -501,7 +614,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) { MachineInstr *MovY = &*Iter; - KilledT = MovY->killsRegister(T, &TRI); + KilledT = MovY->killsRegister(T, TRI); if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 && MovY->getOpcode() != AMDGPU::COPY) || @@ -514,21 +627,20 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, Register Y = MovY->getOperand(0).getReg(); unsigned Ysub = MovY->getOperand(0).getSubReg(); - if (!TRI.isVGPR(MRI, Y)) + if (!TRI->isVGPR(*MRI, Y)) continue; MachineInstr *MovX = nullptr; for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator()); I != IY; ++I) { - if (instReadsReg(&*I, X, Xsub, TRI) || - instModifiesReg(&*I, Y, Ysub, TRI) || - instModifiesReg(&*I, T, Tsub, TRI) || - (MovX && instModifiesReg(&*I, X, Xsub, TRI))) { + if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) || + instModifiesReg(&*I, T, Tsub) || + (MovX && instModifiesReg(&*I, X, Xsub))) { MovX = nullptr; break; } - if (!instReadsReg(&*I, Y, Ysub, TRI)) { - if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) { + if (!instReadsReg(&*I, Y, Ysub)) { + if (!MovX && instModifiesReg(&*I, X, Xsub)) { MovX = nullptr; break; } @@ -559,8 +671,8 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, for (unsigned I = 0; I < Size; ++I) { TargetInstrInfo::RegSubRegPair X1, Y1; - X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI); - Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI); + X1 = getSubRegForIndex(X, Xsub, I); + Y1 = getSubRegForIndex(Y, Ysub, I); MachineBasicBlock &MBB = *MovT.getParent(); auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(), TII->get(AMDGPU::V_SWAP_B32)) @@ -570,23 +682,23 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, .addReg(X1.Reg, 0, X1.SubReg).getInstr(); if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { // Drop implicit EXEC. 
- MIB->RemoveOperand(MIB->getNumExplicitOperands()); + MIB->removeOperand(MIB->getNumExplicitOperands()); MIB->copyImplicitOps(*MBB.getParent(), *MovX); } } MovX->eraseFromParent(); - dropInstructionKeepingImpDefs(*MovY, TII); + dropInstructionKeepingImpDefs(*MovY); MachineInstr *Next = &*std::next(MovT.getIterator()); - if (T.isVirtual() && MRI.use_nodbg_empty(T)) { - dropInstructionKeepingImpDefs(MovT, TII); + if (T.isVirtual() && MRI->use_nodbg_empty(T)) { + dropInstructionKeepingImpDefs(MovT); } else { Xop.setIsKill(false); for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I ) { unsigned OpNo = MovT.getNumExplicitOperands() + I; const MachineOperand &Op = MovT.getOperand(OpNo); - if (Op.isKill() && TRI.regsOverlap(X, Op.getReg())) - MovT.RemoveOperand(OpNo); + if (Op.isKill() && TRI->regsOverlap(X, Op.getReg())) + MovT.removeOperand(OpNo); } } @@ -596,14 +708,32 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, return nullptr; } +// If an instruction has a dead sdst, replace it with the NULL register on gfx1030+. +bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const { + if (!ST->hasGFX10_3Insts()) + return false; + + MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + if (!Op) + return false; + Register SDstReg = Op->getReg(); + if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg)) + return false; + + Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64); + return true; +} + bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - MachineRegisterInfo &MRI = MF.getRegInfo(); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; + MRI = &MF.getRegInfo(); + ST = &MF.getSubtarget<GCNSubtarget>(); + TII = ST->getInstrInfo(); + TRI = &TII->getRegisterInfo(); + + unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; std::vector<unsigned> I1Defs; @@ -628,7 +758,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineOperand &Src = MI.getOperand(1); if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) { int32_t ReverseImm; - if (isReverseInlineImm(TII, Src, ReverseImm)) { + if (isReverseInlineImm(Src, ReverseImm)) { MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); Src.setImm(ReverseImm); continue; } } } - if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || MI.getOpcode() == AMDGPU::COPY)) { - if (auto *NextMI = matchSwap(MI, MRI, TII)) { + if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || MI.getOpcode() == AMDGPU::COPY)) { + if (auto *NextMI = matchSwap(MI)) { Next = NextMI->getIterator(); continue; } } - // FIXME: We also need to consider movs of constant operands since - // immediate operands are not folded if they have more than one use, and - // the operand folding pass is unaware if the immediate will be free since - // it won't know if the src == dest constraint will end up being - // satisfied. + // Try to use S_ADDK_I32 and S_MULK_I32.
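As a worked example of the SOPK rewrite that follows: s_add_i32 s0, s0, 0x1234 becomes s_addk_i32 s0, 0x1234, dropping a literal dword from the encoding. A hedged C++ sketch of the legality check (the helper is illustrative, mirroring the Dst == Src0 and isKImmOperand conditions below):

#include <cstdint>

// Legal only when Dst == Src0 (SOPK is destructive) and the literal fits
// in signed 16 bits without already being a free inline constant.
bool canUseSOPK(unsigned Dst, unsigned Src0, int64_t Imm, bool IsInline) {
  return Dst == Src0 && Imm >= INT16_MIN && Imm <= INT16_MAX && !IsInline;
}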
if (MI.getOpcode() == AMDGPU::S_ADD_I32 || MI.getOpcode() == AMDGPU::S_MUL_I32) { const MachineOperand *Dest = &MI.getOperand(0); @@ -664,13 +790,13 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // we have a vector add of a constant, we usually don't get the correct // allocation due to the subregister usage. if (Dest->getReg().isVirtual() && Src0->isReg()) { - MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg()); - MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); + MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg()); + MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); continue; } if (Src0->isReg() && Src0->getReg() == Dest->getReg()) { - if (Src1->isImm() && isKImmOperand(TII, *Src1)) { + if (Src1->isImm() && isKImmOperand(*Src1)) { unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; @@ -682,7 +808,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // Try to use s_cmpk_* if (MI.isCompare() && TII->isSOPC(MI)) { - shrinkScalarCompare(TII, MI); + shrinkScalarCompare(MI); continue; } @@ -693,9 +819,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (Src.isImm() && Dst.getReg().isPhysical()) { int32_t ReverseImm; - if (isKImmOperand(TII, Src)) + if (isKImmOperand(Src)) MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); - else if (isReverseInlineImm(TII, Src, ReverseImm)) { + else if (isReverseInlineImm(Src, ReverseImm)) { MI.setDesc(TII->get(AMDGPU::S_BREV_B32)); Src.setImm(ReverseImm); } @@ -708,47 +834,70 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (MI.getOpcode() == AMDGPU::S_AND_B32 || MI.getOpcode() == AMDGPU::S_OR_B32 || MI.getOpcode() == AMDGPU::S_XOR_B32) { - if (shrinkScalarLogicOp(ST, MRI, TII, MI)) + if (shrinkScalarLogicOp(MI)) continue; } if (TII->isMIMG(MI.getOpcode()) && - ST.getGeneration() >= AMDGPUSubtarget::GFX10 && + ST->getGeneration() >= AMDGPUSubtarget::GFX10 && MF.getProperties().hasProperty( MachineFunctionProperties::Property::NoVRegs)) { shrinkMIMG(MI); continue; } - if (!TII->hasVALU32BitEncoding(MI.getOpcode())) + if (!TII->isVOP3(MI)) + continue; + + if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 || + MI.getOpcode() == AMDGPU::V_FMA_F32_e64 || + MI.getOpcode() == AMDGPU::V_MAD_F16_e64 || + MI.getOpcode() == AMDGPU::V_FMA_F16_e64) { + shrinkMadFma(MI); continue; + } + + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) { + // If there is no chance we will shrink it and use VCC as sdst to get + // a 32 bit form try to replace dead sdst with NULL. + tryReplaceDeadSDST(MI); + continue; + } - if (!TII->canShrink(MI, MRI)) { + if (!TII->canShrink(MI, *MRI)) { // Try commuting the instruction and see if that enables us to shrink // it. if (!MI.isCommutable() || !TII->commuteInstruction(MI) || - !TII->canShrink(MI, MRI)) + !TII->canShrink(MI, *MRI)) { + tryReplaceDeadSDST(MI); continue; + } } int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); if (TII->isVOPC(Op32)) { - Register DstReg = MI.getOperand(0).getReg(); - if (DstReg.isVirtual()) { - // VOPC instructions can only write to the VCC register. We can't - // force them to use VCC here, because this is only one register and - // cannot deal with sequences which would require multiple copies of - // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) 
- // - // So, instead of forcing the instruction to write to VCC, we provide - // a hint to the register allocator to use VCC and then we will run - // this pass again after RA and shrink it if it outputs to VCC. - MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg); - continue; + MachineOperand &Op0 = MI.getOperand(0); + if (Op0.isReg()) { + // Exclude VOPCX instructions as these don't explicitly write a + // dst. + Register DstReg = Op0.getReg(); + if (DstReg.isVirtual()) { + // VOPC instructions can only write to the VCC register. We can't + // force them to use VCC here, because this is only one register and + // cannot deal with sequences which would require multiple copies of + // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) + // + // So, instead of forcing the instruction to write to VCC, we + // provide a hint to the register allocator to use VCC and then we + // will run this pass again after RA and shrink it if it outputs to + // VCC. + MRI->setRegAllocationHint(DstReg, 0, VCCReg); + continue; + } + if (DstReg != VCCReg) + continue; } - if (DstReg != VCCReg) - continue; } if (Op32 == AMDGPU::V_CNDMASK_B32_e32) { @@ -760,7 +909,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; Register SReg = Src2->getReg(); if (SReg.isVirtual()) { - MRI.setRegAllocationHint(SReg, 0, VCCReg); + MRI->setRegAllocationHint(SReg, 0, VCCReg); continue; } if (SReg != VCCReg) @@ -776,7 +925,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (SDst->getReg() != VCCReg) { if (SDst->getReg().isVirtual()) - MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg); + MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg); Next = true; } @@ -786,7 +935,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { AMDGPU::OpName::src2); if (Src2 && Src2->getReg() != VCCReg) { if (Src2->getReg().isVirtual()) - MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg); + MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg); Next = true; } @@ -801,14 +950,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { ++NumInstructionsShrunk; // Copy extra operands not present in the instruction definition. - copyExtraImplicitOps(*Inst32, MF, MI); + copyExtraImplicitOps(*Inst32, MI); // Copy deadness from the old explicit vcc def to the new implicit def. 
if (SDst && SDst->isDead()) Inst32->findRegisterDefOperand(VCCReg)->setIsDead(); MI.eraseFromParent(); - foldImmediates(*Inst32, TII, MRI); + foldImmediates(*Inst32); LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 46efb3c605c6..a5798afab595 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -349,8 +349,7 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, const VNInfo *NextValue = nullptr; const VisitKey Key(Value, DefinedLanes); - if (!Visited.count(Key)) { - Visited.insert(Key); + if (Visited.insert(Key).second) { // On first visit to a phi then start processing first predecessor NextPredIdx = 0; } @@ -535,13 +534,36 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, GlobalFlags |= StateStrictWWM; LowerToMovInstrs.push_back(&MI); continue; - } else if (Opcode == AMDGPU::STRICT_WQM) { + } else if (Opcode == AMDGPU::STRICT_WQM || + TII->isDualSourceBlendEXP(MI)) { // STRICT_WQM is similar to STRICTWWM, but instead of enabling all // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in // quads that have at least one active thread. markInstructionUses(MI, StateStrictWQM, Worklist); GlobalFlags |= StateStrictWQM; - LowerToMovInstrs.push_back(&MI); + + if (Opcode == AMDGPU::STRICT_WQM) { + LowerToMovInstrs.push_back(&MI); + } else { + // Dual source blend export acts as implicit strict-wqm, its sources + // need to be shuffled in strict wqm, but the export itself needs to + // run in exact mode. + BBI.Needs |= StateExact; + if (!(BBI.InNeeds & StateExact)) { + BBI.InNeeds |= StateExact; + Worklist.push_back(MBB); + } + GlobalFlags |= StateExact; + III.Disabled = StateWQM | StateStrict; + } + continue; + } else if (Opcode == AMDGPU::LDS_PARAM_LOAD || + Opcode == AMDGPU::LDS_DIRECT_LOAD) { + // Mark these STRICTWQM, but only for the instruction, not its operands. + // This avoids unnecessarily marking M0 as requiring WQM. + InstrInfo &II = Instructions[&MI]; + II.Needs |= StateStrictWQM; + GlobalFlags |= StateStrictWQM; continue; } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || Opcode == AMDGPU::V_SET_INACTIVE_B64) { @@ -969,7 +991,7 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, MachineInstr *WQMMaskMI = nullptr; Register LiveMaskWQM; if (IsDemote) { - // Demotes deactive quads with only helper lanes + // Demote - deactivate quads with only helper lanes LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC()); WQMMaskMI = BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg); .addReg(Exec) .addReg(LiveMaskWQM); } else { - // Kills deactivate lanes + // Kill - deactivate lanes no longer in live mask if (Op.isImm()) { unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0); @@ -1453,7 +1475,7 @@ void SIWholeQuadMode::lowerCopyInstrs() { } int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC); while (Index >= 0) { - MI->RemoveOperand(Index); + MI->removeOperand(Index); Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC); } MI->setDesc(TII->get(AMDGPU::COPY)); @@ -1468,7 +1490,7 @@ void SIWholeQuadMode::lowerCopyInstrs() { // an undef input so it is being replaced by a simple copy. // There should be a second undef source that we should remove.
assert(MI->getOperand(2).isUndef()); - MI->RemoveOperand(2); + MI->removeOperand(2); MI->untieRegOperand(1); } else { assert(MI->getNumExplicitOperands() == 2); @@ -1588,11 +1610,11 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { // Physical registers like SCC aren't tracked by default anyway, so just // removing the ranges we computed is the simplest option for maintaining // the analysis results. - LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC); // If we performed any kills then recompute EXEC if (!KillInstrs.empty()) - LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); + LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); return true; } diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 184c871db775..882d13402a19 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -11,13 +11,19 @@ def smrd_offset_8 : NamedOperandU32<"SMRDOffset8", let OperandType = "OPERAND_IMMEDIATE"; } -def smem_offset : NamedOperandU32<"SMEMOffset", - NamedMatchClass<"SMEMOffset">> { +class SMEMOffset : NamedOperandU32<"SMEMOffset", + NamedMatchClass<"SMEMOffset">> { let OperandType = "OPERAND_IMMEDIATE"; let EncoderMethod = "getSMEMOffsetEncoding"; let DecoderMethod = "decodeSMEMOffset"; } +def smem_offset : SMEMOffset; + +def smem_offset_mod : SMEMOffset { + let PrintMethod = "printSMEMOffsetMod"; +} + //===----------------------------------------------------------------------===// // Scalar Memory classes //===----------------------------------------------------------------------===// @@ -43,13 +49,13 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt bits<1> has_sdst = 1; bit has_glc = 0; bit has_dlc = 0; - bits<1> has_offset = 1; - bits<1> offset_is_imm = 0; + bit has_offset = 0; + bit has_soffset = 0; bit is_buffer = 0; } -class SM_Real <SM_Pseudo ps> - : InstSI<ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> { +class SM_Real <SM_Pseudo ps, string opName = ps.Mnemonic> + : InstSI<ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands> { let isPseudo = 0; let isCodeGenOnly = 0; @@ -77,20 +83,40 @@ class SM_Real <SM_Pseudo ps> bits<7> sbase; bits<7> sdst; bits<32> offset; - bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0); + bits<8> soffset; bits<5> cpol; } -class SM_Probe_Pseudo <string opName, dag ins, bit isImm> - : SM_Pseudo<opName, (outs), ins, " $sdata, $sbase, $offset"> { +class OffsetMode<bit hasOffset, bit hasSOffset, string variant, + dag ins, string asm> { + bit HasOffset = hasOffset; + bit HasSOffset = hasSOffset; + string Variant = variant; + dag Ins = ins; + string Asm = asm; +} + +def IMM_Offset : OffsetMode<1, 0, "_IMM", (ins smem_offset:$offset), "$offset">; +def SGPR_Offset : OffsetMode<0, 1, "_SGPR", (ins SReg_32:$soffset), "$soffset">; +def SGPR_IMM_Offset : OffsetMode<1, 1, "_SGPR_IMM", + (ins SReg_32:$soffset, smem_offset_mod:$offset), + "$soffset$offset">; + +class SM_Probe_Pseudo <string opName, string variant, RegisterClass baseClass, + dag offsets, string asmOffsets, + bit hasOffset, bit hasSOffset> + : SM_Pseudo<opName, (outs), + !con((ins i8imm:$sdata, baseClass:$sbase), offsets), + " $sdata, $sbase, " # asmOffsets> { let mayLoad = 0; let mayStore = 0; let has_glc = 0; let LGKM_CNT = 0; let ScalarStore = 0; let hasSideEffects = 1; - let offset_is_imm = isImm; - let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR"); + 
let has_offset = hasOffset; + let has_soffset = hasSOffset; + let PseudoInstr = opName # variant; } class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> @@ -102,10 +128,11 @@ class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> let has_dlc = 1; } -class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern = []> - : SM_Pseudo<opName, (outs), ins, asmOps, pattern> { - RegisterClass BaseClass; - RegisterClass SrcClass; +class SM_Store_Pseudo <string opName, RegisterClass baseClass, + RegisterClass srcClass, dag ins, string asmOps> + : SM_Pseudo<opName, (outs), ins, asmOps, []> { + RegisterClass BaseClass = baseClass; + RegisterClass SrcClass = srcClass; let mayLoad = 0; let mayStore = 1; let has_glc = 1; @@ -113,16 +140,19 @@ class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern let ScalarStore = 1; } -class SM_Discard_Pseudo <string opName, dag ins, bit isImm> - : SM_Pseudo<opName, (outs), ins, " $sbase, $offset"> { +class SM_Discard_Pseudo <string opName, string variant, dag offsets, + string asmOffsets, bit hasOffset, bit hasSOffset> + : SM_Pseudo<opName, (outs), !con((ins SReg_64:$sbase), offsets), + " $sbase, " # asmOffsets> { let mayLoad = 0; let mayStore = 0; let has_glc = 0; let has_sdst = 0; let ScalarStore = 0; let hasSideEffects = 1; - let offset_is_imm = isImm; - let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR"); + let has_offset = hasOffset; + let has_soffset = hasSOffset; + let PseudoInstr = opName # variant; } multiclass SM_Pseudo_Loads<string opName, @@ -132,7 +162,7 @@ multiclass SM_Pseudo_Loads<string opName, (outs dstClass:$sdst), (ins baseClass:$sbase, i32imm:$offset, CPol:$cpol), " $sdst, $sbase, $offset$cpol", []> { - let offset_is_imm = 1; + let has_offset = 1; let BaseClass = baseClass; let PseudoInstr = opName # "_IMM"; let has_glc = 1; @@ -141,39 +171,63 @@ multiclass SM_Pseudo_Loads<string opName, def _SGPR : SM_Load_Pseudo <opName, (outs dstClass:$sdst), - (ins baseClass:$sbase, SReg_32:$soff, CPol:$cpol), - " $sdst, $sbase, $offset$cpol", []> { + (ins baseClass:$sbase, SReg_32:$soffset, CPol:$cpol), + " $sdst, $sbase, $soffset$cpol", []> { + let has_soffset = 1; let BaseClass = baseClass; let PseudoInstr = opName # "_SGPR"; let has_glc = 1; let has_dlc = 1; } + + def _SGPR_IMM : SM_Load_Pseudo <opName, + (outs dstClass:$sdst), + (ins baseClass:$sbase, SReg_32:$soffset, + i32imm:$offset, CPol:$cpol), + " $sdst, $sbase, $soffset$offset$cpol", []> { + let has_offset = 1; + let has_soffset = 1; + let BaseClass = baseClass; + let PseudoInstr = opName # "_SGPR_IMM"; + let has_glc = 1; + let has_dlc = 1; + } } multiclass SM_Pseudo_Stores<string opName, RegisterClass baseClass, RegisterClass srcClass> { - def _IMM : SM_Store_Pseudo <opName, + def _IMM : SM_Store_Pseudo <opName, baseClass, srcClass, (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, CPol:$cpol), - " $sdata, $sbase, $offset$cpol", []> { - let offset_is_imm = 1; - let BaseClass = baseClass; - let SrcClass = srcClass; + " $sdata, $sbase, $offset$cpol"> { + let has_offset = 1; let PseudoInstr = opName # "_IMM"; } - def _SGPR : SM_Store_Pseudo <opName, - (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, CPol:$cpol), - " $sdata, $sbase, $offset$cpol", []> { - let BaseClass = baseClass; - let SrcClass = srcClass; + def _SGPR : SM_Store_Pseudo <opName, baseClass, srcClass, + (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soffset, CPol:$cpol), + " $sdata, $sbase, $soffset$cpol"> { + 
let has_soffset = 1; let PseudoInstr = opName # "_SGPR"; } + + def _SGPR_IMM : SM_Store_Pseudo <opName, baseClass, srcClass, + (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soffset, i32imm:$offset, + CPol:$cpol), + " $sdata, $sbase, $soffset$offset$cpol"> { + let has_offset = 1; + let has_soffset = 1; + let PseudoInstr = opName # "_SGPR_IMM"; + } } multiclass SM_Pseudo_Discards<string opName> { - def _IMM : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, smem_offset:$offset), 1>; - def _SGPR : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, SReg_32:$offset), 0>; + def _IMM : SM_Discard_Pseudo <opName, "_IMM", + (ins smem_offset:$offset), "$offset", 1, 0>; + def _SGPR : SM_Discard_Pseudo <opName, "_SGPR", + (ins SReg_32:$soffset), "$soffset", 0, 1>; + def _SGPR_IMM : SM_Discard_Pseudo <opName, "_SGPR_IMM", + (ins SReg_32:$soffset, smem_offset_mod:$offset), "$soffset$offset", 1, 1>; } class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pseudo< @@ -184,21 +238,24 @@ class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pse let mayStore = 0; let mayLoad = 0; let has_sbase = 0; - let has_offset = 0; } class SM_Inval_Pseudo <string opName, SDPatternOperator node = null_frag> : SM_Pseudo< opName, (outs), (ins), "", [(node)]> { let hasSideEffects = 1; + let mayLoad = 0; let mayStore = 0; let has_sdst = 0; let has_sbase = 0; - let has_offset = 0; } multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> { - def _IMM : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, smem_offset:$offset), 1>; - def _SGPR : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, SReg_32:$offset), 0>; + def _IMM : SM_Probe_Pseudo <opName, "_IMM", baseClass, + (ins smem_offset:$offset), "$offset", 1, 0>; + def _SGPR : SM_Probe_Pseudo <opName, "_SGPR", baseClass, + (ins SReg_32:$soffset), "$soffset", 0, 1>; + def _SGPR_IMM : SM_Probe_Pseudo <opName, "_SGPR_IMM", baseClass, + (ins SReg_32:$soffset, smem_offset_mod:$offset), "$soffset$offset", 1, 1>; } class SM_WaveId_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo< @@ -206,9 +263,8 @@ class SM_WaveId_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo< " $sdst", [(set i32:$sdst, (node))]> { let hasSideEffects = 1; let mayStore = 0; - let mayLoad = 1; + let mayLoad = 0; let has_sbase = 0; - let has_offset = 0; } //===----------------------------------------------------------------------===// @@ -225,6 +281,7 @@ class SM_Atomic_Pseudo <string opName, let mayStore = 1; let has_glc = 1; let has_dlc = 1; + let has_soffset = 1; // Should these be set? 
let ScalarStore = 1; @@ -240,21 +297,21 @@ class SM_Atomic_Pseudo <string opName, class SM_Pseudo_Atomic<string opName, RegisterClass baseClass, RegisterClass dataClass, - bit isImm, + OffsetMode offsets, bit isRet, - string opNameWithSuffix = opName # !if(isImm, - !if(isRet, "_IMM_RTN", "_IMM"), - !if(isRet, "_SGPR_RTN", "_SGPR")), + string opNameWithSuffix = + opName # offsets.Variant # !if(isRet, "_RTN", ""), Operand CPolTy = !if(isRet, CPol_GLC1, CPol)> : SM_Atomic_Pseudo<opName, !if(isRet, (outs dataClass:$sdst), (outs)), - !if(isImm, - (ins dataClass:$sdata, baseClass:$sbase, smem_offset:$offset, CPolTy:$cpol), - (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset, CPolTy:$cpol)), - !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset$cpol", + !con((ins dataClass:$sdata, baseClass:$sbase), offsets.Ins, + (ins CPolTy:$cpol)), + !if(isRet, " $sdst", " $sdata") # + ", $sbase, " # offsets.Asm # "$cpol", isRet>, AtomicNoRet <opNameWithSuffix, isRet> { - let offset_is_imm = isImm; + let has_offset = offsets.HasOffset; + let has_soffset = offsets.HasSOffset; let PseudoInstr = opNameWithSuffix; let Constraints = !if(isRet, "$sdst = $sdata", ""); @@ -264,10 +321,12 @@ class SM_Pseudo_Atomic<string opName, multiclass SM_Pseudo_Atomics<string opName, RegisterClass baseClass, RegisterClass dataClass> { - def _IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, 1, 0>; - def _SGPR : SM_Pseudo_Atomic <opName, baseClass, dataClass, 0, 0>; - def _IMM_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, 1, 1>; - def _SGPR_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, 0, 1>; + def _IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, IMM_Offset, 0>; + def _SGPR : SM_Pseudo_Atomic <opName, baseClass, dataClass, SGPR_Offset, 0>; + def _SGPR_IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, SGPR_IMM_Offset, 0>; + def _IMM_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, IMM_Offset, 1>; + def _SGPR_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, SGPR_Offset, 1>; + def _SGPR_IMM_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, SGPR_IMM_Offset, 1>; } //===----------------------------------------------------------------------===// @@ -452,16 +511,14 @@ class SMRD_Real_si <bits<5> op, SM_Pseudo ps> let AssemblerPredicate = isGFX6GFX7; let DecoderNamespace = "GFX6GFX7"; - let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?); - let Inst{8} = imm; + let Inst{7-0} = !if(ps.has_offset, offset{7-0}, !if(ps.has_soffset, soffset, ?)); + let Inst{8} = ps.has_offset; let Inst{14-9} = !if(ps.has_sbase, sbase{6-1}, ?); let Inst{21-15} = !if(ps.has_sdst, sdst{6-0}, ?); let Inst{26-22} = op; let Inst{31-27} = 0x18; //encoding } -// FIXME: Assembler should reject trying to use glc on SMRD -// instructions on SI. 
multiclass SM_Real_Loads_si<bits<5> op, string ps, SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM), SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { @@ -470,10 +527,8 @@ multiclass SM_Real_Loads_si<bits<5> op, string ps, let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, CPol:$cpol); } - // FIXME: The operand name $offset is inconsistent with $soff used - // in the pseudo def _SGPR_si : SMRD_Real_si <op, sgprPs> { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol); } } @@ -494,42 +549,82 @@ def S_DCACHE_INV_si : SMRD_Real_si <0x1f, S_DCACHE_INV>; //===----------------------------------------------------------------------===// -// VI +// VI and GFX9. //===----------------------------------------------------------------------===// class SMEM_Real_vi <bits<8> op, SM_Pseudo ps> : SM_Real<ps> , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> , Enc64 { - let AssemblerPredicate = isGFX8GFX9; + field bit IsGFX9SpecificEncoding = false; + let AssemblerPredicate = !if(IsGFX9SpecificEncoding, isGFX9Only, isGFX8GFX9); let DecoderNamespace = "GFX8"; let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?); + // Note that for GFX9 instructions with immediate offsets, soffset_en + // must be defined, whereas in GFX8 it's undefined in all cases, + // meaning GFX9 is not perfectly backward-compatible with GFX8, despite + // documentation suggesting otherwise. + field bit SOffsetEn = !if(IsGFX9SpecificEncoding, + !if(ps.has_offset, ps.has_soffset, !if(ps.has_soffset, 0, ?)), + ?); + let Inst{14} = SOffsetEn; + let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?); - let Inst{17} = imm; + + // imm + // TODO: Should be left undefined if the instruction has neither an + // offset nor a soffset. + let Inst{17} = ps.has_offset; + let Inst{25-18} = op; let Inst{31-26} = 0x30; //encoding // VI supports 20-bit unsigned offsets while GFX9+ supports 21-bit signed. // Offset value is corrected accordingly when offset is encoded/decoded. - let Inst{38-32} = !if(ps.has_offset, offset{6-0}, ?); - let Inst{52-39} = !if(ps.has_offset, !if(imm, offset{20-7}, ?), ?); + // TODO: Forbid non-M0 register offsets for GFX8 stores and atomics. + field bits<21> Offset; + let Offset{6-0} = !if(ps.has_offset, offset{6-0}, + !if(ps.has_soffset, soffset{6-0}, ?)); + let Offset{20-7} = !if(ps.has_offset, offset{20-7}, ?); + let Inst{52-32} = Offset; + + // soffset + let Inst{63-57} = !if(!and(IsGFX9SpecificEncoding, ps.has_soffset), + soffset{6-0}, ?); } -multiclass SM_Real_Loads_vi<bits<8> op, string ps, - SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM), - SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { - def _IMM_vi : SMEM_Real_vi <op, immPs> { - let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol); - } - def _SGPR_vi : SMEM_Real_vi <op, sgprPs> { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); - } +class SMEM_Real_Load_vi<bits<8> op, string ps, dag offsets> + : SMEM_Real_vi<op, !cast<SM_Pseudo>(ps)> { RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps).BaseClass; let InOperandList = !con((ins BaseClass:$sbase), offsets, (ins CPol:$cpol)); } -class SMEM_Real_Store_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> { +// The alternative GFX9 SGPR encoding using soffset to encode the +// offset register.
Not available in assembler and goes to the GFX9 +// encoding family to avoid conflicts with the primary SGPR variant. +class SMEM_Real_SGPR_alt_gfx9 { + bit IsGFX9SpecificEncoding = true; + bit SOffsetEn = 1; + bit Offset = ?; + int Subtarget = SIEncodingFamily.GFX9; + string AsmVariantName = "NonParsable"; +} + +multiclass SM_Real_Loads_vi<bits<8> op, string ps> { + def _IMM_vi : SMEM_Real_Load_vi <op, ps#"_IMM", (ins smem_offset:$offset)>; + def _SGPR_vi : SMEM_Real_Load_vi <op, ps#"_SGPR", (ins SReg_32:$soffset)>; + def _SGPR_alt_gfx9 : SMEM_Real_Load_vi <op, ps#"_SGPR", + (ins SReg_32:$soffset)>, + SMEM_Real_SGPR_alt_gfx9; + let IsGFX9SpecificEncoding = true in + def _SGPR_IMM_gfx9 : SMEM_Real_Load_vi < + op, ps#"_SGPR_IMM", (ins SReg_32:$soffset, smem_offset_mod:$offset)>; +} + +class SMEM_Real_Store_Base_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> { // encoding bits<7> sdata; @@ -537,23 +632,34 @@ class SMEM_Real_Store_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> { let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?); } -multiclass SM_Real_Stores_vi<bits<8> op, string ps, - SM_Store_Pseudo immPs = !cast<SM_Store_Pseudo>(ps#_IMM), - SM_Store_Pseudo sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR)> { - // FIXME: The operand name $offset is inconsistent with $soff used - // in the pseudo - def _IMM_vi : SMEM_Real_Store_vi <op, immPs> { - let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol); - } +class SMEM_Real_Store_vi <bits<8> op, string ps, dag offsets> + : SMEM_Real_Store_Base_vi <op, !cast<SM_Pseudo>(ps)> { + RegisterClass SrcClass = !cast<SM_Store_Pseudo>(ps).SrcClass; + RegisterClass BaseClass = !cast<SM_Store_Pseudo>(ps).BaseClass; + let InOperandList = !con((ins SrcClass:$sdata, BaseClass:$sbase), + offsets, (ins CPol:$cpol)); +} - def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> { - let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); - } +multiclass SM_Real_Stores_vi<bits<8> op, string ps> { + def _IMM_vi : SMEM_Real_Store_vi <op, ps#_IMM, (ins smem_offset:$offset)>; + def _SGPR_vi : SMEM_Real_Store_vi <op, ps#_SGPR, (ins SReg_32:$soffset)>; + def _SGPR_alt_gfx9 : SMEM_Real_Store_vi <op, ps#"_SGPR", + (ins SReg_32:$soffset)>, + SMEM_Real_SGPR_alt_gfx9; + let IsGFX9SpecificEncoding = true in + def _SGPR_IMM_gfx9 : SMEM_Real_Store_vi < + op, ps#"_SGPR_IMM", (ins SReg_32:$soffset, smem_offset_mod:$offset)>; } multiclass SM_Real_Probe_vi<bits<8> op, string ps> { - def _IMM_vi : SMEM_Real_Store_vi <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>; - def _SGPR_vi : SMEM_Real_Store_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>; + def _IMM_vi : SMEM_Real_Store_Base_vi <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>; + def _SGPR_vi : SMEM_Real_Store_Base_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>; + def _SGPR_alt_gfx9 + : SMEM_Real_Store_Base_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>, + SMEM_Real_SGPR_alt_gfx9; + let IsGFX9SpecificEncoding = true in + def _SGPR_IMM_gfx9 + : SMEM_Real_Store_Base_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR_IMM)>; } defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00, "S_LOAD_DWORD">; @@ -614,8 +720,20 @@ class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps> multiclass SM_Real_Atomics_vi<bits<8> op, string ps> { def _IMM_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>; def _SGPR_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>; + def _SGPR_alt_gfx9 + : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>, + SMEM_Real_SGPR_alt_gfx9; + let 
IsGFX9SpecificEncoding = true in + def _SGPR_IMM_gfx9 + : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_IMM)>; def _IMM_RTN_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM_RTN)>; def _SGPR_RTN_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_RTN)>; + def _SGPR_RTN_alt_gfx9 + : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_RTN)>, + SMEM_Real_SGPR_alt_gfx9; + let IsGFX9SpecificEncoding = true in + def _SGPR_IMM_RTN_gfx9 + : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_IMM_RTN)>; } defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_vi <0x40, "S_BUFFER_ATOMIC_SWAP">; @@ -677,6 +795,10 @@ defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0xac, "S_ATOMIC_DEC_X2"> multiclass SM_Real_Discard_vi<bits<8> op, string ps> { def _IMM_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_IMM)>; def _SGPR_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR)>; + def _SGPR_alt_gfx9 : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR)>, + SMEM_Real_SGPR_alt_gfx9; + let IsGFX9SpecificEncoding = true in + def _SGPR_IMM_gfx9 : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR_IMM)>; } defm S_DCACHE_DISCARD : SM_Real_Discard_vi <0x28, "S_DCACHE_DISCARD">; @@ -727,8 +849,8 @@ class SMRD_Real_ci <bits<5> op, SM_Pseudo ps> let AssemblerPredicate = isGFX7Only; let DecoderNamespace = "GFX7"; - let Inst{7-0} = !if(ps.has_offset, offset{7-0}, ?); - let Inst{8} = imm; + let Inst{7-0} = !if(ps.has_offset, offset{7-0}, !if(ps.has_soffset, soffset, ?)); + let Inst{8} = ps.has_offset; let Inst{14-9} = !if(ps.has_sbase, sbase{6-1}, ?); let Inst{21-15} = !if(ps.has_sdst, sdst{6-0}, ?); let Inst{26-22} = op; @@ -876,20 +998,27 @@ def : GCNPat < // GFX10. //===----------------------------------------------------------------------===// -class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps> : - SM_Real<ps>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX10>, Enc64 { - let AssemblerPredicate = isGFX10Plus; - let DecoderNamespace = "GFX10"; - +class SMEM_Real_10Plus_common<bits<8> op, SM_Pseudo ps, string opName, + int subtarget, RegisterWithSubRegs sgpr_null> : + SM_Real<ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>, Enc64 { let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?); let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?); - let Inst{14} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ?); - let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?); let Inst{25-18} = op; let Inst{31-26} = 0x3d; - let Inst{52-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{20-0}, ?), ?); - let Inst{63-57} = !if(ps.offset_is_imm, !cast<int>(SGPR_NULL.HWEncoding), - !if(ps.has_offset, offset{6-0}, ?)); + // There are SMEM instructions that do not employ any of the offset + // fields, in which case we need them to remain undefined. 
+ let Inst{52-32} = !if(ps.has_offset, offset{20-0}, !if(ps.has_soffset, 0, ?)); + let Inst{63-57} = !if(ps.has_soffset, soffset{6-0}, + !if(ps.has_offset, sgpr_null.HWEncoding{6-0}, ?)); +} + +class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps> + : SMEM_Real_10Plus_common<op, ps, ps.Mnemonic, SIEncodingFamily.GFX10, + SGPR_NULL_gfxpre11> { + let AssemblerPredicate = isGFX10Only; + let DecoderNamespace = "GFX10"; + let Inst{14} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ?); + let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?); } multiclass SM_Real_Loads_gfx10<bits<8> op, string ps, @@ -899,7 +1028,11 @@ multiclass SM_Real_Loads_gfx10<bits<8> op, string ps, let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol); } def _SGPR_gfx10 : SMEM_Real_gfx10<op, sgprPs> { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol); + } + def _SGPR_IMM_gfx10 : SMEM_Real_gfx10<op, !cast<SM_Load_Pseudo>(ps#_SGPR_IMM)> { + let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset, + smem_offset_mod:$offset, CPol:$cpol); } } @@ -913,14 +1046,17 @@ class SMEM_Real_Store_gfx10<bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx10<op, ps> multiclass SM_Real_Stores_gfx10<bits<8> op, string ps, SM_Store_Pseudo immPs = !cast<SM_Store_Pseudo>(ps#_IMM), SM_Store_Pseudo sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR)> { - // FIXME: The operand name $offset is inconsistent with $soff used - // in the pseudo def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs> { let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol); } def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs> { - let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, CPol:$cpol); + let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol); + } + + def _SGPR_IMM_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Store_Pseudo>(ps#_SGPR_IMM)> { + let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, + SReg_32:$soffset, smem_offset_mod:$offset, CPol:$cpol); } } @@ -969,6 +1105,8 @@ def S_DCACHE_WB_gfx10 : SMEM_Real_gfx10<0x021, S_DCACHE_WB>; multiclass SM_Real_Probe_gfx10<bits<8> op, string ps> { def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_IMM)>; def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR)>; + def _SGPR_IMM_gfx10 + : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR_IMM)>; } defm S_ATC_PROBE : SM_Real_Probe_gfx10 <0x26, "S_ATC_PROBE">; @@ -992,8 +1130,10 @@ class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps> multiclass SM_Real_Atomics_gfx10<bits<8> op, string ps> { def _IMM_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>; def _SGPR_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>; + def _SGPR_IMM_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_IMM)>; def _IMM_RTN_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_IMM_RTN)>; def _SGPR_RTN_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_RTN)>; + def _SGPR_IMM_RTN_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_IMM_RTN)>; } let SubtargetPredicate = HasScalarAtomics in { @@ -1057,6 +1197,7 @@ defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0xac, "S_ATOMIC_DEC_X multiclass SM_Real_Discard_gfx10<bits<8> op, string ps> { def _IMM_gfx10 : SMEM_Real_gfx10 <op, 
!cast<SM_Pseudo>(ps#_IMM)>; def _SGPR_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR)>; + def _SGPR_IMM_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR_IMM)>; } defm S_DCACHE_DISCARD : SM_Real_Discard_gfx10 <0x28, "S_DCACHE_DISCARD">; @@ -1072,3 +1213,64 @@ def SMInfoTable : GenericTable { let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getSMEMOpcodeHelper"; } + +//===----------------------------------------------------------------------===// +// GFX11. +//===----------------------------------------------------------------------===// + +class SMEM_Real_gfx11<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> : + SMEM_Real_10Plus_common<op, ps, opName, SIEncodingFamily.GFX11, + SGPR_NULL_gfx11plus> { + let AssemblerPredicate = isGFX11Plus; + let DecoderNamespace = "GFX11"; + let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, 0); + let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, 0); +} + +class SMEM_Real_Load_gfx11<bits<8> op, string ps, string opName, dag offsets> : + SMEM_Real_gfx11<op, !cast<SM_Pseudo>(ps), opName> { + RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps).BaseClass; + let InOperandList = !con((ins BaseClass:$sbase), offsets, (ins CPol:$cpol)); +} + +multiclass SM_Real_Loads_gfx11<bits<8> op, string ps, string opName> { + def _IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_IMM", opName, (ins smem_offset:$offset)>; + def _SGPR_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_SGPR", opName, (ins SReg_32:$soffset)>; + def _SGPR_IMM_gfx11 : SMEM_Real_Load_gfx11< + op, ps#"_SGPR_IMM", opName, (ins SReg_32:$soffset, smem_offset_mod:$offset)>; + def : MnemonicAlias<!cast<SM_Pseudo>(ps#"_IMM").Mnemonic, opName>, + Requires<[isGFX11Plus]>; +} + +defm S_LOAD_B32 : SM_Real_Loads_gfx11<0x000, "S_LOAD_DWORD", "s_load_b32">; +defm S_LOAD_B64 : SM_Real_Loads_gfx11<0x001, "S_LOAD_DWORDX2", "s_load_b64">; +defm S_LOAD_B128 : SM_Real_Loads_gfx11<0x002, "S_LOAD_DWORDX4", "s_load_b128">; +defm S_LOAD_B256 : SM_Real_Loads_gfx11<0x003, "S_LOAD_DWORDX8", "s_load_b256">; +defm S_LOAD_B512 : SM_Real_Loads_gfx11<0x004, "S_LOAD_DWORDX16", "s_load_b512">; + +defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx11<0x008, "S_BUFFER_LOAD_DWORD", "s_buffer_load_b32">; +defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx11<0x009, "S_BUFFER_LOAD_DWORDX2", "s_buffer_load_b64">; +defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx11<0x00a, "S_BUFFER_LOAD_DWORDX4", "s_buffer_load_b128">; +defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx11<0x00b, "S_BUFFER_LOAD_DWORDX8", "s_buffer_load_b256">; +defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx11<0x00c, "S_BUFFER_LOAD_DWORDX16", "s_buffer_load_b512">; + +def S_GL1_INV_gfx11 : SMEM_Real_gfx11<0x020, S_GL1_INV>; +def S_DCACHE_INV_gfx11 : SMEM_Real_gfx11<0x021, S_DCACHE_INV>; + +class SMEM_Real_Store_gfx11 <bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx11<op, ps> { + // encoding + bits<7> sdata; + + let sdst = ?; + let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?); +} + +multiclass SM_Real_Probe_gfx11<bits<8> op, string ps> { + def _IMM_gfx11 : SMEM_Real_Store_gfx11 <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>; + def _SGPR_gfx11 : SMEM_Real_Store_gfx11 <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>; + def _SGPR_IMM_gfx11 + : SMEM_Real_Store_gfx11 <op, !cast<SM_Probe_Pseudo>(ps#_SGPR_IMM)>; +} + +defm S_ATC_PROBE : SM_Real_Probe_gfx11 <0x22, "S_ATC_PROBE">; +defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx11 <0x23, "S_ATC_PROBE_BUFFER">; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 3f7837f7dbf1..37d20045adb5 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td 
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -152,8 +152,8 @@ class SOP1_64_0 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < } // 64-bit input, no output -class SOP1_1 <string opName, RegisterClass rc = SReg_64, list<dag> pattern=[]> : SOP1_Pseudo < - opName, (outs), (ins rc:$src0), "$src0", pattern> { +class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < + opName, (outs), (ins SReg_64:$src0), "$src0", pattern> { let has_sdst = 0; } @@ -235,10 +235,10 @@ def : GCNPat < let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def S_BREV_B32 : SOP1_32 <"s_brev_b32", - [(set i32:$sdst, (bitreverse i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<bitreverse> i32:$src0))] >; def S_BREV_B64 : SOP1_64 <"s_brev_b64", - [(set i64:$sdst, (bitreverse i64:$src0))] + [(set i64:$sdst, (UniformUnaryFrag<bitreverse> i64:$src0))] >; } // End isReMaterializable = 1, isAsCheapAsAMove = 1 @@ -276,10 +276,10 @@ def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32", >; def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">; def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8", - [(set i32:$sdst, (sext_inreg i32:$src0, i8))] + [(set i32:$sdst, (UniformSextInreg<i8> i32:$src0))] >; def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16", - [(set i32:$sdst, (sext_inreg i32:$src0, i16))] + [(set i32:$sdst, (UniformSextInreg<i16> i32:$src0))] >; } // End isReMaterializable = 1 @@ -300,8 +300,7 @@ def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">; let isReturn = 1 in { // Define variant marked as return rather than branch. -def S_SETPC_B64_return : SOP1_1<"", CCR_SGPR_64, [(AMDGPUret_flag i64:$src0)]>; -def S_SETPC_B64_return_gfx : SOP1_1<"", Gfx_CCR_SGPR_64, [(AMDGPUret_gfx_flag i64:$src0)]>; +def S_SETPC_B64_return : SOP1_1<"">; } } // End isTerminator = 1, isBarrier = 1 @@ -341,7 +340,7 @@ def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">; let Defs = [SCC] in { def S_ABS_I32 : SOP1_32 <"s_abs_i32", - [(set i32:$sdst, (abs i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<abs> i32:$src0))] >; } // End Defs = [SCC] @@ -385,6 +384,21 @@ let SubtargetPredicate = isGFX10Plus in { } // End Uses = [M0] } // End SubtargetPredicate = isGFX10Plus +let SubtargetPredicate = isGFX11Plus in { + let hasSideEffects = 1 in { + // For s_sendmsg_rtn_* the src0 field encodes the message type directly; it + // is not an SGPR number. 
+ def S_SENDMSG_RTN_B32 : SOP1_Pseudo< + "s_sendmsg_rtn_b32", (outs SReg_32:$sdst), (ins SendMsgImm:$src0), + "$sdst, $src0", [(set i32:$sdst, (int_amdgcn_s_sendmsg_rtn timm:$src0))] + >; + def S_SENDMSG_RTN_B64 : SOP1_Pseudo< + "s_sendmsg_rtn_b64", (outs SReg_64:$sdst), (ins SendMsgImm:$src0), + "$sdst, $src0", [(set i64:$sdst, (int_amdgcn_s_sendmsg_rtn timm:$src0))] + >; + } +} // End SubtargetPredicate = isGFX11Plus + //===----------------------------------------------------------------------===// // SOP2 Instructions //===----------------------------------------------------------------------===// @@ -690,6 +704,10 @@ let SubtargetPredicate = isGFX9Plus in { } // End isCommutable = 1, isReMaterializable = 1 } // End SubtargetPredicate = isGFX9Plus +let SubtargetPredicate = isGFX11Plus in { + def S_PACK_HL_B32_B16 : SOP2_32<"s_pack_hl_b32_b16">; +} // End SubtargetPredicate = isGFX11Plus + //===----------------------------------------------------------------------===// // SOPK Instructions //===----------------------------------------------------------------------===// @@ -855,9 +873,7 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo < "$sdst, $simm16" >; -let mayLoad = 1 in { -// s_getreg_b32 should use hasSideEffects = 1 for tablegen to allow -// its use in the readcyclecounter selection. +// This is hasSideEffects to allow its use in readcyclecounter selection. // FIXME: Need to truncate immediate to 16-bits. def S_GETREG_B32 : SOPK_Pseudo < "s_getreg_b32", @@ -867,7 +883,6 @@ def S_GETREG_B32 : SOPK_Pseudo < let SOPKZext = 1; let hasSideEffects = 1; } -} // End mayLoad = 1 let Defs = [MODE], Uses = [MODE] in { @@ -1169,12 +1184,12 @@ def S_ENDPGM_SAVED : SOPP_Pseudo<"s_endpgm_saved", (ins)> { let isReturn = 1; } -let SubtargetPredicate = isGFX9Plus in { +let SubtargetPredicate = isGFX9GFX10 in { let isBarrier = 1, isReturn = 1, simm16 = 0, fixed_imm = 1 in { def S_ENDPGM_ORDERED_PS_DONE : SOPP_Pseudo<"s_endpgm_ordered_ps_done", (ins)>; } // End isBarrier = 1, isReturn = 1, simm16 = 0, fixed_imm = 1 -} // End SubtargetPredicate = isGFX9Plus +} // End SubtargetPredicate = isGFX9GFX10 let SubtargetPredicate = isGFX10Plus in { let isBarrier = 1, isReturn = 1, simm16 = 0, fixed_imm = 1 in { @@ -1279,15 +1294,21 @@ def S_SLEEP : SOPP_Pseudo <"s_sleep", (ins i32imm:$simm16), let hasSideEffects = 1; } -def S_SETPRIO : SOPP_Pseudo <"s_setprio" , (ins i16imm:$simm16), "$simm16">; +def S_SETPRIO : SOPP_Pseudo <"s_setprio", (ins i16imm:$simm16), "$simm16", + [(int_amdgcn_s_setprio timm:$simm16)]> { + let hasSideEffects = 1; +} let Uses = [EXEC, M0] in { -// FIXME: Should this be mayLoad+mayStore? 
def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsgImm:$simm16), "$simm16", - [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]>; + [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]> { + let hasSideEffects = 1; +} def S_SENDMSGHALT : SOPP_Pseudo <"s_sendmsghalt" , (ins SendMsgImm:$simm16), "$simm16", - [(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]>; + [(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]> { + let hasSideEffects = 1; +} } // End Uses = [EXEC, M0] @@ -1341,7 +1362,7 @@ let SubtargetPredicate = isGFX10Plus in { let fixed_imm = 1; } def S_WAITCNT_DEPCTR : - SOPP_Pseudo <"s_waitcnt_depctr" , (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo <"s_waitcnt_depctr" , (ins DepCtrImm:$simm16), "$simm16">; let hasSideEffects = 0, Uses = [MODE], Defs = [MODE] in { def S_ROUND_MODE : @@ -1355,6 +1376,13 @@ let SubtargetPredicate = isGFX10Plus in { SOPP_Pseudo<"s_ttracedata_imm", (ins s16imm:$simm16), "$simm16">; } // End SubtargetPredicate = isGFX10Plus +let SubtargetPredicate = isGFX11Plus in { + def S_WAIT_EVENT : SOPP_Pseudo<"s_wait_event", (ins s16imm:$simm16), + "$simm16">; + def S_DELAY_ALU : SOPP_Pseudo<"s_delay_alu", (ins DELAY_FLAG:$simm16), + "$simm16">; +} // End SubtargetPredicate = isGFX11Plus + //===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// @@ -1377,7 +1405,7 @@ def : GCNPat < >; def : GCNPat < - (i32 (smax i32:$x, (i32 (ineg i32:$x)))), + (i32 (UniformBinFrag<smax> i32:$x, (i32 (ineg i32:$x)))), (S_ABS_I32 SReg_32:$x) >; @@ -1408,7 +1436,7 @@ def : GCNPat < // REG_SEQUENCE patterns don't support instructions with multiple // outputs. def : GCNPat< - (i64 (zext i16:$src)), + (i64 (UniformUnaryFrag<zext> i16:$src)), (REG_SEQUENCE SReg_64, (i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0, (S_MOV_B32 (i32 0)), sub1) @@ -1421,7 +1449,7 @@ def : GCNPat < >; def : GCNPat< - (i32 (zext i16:$src)), + (i32 (UniformUnaryFrag<zext> i16:$src)), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src) >; @@ -1448,8 +1476,13 @@ def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>; // Target-specific instruction encodings. //===----------------------------------------------------------------------===// +class Select_gfx11<string opName> : SIMCInstr<opName, SIEncodingFamily.GFX11> { + Predicate AssemblerPredicate = isGFX11Only; + string DecoderNamespace = "GFX11"; +} + class Select_gfx10<string opName> : SIMCInstr<opName, SIEncodingFamily.GFX10> { - Predicate AssemblerPredicate = isGFX10Plus; + Predicate AssemblerPredicate = isGFX10Only; string DecoderNamespace = "GFX10"; } @@ -1464,6 +1497,87 @@ class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> { } //===----------------------------------------------------------------------===// +// GFX11. 
+//===----------------------------------------------------------------------===// + +multiclass SOP1_Real_gfx11<bits<8> op> { + def _gfx11 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>, + Select_gfx11<!cast<SOP1_Pseudo>(NAME).Mnemonic>; +} + +multiclass SOP1_Real_Renamed_gfx11<bits<8> op, SOP1_Pseudo backing_pseudo, string real_name> { + def _gfx11 : SOP1_Real<op, backing_pseudo, real_name>, + Select_gfx11<backing_pseudo.Mnemonic>, + MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>; +} + +defm S_MOV_B32 : SOP1_Real_gfx11<0x000>; +defm S_MOV_B64 : SOP1_Real_gfx11<0x001>; +defm S_CMOV_B32 : SOP1_Real_gfx11<0x002>; +defm S_CMOV_B64 : SOP1_Real_gfx11<0x003>; +defm S_BREV_B32 : SOP1_Real_gfx11<0x004>; +defm S_BREV_B64 : SOP1_Real_gfx11<0x005>; +defm S_CTZ_I32_B32 : SOP1_Real_Renamed_gfx11<0x008, S_FF1_I32_B32, "s_ctz_i32_b32">; +defm S_CTZ_I32_B64 : SOP1_Real_Renamed_gfx11<0x009, S_FF1_I32_B64, "s_ctz_i32_b64">; +defm S_CLZ_I32_U32 : SOP1_Real_Renamed_gfx11<0x00a, S_FLBIT_I32_B32, "s_clz_i32_u32">; +defm S_CLZ_I32_U64 : SOP1_Real_Renamed_gfx11<0x00b, S_FLBIT_I32_B64, "s_clz_i32_u64">; +defm S_CLS_I32 : SOP1_Real_Renamed_gfx11<0x00c, S_FLBIT_I32, "s_cls_i32">; +defm S_CLS_I32_I64 : SOP1_Real_Renamed_gfx11<0x00d, S_FLBIT_I32_I64, "s_cls_i32_i64">; +defm S_SEXT_I32_I8 : SOP1_Real_gfx11<0x00e>; +defm S_SEXT_I32_I16 : SOP1_Real_gfx11<0x00f>; +defm S_BITSET0_B32 : SOP1_Real_gfx11<0x010>; +defm S_BITSET0_B64 : SOP1_Real_gfx11<0x011>; +defm S_BITSET1_B32 : SOP1_Real_gfx11<0x012>; +defm S_BITSET1_B64 : SOP1_Real_gfx11<0x013>; +defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx11<0x014>; +defm S_ABS_I32 : SOP1_Real_gfx11<0x015>; +defm S_BCNT0_I32_B32 : SOP1_Real_gfx11<0x016>; +defm S_BCNT0_I32_B64 : SOP1_Real_gfx11<0x017>; +defm S_BCNT1_I32_B32 : SOP1_Real_gfx11<0x018>; +defm S_BCNT1_I32_B64 : SOP1_Real_gfx11<0x019>; +defm S_QUADMASK_B32 : SOP1_Real_gfx11<0x01a>; +defm S_QUADMASK_B64 : SOP1_Real_gfx11<0x01b>; +defm S_WQM_B32 : SOP1_Real_gfx11<0x01c>; +defm S_WQM_B64 : SOP1_Real_gfx11<0x01d>; +defm S_NOT_B32 : SOP1_Real_gfx11<0x01e>; +defm S_NOT_B64 : SOP1_Real_gfx11<0x01f>; +defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx11<0x020>; +defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx11<0x021>; +defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x022>; +defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x023>; +defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x024>; +defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x025>; +defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx11<0x026>; +defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx11<0x027>; +defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x028>; +defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x029>; +defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x02a>; +/*defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x02b>; //same as older arch, handled there*/ +defm S_AND_NOT0_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x02c, S_ANDN1_SAVEEXEC_B32, "s_and_not0_saveexec_b32">; +defm S_AND_NOT0_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x02d, S_ANDN1_SAVEEXEC_B64, "s_and_not0_saveexec_b64">; +defm S_OR_NOT0_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x02e, S_ORN1_SAVEEXEC_B32, "s_or_not0_saveexec_b32">; +defm S_OR_NOT0_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x02f, S_ORN1_SAVEEXEC_B64, "s_or_not0_saveexec_b64">; +defm S_AND_NOT1_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x030, S_ANDN2_SAVEEXEC_B32, "s_and_not1_saveexec_b32">; +defm S_AND_NOT1_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x031, S_ANDN2_SAVEEXEC_B64, "s_and_not1_saveexec_b64">; +defm S_OR_NOT1_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x032, S_ORN2_SAVEEXEC_B32, "s_or_not1_saveexec_b32">; +defm 
S_OR_NOT1_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x033, S_ORN2_SAVEEXEC_B64, "s_or_not1_saveexec_b64">; +defm S_AND_NOT0_WREXEC_B32 : SOP1_Real_Renamed_gfx11<0x034, S_ANDN1_WREXEC_B32, "s_and_not0_wrexec_b32">; +defm S_AND_NOT0_WREXEC_B64 : SOP1_Real_Renamed_gfx11<0x035, S_ANDN1_WREXEC_B64, "s_and_not0_wrexec_b64">; +defm S_AND_NOT1_WREXEC_B32 : SOP1_Real_Renamed_gfx11<0x036, S_ANDN2_WREXEC_B32, "s_and_not1_wrexec_b32">; +defm S_AND_NOT1_WREXEC_B64 : SOP1_Real_Renamed_gfx11<0x037, S_ANDN2_WREXEC_B64, "s_and_not1_wrexec_b64">; +defm S_MOVRELS_B32 : SOP1_Real_gfx11<0x040>; +defm S_MOVRELS_B64 : SOP1_Real_gfx11<0x041>; +defm S_MOVRELD_B32 : SOP1_Real_gfx11<0x042>; +defm S_MOVRELD_B64 : SOP1_Real_gfx11<0x043>; +defm S_MOVRELSD_2_B32 : SOP1_Real_gfx11<0x044>; +defm S_GETPC_B64 : SOP1_Real_gfx11<0x047>; +defm S_SETPC_B64 : SOP1_Real_gfx11<0x048>; +defm S_SWAPPC_B64 : SOP1_Real_gfx11<0x049>; +defm S_RFE_B64 : SOP1_Real_gfx11<0x04a>; +defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11<0x04c>; +defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11<0x04d>; + +//===----------------------------------------------------------------------===// // SOP1 - GFX10. //===----------------------------------------------------------------------===// @@ -1473,6 +1587,9 @@ multiclass SOP1_Real_gfx10<bits<8> op> { Select_gfx10<ps.Mnemonic>; } +multiclass SOP1_Real_gfx10_gfx11<bits<8> op> : + SOP1_Real_gfx10<op>, SOP1_Real_gfx11<op>; + defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>; defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x038>; defm S_ANDN1_WREXEC_B64 : SOP1_Real_gfx10<0x039>; @@ -1493,7 +1610,7 @@ defm S_ANDN2_WREXEC_B32 : SOP1_Real_gfx10<0x047>; defm S_MOVRELSD_2_B32 : SOP1_Real_gfx10<0x049>; //===----------------------------------------------------------------------===// -// SOP1 - GFX6, GFX7. +// SOP1 - GFX6, GFX7, GFX10, GFX11. //===----------------------------------------------------------------------===// @@ -1506,6 +1623,9 @@ multiclass SOP1_Real_gfx6_gfx7<bits<8> op> { multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> : SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>; +multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op> : + SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10_gfx11<op>; + defm S_CBRANCH_JOIN : SOP1_Real_gfx6_gfx7<0x032>; defm S_MOV_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x003>; @@ -1547,7 +1667,7 @@ defm S_ANDN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x027>; defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x028>; defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x029>; defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02a>; -defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02b>; +defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx11<0x02b>; defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02c>; defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02d>; defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02e>; @@ -1557,6 +1677,65 @@ defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x031>; defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>; //===----------------------------------------------------------------------===// +// SOP2 - GFX11. 
+//===----------------------------------------------------------------------===// + +multiclass SOP2_Real_gfx11<bits<7> op> { + def _gfx11 : SOP2_Real<op, !cast<SOP2_Pseudo>(NAME)>, + Select_gfx11<!cast<SOP2_Pseudo>(NAME).Mnemonic>; +} + +multiclass SOP2_Real_Renamed_gfx11<bits<7> op, SOP2_Pseudo backing_pseudo, string real_name> { + def _gfx11 : SOP2_Real<op, backing_pseudo, real_name>, + Select_gfx11<backing_pseudo.Mnemonic>, + MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>; +} + +defm S_ABSDIFF_I32 : SOP2_Real_gfx11<0x006>; +defm S_LSHL_B32 : SOP2_Real_gfx11<0x008>; +defm S_LSHL_B64 : SOP2_Real_gfx11<0x009>; +defm S_LSHR_B32 : SOP2_Real_gfx11<0x00a>; +defm S_LSHR_B64 : SOP2_Real_gfx11<0x00b>; +defm S_ASHR_I32 : SOP2_Real_gfx11<0x00c>; +defm S_ASHR_I64 : SOP2_Real_gfx11<0x00d>; +defm S_LSHL1_ADD_U32 : SOP2_Real_gfx11<0x00e>; +defm S_LSHL2_ADD_U32 : SOP2_Real_gfx11<0x00f>; +defm S_LSHL3_ADD_U32 : SOP2_Real_gfx11<0x010>; +defm S_LSHL4_ADD_U32 : SOP2_Real_gfx11<0x011>; +defm S_MIN_I32 : SOP2_Real_gfx11<0x012>; +defm S_MIN_U32 : SOP2_Real_gfx11<0x013>; +defm S_MAX_I32 : SOP2_Real_gfx11<0x014>; +defm S_MAX_U32 : SOP2_Real_gfx11<0x015>; +defm S_AND_B32 : SOP2_Real_gfx11<0x016>; +defm S_AND_B64 : SOP2_Real_gfx11<0x017>; +defm S_OR_B32 : SOP2_Real_gfx11<0x018>; +defm S_OR_B64 : SOP2_Real_gfx11<0x019>; +defm S_XOR_B32 : SOP2_Real_gfx11<0x01a>; +defm S_XOR_B64 : SOP2_Real_gfx11<0x01b>; +defm S_NAND_B32 : SOP2_Real_gfx11<0x01c>; +defm S_NAND_B64 : SOP2_Real_gfx11<0x01d>; +defm S_NOR_B32 : SOP2_Real_gfx11<0x01e>; +defm S_NOR_B64 : SOP2_Real_gfx11<0x01f>; +defm S_XNOR_B32 : SOP2_Real_gfx11<0x020>; +defm S_XNOR_B64 : SOP2_Real_gfx11<0x021>; +defm S_AND_NOT1_B32 : SOP2_Real_Renamed_gfx11<0x022, S_ANDN2_B32, "s_and_not1_b32">; +defm S_AND_NOT1_B64 : SOP2_Real_Renamed_gfx11<0x023, S_ANDN2_B64, "s_and_not1_b64">; +defm S_OR_NOT1_B32 : SOP2_Real_Renamed_gfx11<0x024, S_ORN2_B32, "s_or_not1_b32">; +defm S_OR_NOT1_B64 : SOP2_Real_Renamed_gfx11<0x025, S_ORN2_B64, "s_or_not1_b64">; +defm S_BFE_U32 : SOP2_Real_gfx11<0x026>; +defm S_BFE_I32 : SOP2_Real_gfx11<0x027>; +defm S_BFE_U64 : SOP2_Real_gfx11<0x028>; +defm S_BFE_I64 : SOP2_Real_gfx11<0x029>; +defm S_BFM_B32 : SOP2_Real_gfx11<0x02a>; +defm S_BFM_B64 : SOP2_Real_gfx11<0x02b>; +defm S_MUL_I32 : SOP2_Real_gfx11<0x02c>; +defm S_MUL_HI_U32 : SOP2_Real_gfx11<0x02d>; +defm S_MUL_HI_I32 : SOP2_Real_gfx11<0x02e>; +defm S_CSELECT_B32 : SOP2_Real_gfx11<0x030>; +defm S_CSELECT_B64 : SOP2_Real_gfx11<0x031>; +defm S_PACK_HL_B32_B16 : SOP2_Real_gfx11<0x035>; + +//===----------------------------------------------------------------------===// // SOP2 - GFX10. 
//===----------------------------------------------------------------------===// @@ -1566,13 +1745,16 @@ multiclass SOP2_Real_gfx10<bits<7> op> { Select_gfx10<ps.Mnemonic>; } +multiclass SOP2_Real_gfx10_gfx11<bits<7> op> : + SOP2_Real_gfx10<op>, SOP2_Real_gfx11<op>; + defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>; defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10<0x02f>; defm S_LSHL3_ADD_U32 : SOP2_Real_gfx10<0x030>; defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10<0x031>; -defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10<0x032>; -defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10<0x033>; -defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10<0x034>; +defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10_gfx11<0x032>; +defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10_gfx11<0x033>; +defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10_gfx11<0x034>; defm S_MUL_HI_U32 : SOP2_Real_gfx10<0x035>; defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>; @@ -1589,14 +1771,17 @@ multiclass SOP2_Real_gfx6_gfx7<bits<7> op> { multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> : SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>; +multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx11<bits<7> op> : + SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10_gfx11<op>; + defm S_CBRANCH_G_FORK : SOP2_Real_gfx6_gfx7<0x02b>; -defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x000>; -defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x001>; -defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x002>; -defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x003>; -defm S_ADDC_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x004>; -defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x005>; +defm S_ADD_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x000>; +defm S_SUB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x001>; +defm S_ADD_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x002>; +defm S_SUB_I32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x003>; +defm S_ADDC_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x004>; +defm S_SUBB_U32 : SOP2_Real_gfx6_gfx7_gfx10_gfx11<0x005>; defm S_MIN_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x006>; defm S_MIN_U32 : SOP2_Real_gfx6_gfx7_gfx10<0x007>; defm S_MAX_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x008>; @@ -1635,6 +1820,31 @@ defm S_BFE_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x02a>; defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>; //===----------------------------------------------------------------------===// +// SOPK - GFX11. +//===----------------------------------------------------------------------===// + +multiclass SOPK_Real32_gfx11<bits<5> op> { + def _gfx11 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>, + Select_gfx11<!cast<SOPK_Pseudo>(NAME).Mnemonic>; +} + +multiclass SOPK_Real64_gfx11<bits<5> op> { + def _gfx11 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>, + Select_gfx11<!cast<SOPK_Pseudo>(NAME).Mnemonic>; +} + +defm S_GETREG_B32 : SOPK_Real32_gfx11<0x011>; +defm S_SETREG_B32 : SOPK_Real32_gfx11<0x012>; +defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx11<0x013>; +defm S_CALL_B64 : SOPK_Real32_gfx11<0x014>; +defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>; +defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx11<0x017>; +defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11<0x018>; +defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11<0x019>; +defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11<0x01a>; +defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>; + +//===----------------------------------------------------------------------===// // SOPK - GFX10. 
//===----------------------------------------------------------------------===// @@ -1650,7 +1860,10 @@ multiclass SOPK_Real64_gfx10<bits<5> op> { Select_gfx10<ps.Mnemonic>; } -defm S_VERSION : SOPK_Real32_gfx10<0x001>; +multiclass SOPK_Real32_gfx10_gfx11<bits<5> op> : + SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>; + +defm S_VERSION : SOPK_Real32_gfx10_gfx11<0x001>; defm S_CALL_B64 : SOPK_Real32_gfx10<0x016>; defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>; defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>; @@ -1681,29 +1894,96 @@ multiclass SOPK_Real32_gfx6_gfx7_gfx10<bits<5> op> : multiclass SOPK_Real64_gfx6_gfx7_gfx10<bits<5> op> : SOPK_Real64_gfx6_gfx7<op>, SOPK_Real64_gfx10<op>; +multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11<bits<5> op> : + SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10_gfx11<op>; + defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>; -defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x000>; -defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x002>; -defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x003>; -defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x004>; -defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x005>; -defm S_CMPK_GE_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x006>; -defm S_CMPK_LT_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x007>; -defm S_CMPK_LE_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x008>; -defm S_CMPK_EQ_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x009>; -defm S_CMPK_LG_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00a>; -defm S_CMPK_GT_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00b>; -defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00c>; -defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00d>; -defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00e>; -defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x00f>; -defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10<0x010>; +defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x000>; +defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x002>; +defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x003>; +defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x004>; +defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x005>; +defm S_CMPK_GE_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x006>; +defm S_CMPK_LT_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x007>; +defm S_CMPK_LE_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x008>; +defm S_CMPK_EQ_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x009>; +defm S_CMPK_LG_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00a>; +defm S_CMPK_GT_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00b>; +defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00c>; +defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00d>; +defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00e>; +defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00f>; +defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x010>; defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x012>; defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x013>; defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>; //===----------------------------------------------------------------------===// +// SOPP - GFX11 +//===----------------------------------------------------------------------===// + +multiclass SOPP_Real_32_gfx11<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> { + def _gfx11 : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), real_name>, + Select_gfx11<!cast<SOPP_Pseudo>(NAME).Mnemonic>, + SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx11">; +} + +multiclass SOPP_Real_64_gfx11<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> { + 
def _gfx11 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), real_name>, + Select_gfx11<!cast<SOPP_Pseudo>(NAME).Mnemonic>, + SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx11">; +} + +multiclass SOPP_Real_32_Renamed_gfx11<bits<7> op, SOPP_Pseudo backing_pseudo, string real_name> { + def _gfx11 : SOPP_Real_32<op, backing_pseudo, real_name # " ">, + Select_gfx11<backing_pseudo.Mnemonic>, + MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>; +} + +multiclass SOPP_Real_With_Relaxation_gfx11<bits<7> op> { + defm "" : SOPP_Real_32_gfx11<op>; + defm _pad_s_nop : SOPP_Real_64_gfx11<op>; +} + +defm S_SETKILL : SOPP_Real_32_gfx11<0x001>; +defm S_SETHALT : SOPP_Real_32_gfx11<0x002>; +defm S_SLEEP : SOPP_Real_32_gfx11<0x003>; +defm S_SET_INST_PREFETCH_DISTANCE : SOPP_Real_32_Renamed_gfx11<0x004, S_INST_PREFETCH, "s_set_inst_prefetch_distance">; +defm S_CLAUSE : SOPP_Real_32_gfx11<0x005>; +defm S_DELAY_ALU : SOPP_Real_32_gfx11<0x007>; +defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx11<0x008>; +defm S_WAITCNT : SOPP_Real_32_gfx11<0x009>; +defm S_WAIT_IDLE : SOPP_Real_32_gfx11<0x00a>; +defm S_WAIT_EVENT : SOPP_Real_32_gfx11<0x00b>; +defm S_TRAP : SOPP_Real_32_gfx11<0x010>; +defm S_ROUND_MODE : SOPP_Real_32_gfx11<0x011>; +defm S_DENORM_MODE : SOPP_Real_32_gfx11<0x012>; +defm S_BRANCH : SOPP_Real_With_Relaxation_gfx11<0x020>; +defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx11<0x021>; +defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx11<0x022>; +defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx11<0x023>; +defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx11<0x024>; +defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx11<0x025>; +defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx11<0x026>; +defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx11<0x027>; +defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx11<0x028>; +defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx11<0x029>; +defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx11<0x02a>; +defm S_ENDPGM : SOPP_Real_32_gfx11<0x030, "s_endpgm">; +defm S_ENDPGM_SAVED : SOPP_Real_32_gfx11<0x031>; +defm S_WAKEUP : SOPP_Real_32_gfx11<0x034>; +defm S_SETPRIO : SOPP_Real_32_gfx11<0x035>; +defm S_SENDMSG : SOPP_Real_32_gfx11<0x036>; +defm S_SENDMSGHALT : SOPP_Real_32_gfx11<0x037>; +defm S_INCPERFLEVEL : SOPP_Real_32_gfx11<0x038>; +defm S_DECPERFLEVEL : SOPP_Real_32_gfx11<0x039>; +defm S_TTRACEDATA : SOPP_Real_32_gfx11<0x03a>; +defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx11<0x03b>; +defm S_ICACHE_INV : SOPP_Real_32_gfx11<0x03c>; +defm S_BARRIER : SOPP_Real_32_gfx11<0x03d>; + +//===----------------------------------------------------------------------===// // SOPP - GFX6, GFX7, GFX8, GFX9, GFX10 //===----------------------------------------------------------------------===// @@ -1737,6 +2017,12 @@ multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<bits<7> op, string real_name = !cast multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<op, real_name>, SOPP_Real_32_gfx10<op, real_name>; +multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> : + SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op, real_name>, SOPP_Real_32_gfx11<op, real_name>; + +multiclass SOPP_Real_32_gfx10_gfx11<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> : + SOPP_Real_32_gfx10<op, real_name>, SOPP_Real_32_gfx11<op, real_name>; + //64 bit encodings, for 
Relaxation multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> { defvar ps = !cast<SOPP_Pseudo>(NAME); @@ -1768,13 +2054,16 @@ multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<bits<7> op, string real_name = !cast multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<op, real_name>, SOPP_Real_64_gfx10<op, real_name>; +multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<bits<7> op, string real_name = !cast<SOPP_Pseudo>(NAME).Mnemonic # " "> : + SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op, real_name>, SOPP_Real_64_gfx11<op, real_name>; + //relaxation for insts with no operands not implemented multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> { defm "" : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op>; defm _pad_s_nop : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op>; } -defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x000>; +defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<0x000>; defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x001, "s_endpgm">; defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>; defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>; @@ -1794,7 +2083,7 @@ defm S_ENDPGM_SAVED : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x01B>; defm S_SET_GPR_IDX_OFF : SOPP_Real_32_gfx8_gfx9<0x01c>; defm S_SET_GPR_IDX_MODE : SOPP_Real_32_gfx8_gfx9<0x01d>; defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx8_gfx9_gfx10<0x01e>; -defm S_CODE_END : SOPP_Real_32_gfx10<0x01f>; +defm S_CODE_END : SOPP_Real_32_gfx10_gfx11<0x01f>; defm S_INST_PREFETCH : SOPP_Real_32_gfx10<0x020>; defm S_CLAUSE : SOPP_Real_32_gfx10<0x021>; defm S_WAIT_IDLE : SOPP_Real_32_gfx10<0x022>; @@ -1818,6 +2107,34 @@ defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_ } //===----------------------------------------------------------------------===// +// SOPC - GFX11 +//===----------------------------------------------------------------------===// + +multiclass SOPC_Real_gfx11<bits<7> op> { + def _gfx11 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>, + Select_gfx11<!cast<SOPC_Pseudo>(NAME).Mnemonic>; +} + +defm S_CMP_EQ_I32 : SOPC_Real_gfx11<0x00>; +defm S_CMP_LG_I32 : SOPC_Real_gfx11<0x01>; +defm S_CMP_GT_I32 : SOPC_Real_gfx11<0x02>; +defm S_CMP_GE_I32 : SOPC_Real_gfx11<0x03>; +defm S_CMP_LT_I32 : SOPC_Real_gfx11<0x04>; +defm S_CMP_LE_I32 : SOPC_Real_gfx11<0x05>; +defm S_CMP_EQ_U32 : SOPC_Real_gfx11<0x06>; +defm S_CMP_LG_U32 : SOPC_Real_gfx11<0x07>; +defm S_CMP_GT_U32 : SOPC_Real_gfx11<0x08>; +defm S_CMP_GE_U32 : SOPC_Real_gfx11<0x09>; +defm S_CMP_LT_U32 : SOPC_Real_gfx11<0x0a>; +defm S_CMP_LE_U32 : SOPC_Real_gfx11<0x0b>; +defm S_BITCMP0_B32 : SOPC_Real_gfx11<0x0c>; +defm S_BITCMP1_B32 : SOPC_Real_gfx11<0x0d>; +defm S_BITCMP0_B64 : SOPC_Real_gfx11<0x0e>; +defm S_BITCMP1_B64 : SOPC_Real_gfx11<0x0f>; +defm S_CMP_EQ_U64 : SOPC_Real_gfx11<0x10>; +defm S_CMP_LG_U64 : SOPC_Real_gfx11<0x11>; + +//===----------------------------------------------------------------------===// // SOPC - GFX6, GFX7, GFX8, GFX9, GFX10 //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 18c348d1cf89..c0fd5bc69325 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -6,33 +6,64 @@ // 
//===----------------------------------------------------------------------===// #include "AMDGPUAsmUtils.h" +#include "AMDGPUBaseInfo.h" #include "SIDefines.h" -#include "llvm/ADT/StringRef.h" - namespace llvm { namespace AMDGPU { + +namespace DepCtr { + +// NOLINTBEGIN +const CustomOperandVal DepCtrInfo[] = { + // Name max dflt offset width constraint + {{"depctr_hold_cnt"}, 1, 1, 7, 1, isGFX10_BEncoding}, + {{"depctr_sa_sdst"}, 1, 1, 0, 1}, + {{"depctr_va_vdst"}, 15, 15, 12, 4}, + {{"depctr_va_sdst"}, 7, 7, 9, 3}, + {{"depctr_va_ssrc"}, 1, 1, 8, 1}, + {{"depctr_va_vcc"}, 1, 1, 1, 1}, + {{"depctr_vm_vsrc"}, 7, 7, 2, 3}, +}; +// NOLINTEND + +const int DEP_CTR_SIZE = + static_cast<int>(sizeof(DepCtrInfo) / sizeof(CustomOperandVal)); + +} // namespace DepCtr + namespace SendMsg { -// This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h. -const char *const IdSymbolic[ID_GAPS_LAST_] = { - nullptr, - "MSG_INTERRUPT", - "MSG_GS", - "MSG_GS_DONE", - "MSG_SAVEWAVE", - "MSG_STALL_WAVE_GEN", - "MSG_HALT_WAVES", - "MSG_ORDERED_PS_DONE", - "MSG_EARLY_PRIM_DEALLOC", - "MSG_GS_ALLOC_REQ", - "MSG_GET_DOORBELL", - "MSG_GET_DDID", - nullptr, - nullptr, - nullptr, - "MSG_SYSMSG" +// Disable lint checking for this block since it makes the table unreadable. +// NOLINTBEGIN +const CustomOperand<const MCSubtargetInfo &> Msg[] = { + {{""}}, + {{"MSG_INTERRUPT"}, ID_INTERRUPT}, + {{"MSG_GS"}, ID_GS_PreGFX11, isNotGFX11Plus}, + {{"MSG_GS_DONE"}, ID_GS_DONE_PreGFX11, isNotGFX11Plus}, + {{"MSG_SAVEWAVE"}, ID_SAVEWAVE, isGFX8_GFX9_GFX10}, + {{"MSG_STALL_WAVE_GEN"}, ID_STALL_WAVE_GEN, isGFX9Plus}, + {{"MSG_HALT_WAVES"}, ID_HALT_WAVES, isGFX9Plus}, + {{"MSG_ORDERED_PS_DONE"}, ID_ORDERED_PS_DONE, isGFX9Plus}, + {{"MSG_EARLY_PRIM_DEALLOC"}, ID_EARLY_PRIM_DEALLOC, isGFX9_GFX10}, + {{"MSG_GS_ALLOC_REQ"}, ID_GS_ALLOC_REQ, isGFX9Plus}, + {{"MSG_GET_DOORBELL"}, ID_GET_DOORBELL, isGFX9_GFX10}, + {{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10}, + {{"MSG_HS_TESSFACTOR"}, ID_HS_TESSFACTOR_GFX11Plus, isGFX11Plus}, + {{"MSG_DEALLOC_VGPRS"}, ID_DEALLOC_VGPRS_GFX11Plus, isGFX11Plus}, + {{""}}, + {{"MSG_SYSMSG"}, ID_SYSMSG}, + {{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus}, + {{"MSG_RTN_GET_DDID"}, ID_RTN_GET_DDID, isGFX11Plus}, + {{"MSG_RTN_GET_TMA"}, ID_RTN_GET_TMA, isGFX11Plus}, + {{"MSG_RTN_GET_REALTIME"}, ID_RTN_GET_REALTIME, isGFX11Plus}, + {{"MSG_RTN_SAVE_WAVE"}, ID_RTN_SAVE_WAVE, isGFX11Plus}, + {{"MSG_RTN_GET_TBA"}, ID_RTN_GET_TBA, isGFX11Plus}, }; +// NOLINTEND + +const int MSG_SIZE = static_cast<int>( + sizeof(Msg) / sizeof(CustomOperand<const MCSubtargetInfo &>)); // These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h. const char *const OpSysSymbolic[OP_SYS_LAST_] = { @@ -54,39 +85,54 @@ const char *const OpGsSymbolic[OP_GS_LAST_] = { namespace Hwreg { -// This must be in sync with llvm::AMDGPU::Hwreg::ID_SYMBOLIC_FIRST_/LAST_, see SIDefines.h. 
-const char* const IdSymbolic[] = { - nullptr, - "HW_REG_MODE", - "HW_REG_STATUS", - "HW_REG_TRAPSTS", - "HW_REG_HW_ID", - "HW_REG_GPR_ALLOC", - "HW_REG_LDS_ALLOC", - "HW_REG_IB_STS", - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - "HW_REG_SH_MEM_BASES", - "HW_REG_TBA_LO", - "HW_REG_TBA_HI", - "HW_REG_TMA_LO", - "HW_REG_TMA_HI", - "HW_REG_FLAT_SCR_LO", - "HW_REG_FLAT_SCR_HI", - "HW_REG_XNACK_MASK", - "HW_REG_HW_ID1", - "HW_REG_HW_ID2", - "HW_REG_POPS_PACKER", - nullptr, - nullptr, - nullptr, - "HW_REG_SHADER_CYCLES" +// Disable lint checking for this block since it makes the table unreadable. +// NOLINTBEGIN +const CustomOperand<const MCSubtargetInfo &> Opr[] = { + {{""}}, + {{"HW_REG_MODE"}, ID_MODE}, + {{"HW_REG_STATUS"}, ID_STATUS}, + {{"HW_REG_TRAPSTS"}, ID_TRAPSTS}, + {{"HW_REG_HW_ID"}, ID_HW_ID, isNotGFX10Plus}, + {{"HW_REG_GPR_ALLOC"}, ID_GPR_ALLOC}, + {{"HW_REG_LDS_ALLOC"}, ID_LDS_ALLOC}, + {{"HW_REG_IB_STS"}, ID_IB_STS}, + {{""}}, + {{""}}, + {{""}}, + {{""}}, + {{""}}, + {{""}}, + {{""}}, + {{"HW_REG_SH_MEM_BASES"}, ID_MEM_BASES, isGFX9Plus}, + {{"HW_REG_TBA_LO"}, ID_TBA_LO, isGFX9_GFX10}, + {{"HW_REG_TBA_HI"}, ID_TBA_HI, isGFX9_GFX10}, + {{"HW_REG_TMA_LO"}, ID_TMA_LO, isGFX9_GFX10}, + {{"HW_REG_TMA_HI"}, ID_TMA_HI, isGFX9_GFX10}, + {{"HW_REG_FLAT_SCR_LO"}, ID_FLAT_SCR_LO, isGFX10Plus}, + {{"HW_REG_FLAT_SCR_HI"}, ID_FLAT_SCR_HI, isGFX10Plus}, + {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK, isGFX10Before1030}, + {{"HW_REG_HW_ID1"}, ID_HW_ID1, isGFX10Plus}, + {{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus}, + {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10}, + {{""}}, + {{""}}, + {{""}}, + {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_BEncoding}, + + // GFX940 specific registers + {{"HW_REG_XCC_ID"}, ID_XCC_ID, isGFX940}, + {{"HW_REG_SQ_PERF_SNAPSHOT_DATA"}, ID_SQ_PERF_SNAPSHOT_DATA, isGFX940}, + {{"HW_REG_SQ_PERF_SNAPSHOT_DATA1"}, ID_SQ_PERF_SNAPSHOT_DATA1, isGFX940}, + {{"HW_REG_SQ_PERF_SNAPSHOT_PC_LO"}, ID_SQ_PERF_SNAPSHOT_PC_LO, isGFX940}, + {{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940}, + + // Aliases + {{"HW_REG_HW_ID"}, ID_HW_ID1, isGFX10}, }; +// NOLINTEND + +const int OPR_SIZE = static_cast<int>( + sizeof(Opr) / sizeof(CustomOperand<const MCSubtargetInfo &>)); } // namespace Hwreg @@ -144,7 +190,7 @@ StringLiteral const NfmtSymbolicVI[] = { // VI and GFX9 "BUF_NUM_FORMAT_FLOAT" }; -StringLiteral const UfmtSymbolic[] = { +StringLiteral const UfmtSymbolicGFX10[] = { "BUF_FMT_INVALID", "BUF_FMT_8_UNORM", @@ -238,7 +284,7 @@ StringLiteral const UfmtSymbolic[] = { "BUF_FMT_32_32_32_32_FLOAT" }; -unsigned const DfmtNfmt2UFmt[] = { +unsigned const DfmtNfmt2UFmtGFX10[] = { DFMT_INVALID | (NFMT_UNORM << NFMT_SHIFT), DFMT_8 | (NFMT_UNORM << NFMT_SHIFT), @@ -332,6 +378,166 @@ unsigned const DfmtNfmt2UFmt[] = { DFMT_32_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT) }; +StringLiteral const UfmtSymbolicGFX11[] = { + "BUF_FMT_INVALID", + + "BUF_FMT_8_UNORM", + "BUF_FMT_8_SNORM", + "BUF_FMT_8_USCALED", + "BUF_FMT_8_SSCALED", + "BUF_FMT_8_UINT", + "BUF_FMT_8_SINT", + + "BUF_FMT_16_UNORM", + "BUF_FMT_16_SNORM", + "BUF_FMT_16_USCALED", + "BUF_FMT_16_SSCALED", + "BUF_FMT_16_UINT", + "BUF_FMT_16_SINT", + "BUF_FMT_16_FLOAT", + + "BUF_FMT_8_8_UNORM", + "BUF_FMT_8_8_SNORM", + "BUF_FMT_8_8_USCALED", + "BUF_FMT_8_8_SSCALED", + "BUF_FMT_8_8_UINT", + "BUF_FMT_8_8_SINT", + + "BUF_FMT_32_UINT", + "BUF_FMT_32_SINT", + "BUF_FMT_32_FLOAT", + + "BUF_FMT_16_16_UNORM", + "BUF_FMT_16_16_SNORM", + "BUF_FMT_16_16_USCALED", + "BUF_FMT_16_16_SSCALED", + 
"BUF_FMT_16_16_UINT", + "BUF_FMT_16_16_SINT", + "BUF_FMT_16_16_FLOAT", + + "BUF_FMT_10_11_11_FLOAT", + + "BUF_FMT_11_11_10_FLOAT", + + "BUF_FMT_10_10_10_2_UNORM", + "BUF_FMT_10_10_10_2_SNORM", + "BUF_FMT_10_10_10_2_UINT", + "BUF_FMT_10_10_10_2_SINT", + + "BUF_FMT_2_10_10_10_UNORM", + "BUF_FMT_2_10_10_10_SNORM", + "BUF_FMT_2_10_10_10_USCALED", + "BUF_FMT_2_10_10_10_SSCALED", + "BUF_FMT_2_10_10_10_UINT", + "BUF_FMT_2_10_10_10_SINT", + + "BUF_FMT_8_8_8_8_UNORM", + "BUF_FMT_8_8_8_8_SNORM", + "BUF_FMT_8_8_8_8_USCALED", + "BUF_FMT_8_8_8_8_SSCALED", + "BUF_FMT_8_8_8_8_UINT", + "BUF_FMT_8_8_8_8_SINT", + + "BUF_FMT_32_32_UINT", + "BUF_FMT_32_32_SINT", + "BUF_FMT_32_32_FLOAT", + + "BUF_FMT_16_16_16_16_UNORM", + "BUF_FMT_16_16_16_16_SNORM", + "BUF_FMT_16_16_16_16_USCALED", + "BUF_FMT_16_16_16_16_SSCALED", + "BUF_FMT_16_16_16_16_UINT", + "BUF_FMT_16_16_16_16_SINT", + "BUF_FMT_16_16_16_16_FLOAT", + + "BUF_FMT_32_32_32_UINT", + "BUF_FMT_32_32_32_SINT", + "BUF_FMT_32_32_32_FLOAT", + "BUF_FMT_32_32_32_32_UINT", + "BUF_FMT_32_32_32_32_SINT", + "BUF_FMT_32_32_32_32_FLOAT" +}; + +unsigned const DfmtNfmt2UFmtGFX11[] = { + DFMT_INVALID | (NFMT_UNORM << NFMT_SHIFT), + + DFMT_8 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_8 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_8 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_8 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_8 | (NFMT_UINT << NFMT_SHIFT), + DFMT_8 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_16 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_16 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_16 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_16 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_16 | (NFMT_UINT << NFMT_SHIFT), + DFMT_16 | (NFMT_SINT << NFMT_SHIFT), + DFMT_16 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_8_8 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_8_8 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_8_8 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_8_8 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_8_8 | (NFMT_UINT << NFMT_SHIFT), + DFMT_8_8 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_16_16 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_16_16 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_16_16 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_16_16 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_16_16 | (NFMT_UINT << NFMT_SHIFT), + DFMT_16_16 | (NFMT_SINT << NFMT_SHIFT), + DFMT_16_16 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_10_11_11 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_11_11_10 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_10_10_10_2 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_UINT << NFMT_SHIFT), + DFMT_10_10_10_2 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_2_10_10_10 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_UINT << NFMT_SHIFT), + DFMT_2_10_10_10 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_8_8_8_8 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_SSCALED << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_UINT << NFMT_SHIFT), + DFMT_8_8_8_8 | (NFMT_SINT << NFMT_SHIFT), + + DFMT_32_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32_32 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_16_16_16_16 | (NFMT_UNORM << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_SNORM << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_USCALED << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_SSCALED << NFMT_SHIFT), + 
DFMT_16_16_16_16 | (NFMT_UINT << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_SINT << NFMT_SHIFT), + DFMT_16_16_16_16 | (NFMT_FLOAT << NFMT_SHIFT), + + DFMT_32_32_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32_32_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT), + DFMT_32_32_32_32 | (NFMT_UINT << NFMT_SHIFT), + DFMT_32_32_32_32 | (NFMT_SINT << NFMT_SHIFT), + DFMT_32_32_32_32 | (NFMT_FLOAT << NFMT_SHIFT) +}; + } // namespace MTBUFFormat namespace Swizzle { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index d1deb570a938..054e35e90f2f 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -11,15 +11,60 @@ #include "SIDefines.h" +#include "llvm/ADT/StringRef.h" + namespace llvm { class StringLiteral; +class MCSubtargetInfo; namespace AMDGPU { +const int OPR_ID_UNKNOWN = -1; +const int OPR_ID_UNSUPPORTED = -2; +const int OPR_ID_DUPLICATE = -3; +const int OPR_VAL_INVALID = -4; + +template <class T> struct CustomOperand { + StringLiteral Name; + int Encoding = 0; + bool (*Cond)(T Context) = nullptr; +}; + +struct CustomOperandVal { + StringLiteral Name; + unsigned Max; + unsigned Default; + unsigned Shift; + unsigned Width; + bool (*Cond)(const MCSubtargetInfo &STI) = nullptr; + unsigned Mask = (1 << Width) - 1; + + unsigned decode(unsigned Code) const { return (Code >> Shift) & Mask; } + + unsigned encode(unsigned Val) const { return (Val & Mask) << Shift; } + + unsigned getMask() const { return Mask << Shift; } + + bool isValid(unsigned Val) const { return Val <= Max; } + + bool isSupported(const MCSubtargetInfo &STI) const { + return !Cond || Cond(STI); + } +}; + +namespace DepCtr { + +extern const CustomOperandVal DepCtrInfo[]; +extern const int DEP_CTR_SIZE; + +} // namespace DepCtr + namespace SendMsg { // Symbolic names for the sendmsg(...) syntax. -extern const char *const IdSymbolic[ID_GAPS_LAST_]; +extern const CustomOperand<const MCSubtargetInfo &> Msg[]; +extern const int MSG_SIZE; + extern const char *const OpSysSymbolic[OP_SYS_LAST_]; extern const char *const OpGsSymbolic[OP_GS_LAST_]; @@ -27,7 +72,8 @@ extern const char *const OpGsSymbolic[OP_GS_LAST_]; namespace Hwreg { // Symbolic names for the hwreg(...) syntax. 
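+// Each entry may carry a subtarget predicate (CustomOperand::Cond); lookups
+// reject names whose predicate fails for the current target.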
-extern const char* const IdSymbolic[]; +extern const CustomOperand<const MCSubtargetInfo &> Opr[]; +extern const int OPR_SIZE; } // namespace Hwreg @@ -37,8 +83,10 @@ extern StringLiteral const DfmtSymbolic[]; extern StringLiteral const NfmtSymbolicGFX10[]; extern StringLiteral const NfmtSymbolicSICI[]; extern StringLiteral const NfmtSymbolicVI[]; -extern StringLiteral const UfmtSymbolic[]; -extern unsigned const DfmtNfmt2UFmt[]; +extern StringLiteral const UfmtSymbolicGFX10[]; +extern StringLiteral const UfmtSymbolicGFX11[]; +extern unsigned const DfmtNfmt2UFmtGFX10[]; +extern unsigned const DfmtNfmt2UFmtGFX11[]; } // namespace MTBUFFormat diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 683be871ff82..e4ab72f1095b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -28,10 +28,15 @@ #define GET_INSTRMAP_INFO #include "AMDGPUGenInstrInfo.inc" -static llvm::cl::opt<unsigned> AmdhsaCodeObjectVersion( - "amdhsa-code-object-version", llvm::cl::Hidden, - llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(4), - llvm::cl::ZeroOrMore); +static llvm::cl::opt<unsigned> + AmdhsaCodeObjectVersion("amdhsa-code-object-version", llvm::cl::Hidden, + llvm::cl::desc("AMDHSA Code Object Version"), + llvm::cl::init(4)); + +// TODO-GFX11: Remove this when full 16-bit codegen is implemented. +static llvm::cl::opt<bool> + LimitTo128VGPRs("amdgpu-limit-to-128-vgprs", llvm::cl::Hidden, + llvm::cl::desc("Never use more than 128 VGPRs")); namespace { @@ -44,9 +49,8 @@ unsigned getBitMask(unsigned Shift, unsigned Width) { /// /// \returns Packed \p Dst. unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) { - Dst &= ~(1 << Shift) & ~getBitMask(Shift, Width); - Dst |= (Src << Shift) & getBitMask(Shift, Width); - return Dst; + unsigned Mask = getBitMask(Shift, Width); + return ((Src << Shift) & Mask) | (Dst & ~Mask); } /// Unpacks bits from \p Src for given bit \p Shift and bit \p Width. @@ -57,30 +61,40 @@ unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) { } /// \returns Vmcnt bit shift (lower bits). -unsigned getVmcntBitShiftLo() { return 0; } +unsigned getVmcntBitShiftLo(unsigned VersionMajor) { + return VersionMajor >= 11 ? 10 : 0; +} /// \returns Vmcnt bit width (lower bits). -unsigned getVmcntBitWidthLo() { return 4; } +unsigned getVmcntBitWidthLo(unsigned VersionMajor) { + return VersionMajor >= 11 ? 6 : 4; +} /// \returns Expcnt bit shift. -unsigned getExpcntBitShift() { return 4; } +unsigned getExpcntBitShift(unsigned VersionMajor) { + return VersionMajor >= 11 ? 0 : 4; +} /// \returns Expcnt bit width. -unsigned getExpcntBitWidth() { return 3; } +unsigned getExpcntBitWidth(unsigned VersionMajor) { return 3; } /// \returns Lgkmcnt bit shift. -unsigned getLgkmcntBitShift() { return 8; } +unsigned getLgkmcntBitShift(unsigned VersionMajor) { + return VersionMajor >= 11 ? 4 : 8; +} /// \returns Lgkmcnt bit width. unsigned getLgkmcntBitWidth(unsigned VersionMajor) { - return (VersionMajor >= 10) ? 6 : 4; + return VersionMajor >= 10 ? 6 : 4; } /// \returns Vmcnt bit shift (higher bits). -unsigned getVmcntBitShiftHi() { return 14; } +unsigned getVmcntBitShiftHi(unsigned VersionMajor) { return 14; } /// \returns Vmcnt bit width (higher bits). -unsigned getVmcntBitWidthHi() { return 2; } +unsigned getVmcntBitWidthHi(unsigned VersionMajor) { + return (VersionMajor == 9 || VersionMajor == 10) ? 
2 : 0; +} } // end namespace anonymous @@ -136,6 +150,41 @@ bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI) { isHsaAbiVersion5(STI); } +unsigned getAmdhsaCodeObjectVersion() { + return AmdhsaCodeObjectVersion; +} + +unsigned getMultigridSyncArgImplicitArgPosition() { + switch (AmdhsaCodeObjectVersion) { + case 2: + case 3: + case 4: + return 48; + case 5: + return AMDGPU::ImplicitArg::MULTIGRID_SYNC_ARG_OFFSET; + default: + llvm_unreachable("Unexpected code object version"); + return 0; + } +} + + +// FIXME: All such magic numbers about the ABI should be in a +// central TD file. +unsigned getHostcallImplicitArgPosition() { + switch (AmdhsaCodeObjectVersion) { + case 2: + case 3: + case 4: + return 24; + case 5: + return AMDGPU::ImplicitArg::HOSTCALL_PTR_OFFSET; + default: + llvm_unreachable("Unexpected code object version"); + return 0; + } +} + #define GET_MIMGBaseOpcodesTable_IMPL #define GET_MIMGDimInfoTable_IMPL #define GET_MIMGInfoTable_IMPL @@ -144,6 +193,7 @@ bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI) { #define GET_MIMGBiasMappingTable_IMPL #define GET_MIMGOffsetMappingTable_IMPL #define GET_MIMGG16MappingTable_IMPL +#define GET_MAIInstInfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -223,6 +273,10 @@ struct VOPInfo { bool IsSingle; }; +struct VOPC64DPPInfo { + uint16_t Opcode; +}; + #define GET_MTBUFInfoTable_DECL #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL @@ -235,6 +289,14 @@ struct VOPInfo { #define GET_VOP2InfoTable_IMPL #define GET_VOP3InfoTable_DECL #define GET_VOP3InfoTable_IMPL +#define GET_VOPC64DPPTable_DECL +#define GET_VOPC64DPPTable_IMPL +#define GET_VOPC64DPP8Table_DECL +#define GET_VOPC64DPP8Table_IMPL +#define GET_WMMAOpcode2AddrMappingTable_DECL +#define GET_WMMAOpcode2AddrMappingTable_IMPL +#define GET_WMMAOpcode3AddrMappingTable_DECL +#define GET_WMMAOpcode3AddrMappingTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMTBUFBaseOpcode(unsigned Opc) { @@ -322,6 +384,30 @@ bool getVOP3IsSingle(unsigned Opc) { return Info ? Info->IsSingle : false; } +bool isVOPC64DPP(unsigned Opc) { + return isVOPC64DPPOpcodeHelper(Opc) || isVOPC64DPP8OpcodeHelper(Opc); +} + +bool getMAIIsDGEMM(unsigned Opc) { + const MAIInstInfo *Info = getMAIInstInfoHelper(Opc); + return Info ? Info->is_dgemm : false; +} + +bool getMAIIsGFX940XDL(unsigned Opc) { + const MAIInstInfo *Info = getMAIInstInfoHelper(Opc); + return Info ? Info->is_gfx940_xdl : false; +} + +unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) { + const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc); + return Info ? Info->Opcode3Addr : ~0u; +} + +unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc) { + const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom3AddrOpcode(Opc); + return Info ? Info->Opcode2Addr : ~0u; +} + // Wrapper for Tablegen'd function. enum Subtarget is not defined in any // header files, so we need to wrap it in a function that takes unsigned // instead. @@ -740,6 +826,15 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) { } unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) { + if (LimitTo128VGPRs.getNumOccurrences() ? LimitTo128VGPRs + : isGFX11Plus(*STI)) { + // GFX11 changes the encoding of 16-bit operands in VOP1/2/C instructions + // such that values 128..255 no longer mean v128..v255, they mean + // v0.hi..v127.hi instead. Until the compiler understands this, it is not + // safe to use v128..v255. 
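+    // (Illustrative example: under the new scheme an encoded operand value of
+    // 200 selects v72.hi, not v200.)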
+ // TODO-GFX11: Remove this when full 16-bit codegen is implemented. + return 128; + } if (STI->getFeatureBits().test(FeatureGFX90AInsts)) return 512; return 256; @@ -904,16 +999,13 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F, } unsigned getVmcntBitMask(const IsaVersion &Version) { - unsigned VmcntLo = (1 << getVmcntBitWidthLo()) - 1; - if (Version.Major < 9) - return VmcntLo; - - unsigned VmcntHi = ((1 << getVmcntBitWidthHi()) - 1) << getVmcntBitWidthLo(); - return VmcntLo | VmcntHi; + return (1 << (getVmcntBitWidthLo(Version.Major) + + getVmcntBitWidthHi(Version.Major))) - + 1; } unsigned getExpcntBitMask(const IsaVersion &Version) { - return (1 << getExpcntBitWidth()) - 1; + return (1 << getExpcntBitWidth(Version.Major)) - 1; } unsigned getLgkmcntBitMask(const IsaVersion &Version) { @@ -921,36 +1013,32 @@ unsigned getLgkmcntBitMask(const IsaVersion &Version) { } unsigned getWaitcntBitMask(const IsaVersion &Version) { - unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo()); - unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth()); - unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), + unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major), + getVmcntBitWidthLo(Version.Major)); + unsigned Expcnt = getBitMask(getExpcntBitShift(Version.Major), + getExpcntBitWidth(Version.Major)); + unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(Version.Major), getLgkmcntBitWidth(Version.Major)); - unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt; - if (Version.Major < 9) - return Waitcnt; - - unsigned VmcntHi = getBitMask(getVmcntBitShiftHi(), getVmcntBitWidthHi()); - return Waitcnt | VmcntHi; + unsigned VmcntHi = getBitMask(getVmcntBitShiftHi(Version.Major), + getVmcntBitWidthHi(Version.Major)); + return VmcntLo | Expcnt | Lgkmcnt | VmcntHi; } unsigned decodeVmcnt(const IsaVersion &Version, unsigned Waitcnt) { - unsigned VmcntLo = - unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo()); - if (Version.Major < 9) - return VmcntLo; - - unsigned VmcntHi = - unpackBits(Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi()); - VmcntHi <<= getVmcntBitWidthLo(); - return VmcntLo | VmcntHi; + unsigned VmcntLo = unpackBits(Waitcnt, getVmcntBitShiftLo(Version.Major), + getVmcntBitWidthLo(Version.Major)); + unsigned VmcntHi = unpackBits(Waitcnt, getVmcntBitShiftHi(Version.Major), + getVmcntBitWidthHi(Version.Major)); + return VmcntLo | VmcntHi << getVmcntBitWidthLo(Version.Major); } unsigned decodeExpcnt(const IsaVersion &Version, unsigned Waitcnt) { - return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth()); + return unpackBits(Waitcnt, getExpcntBitShift(Version.Major), + getExpcntBitWidth(Version.Major)); } unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt) { - return unpackBits(Waitcnt, getLgkmcntBitShift(), + return unpackBits(Waitcnt, getLgkmcntBitShift(Version.Major), getLgkmcntBitWidth(Version.Major)); } @@ -971,24 +1059,23 @@ Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded) { unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Vmcnt) { - Waitcnt = - packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo()); - if (Version.Major < 9) - return Waitcnt; - - Vmcnt >>= getVmcntBitWidthLo(); - return packBits(Vmcnt, Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi()); + Waitcnt = packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(Version.Major), + getVmcntBitWidthLo(Version.Major)); + return packBits(Vmcnt >> getVmcntBitWidthLo(Version.Major), Waitcnt, + 
getVmcntBitShiftHi(Version.Major),
+                  getVmcntBitWidthHi(Version.Major));
 }

 unsigned encodeExpcnt(const IsaVersion &Version, unsigned Waitcnt,
                       unsigned Expcnt) {
-  return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
+  return packBits(Expcnt, Waitcnt, getExpcntBitShift(Version.Major),
+                  getExpcntBitWidth(Version.Major));
 }

 unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
                        unsigned Lgkmcnt) {
-  return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(),
-                  getLgkmcntBitWidth(Version.Major));
+  return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(Version.Major),
+                  getLgkmcntBitWidth(Version.Major));
 }

 unsigned encodeWaitcnt(const IsaVersion &Version,
@@ -1005,43 +1092,184 @@ unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) {
 }

 //===----------------------------------------------------------------------===//
-// hwreg
+// Custom Operands.
+//
+// A table of custom operands shall describe "primary" operand names first,
+// followed by aliases if any. It is not required but recommended to arrange
+// operands so that operand encodings match operand positions in the table.
+// This will make disassembly a bit more efficient.
+// Unused slots in the table shall have an empty name.
+//
 //===----------------------------------------------------------------------===//

-namespace Hwreg {
+template <class T>
+static bool isValidOpr(int Idx, const CustomOperand<T> OpInfo[], int OpInfoSize,
+                       T Context) {
+  return 0 <= Idx && Idx < OpInfoSize && !OpInfo[Idx].Name.empty() &&
+         (!OpInfo[Idx].Cond || OpInfo[Idx].Cond(Context));
+}

-int64_t getHwregId(const StringRef Name) {
-  for (int Id = ID_SYMBOLIC_FIRST_; Id < ID_SYMBOLIC_LAST_; ++Id) {
-    if (IdSymbolic[Id] && Name == IdSymbolic[Id])
-      return Id;
+template <class T>
+static int getOprIdx(std::function<bool(const CustomOperand<T> &)> Test,
+                     const CustomOperand<T> OpInfo[], int OpInfoSize,
+                     T Context) {
+  int InvalidIdx = OPR_ID_UNKNOWN;
+  for (int Idx = 0; Idx < OpInfoSize; ++Idx) {
+    if (Test(OpInfo[Idx])) {
+      if (!OpInfo[Idx].Cond || OpInfo[Idx].Cond(Context))
+        return Idx;
+      InvalidIdx = OPR_ID_UNSUPPORTED;
+    }
   }
-  return ID_UNKNOWN_;
+  return InvalidIdx;
 }

-static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) {
-  if (isSI(STI) || isCI(STI) || isVI(STI))
-    return ID_SYMBOLIC_FIRST_GFX9_;
-  else if (isGFX9(STI))
-    return ID_SYMBOLIC_FIRST_GFX10_;
-  else if (isGFX10(STI) && !isGFX10_BEncoding(STI))
-    return ID_SYMBOLIC_FIRST_GFX1030_;
-  else
-    return ID_SYMBOLIC_LAST_;
+template <class T>
+static int getOprIdx(const StringRef Name, const CustomOperand<T> OpInfo[],
+                     int OpInfoSize, T Context) {
+  auto Test = [=](const CustomOperand<T> &Op) { return Op.Name == Name; };
+  return getOprIdx<T>(Test, OpInfo, OpInfoSize, Context);
 }

-bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) {
-  switch (Id) {
-  case ID_HW_ID:
-    return isSI(STI) || isCI(STI) || isVI(STI) || isGFX9(STI);
-  case ID_HW_ID1:
-  case ID_HW_ID2:
-    return isGFX10Plus(STI);
-  case ID_XNACK_MASK:
-    return isGFX10(STI) && !AMDGPU::isGFX10_BEncoding(STI);
-  default:
-    return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
-           IdSymbolic[Id];
+template <class T>
+static int getOprIdx(int Id, const CustomOperand<T> OpInfo[], int OpInfoSize,
+                     T Context, bool QuickCheck = true) {
+  auto Test = [=](const CustomOperand<T> &Op) {
+    return Op.Encoding == Id && !Op.Name.empty();
+  };
+  // This is an optimization that should work in most cases.
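+  // (For example, in the Hwreg::Opr table HW_REG_MODE has encoding
+  // ID_MODE == 1 and sits at index 1, so the quick check succeeds at once.)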
+ // As a side effect, it may cause selection of an alias + // instead of a primary operand name in case of sparse tables. + if (QuickCheck && isValidOpr<T>(Id, OpInfo, OpInfoSize, Context) && + OpInfo[Id].Encoding == Id) { + return Id; + } + return getOprIdx<T>(Test, OpInfo, OpInfoSize, Context); +} + +//===----------------------------------------------------------------------===// +// Custom Operand Values +//===----------------------------------------------------------------------===// + +static unsigned getDefaultCustomOperandEncoding(const CustomOperandVal *Opr, + int Size, + const MCSubtargetInfo &STI) { + unsigned Enc = 0; + for (int Idx = 0; Idx < Size; ++Idx) { + const auto &Op = Opr[Idx]; + if (Op.isSupported(STI)) + Enc |= Op.encode(Op.Default); + } + return Enc; +} + +static bool isSymbolicCustomOperandEncoding(const CustomOperandVal *Opr, + int Size, unsigned Code, + bool &HasNonDefaultVal, + const MCSubtargetInfo &STI) { + unsigned UsedOprMask = 0; + HasNonDefaultVal = false; + for (int Idx = 0; Idx < Size; ++Idx) { + const auto &Op = Opr[Idx]; + if (!Op.isSupported(STI)) + continue; + UsedOprMask |= Op.getMask(); + unsigned Val = Op.decode(Code); + if (!Op.isValid(Val)) + return false; + HasNonDefaultVal |= (Val != Op.Default); } + return (Code & ~UsedOprMask) == 0; +} + +static bool decodeCustomOperand(const CustomOperandVal *Opr, int Size, + unsigned Code, int &Idx, StringRef &Name, + unsigned &Val, bool &IsDefault, + const MCSubtargetInfo &STI) { + while (Idx < Size) { + const auto &Op = Opr[Idx++]; + if (Op.isSupported(STI)) { + Name = Op.Name; + Val = Op.decode(Code); + IsDefault = (Val == Op.Default); + return true; + } + } + + return false; +} + +static int encodeCustomOperandVal(const CustomOperandVal &Op, + int64_t InputVal) { + if (InputVal < 0 || InputVal > Op.Max) + return OPR_VAL_INVALID; + return Op.encode(InputVal); +} + +static int encodeCustomOperand(const CustomOperandVal *Opr, int Size, + const StringRef Name, int64_t InputVal, + unsigned &UsedOprMask, + const MCSubtargetInfo &STI) { + int InvalidId = OPR_ID_UNKNOWN; + for (int Idx = 0; Idx < Size; ++Idx) { + const auto &Op = Opr[Idx]; + if (Op.Name == Name) { + if (!Op.isSupported(STI)) { + InvalidId = OPR_ID_UNSUPPORTED; + continue; + } + auto OprMask = Op.getMask(); + if (OprMask & UsedOprMask) + return OPR_ID_DUPLICATE; + UsedOprMask |= OprMask; + return encodeCustomOperandVal(Op, InputVal); + } + } + return InvalidId; +} + +//===----------------------------------------------------------------------===// +// DepCtr +//===----------------------------------------------------------------------===// + +namespace DepCtr { + +int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI) { + static int Default = -1; + if (Default == -1) + Default = getDefaultCustomOperandEncoding(DepCtrInfo, DEP_CTR_SIZE, STI); + return Default; +} + +bool isSymbolicDepCtrEncoding(unsigned Code, bool &HasNonDefaultVal, + const MCSubtargetInfo &STI) { + return isSymbolicCustomOperandEncoding(DepCtrInfo, DEP_CTR_SIZE, Code, + HasNonDefaultVal, STI); +} + +bool decodeDepCtr(unsigned Code, int &Id, StringRef &Name, unsigned &Val, + bool &IsDefault, const MCSubtargetInfo &STI) { + return decodeCustomOperand(DepCtrInfo, DEP_CTR_SIZE, Code, Id, Name, Val, + IsDefault, STI); +} + +int encodeDepCtr(const StringRef Name, int64_t Val, unsigned &UsedOprMask, + const MCSubtargetInfo &STI) { + return encodeCustomOperand(DepCtrInfo, DEP_CTR_SIZE, Name, Val, UsedOprMask, + STI); +} + +} // namespace DepCtr + 
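The DepCtr helpers above are meant to be driven field by field by the
assembler and disassembler. A minimal usage sketch follows (illustrative
only: the field name "depctr_va_vdst" and the way defaults are folded in
are assumptions about the tablegen'd DepCtrInfo table, which is not part of
this diff):

  #include "Utils/AMDGPUBaseInfo.h"
  using namespace llvm;

  // Encode a single named depctr field; fields left unset keep their
  // default values from getDefaultDepCtrEncoding().
  static unsigned composeDepCtr(const MCSubtargetInfo &STI) {
    unsigned UsedOprMask = 0;
    // A negative result is one of the OPR_ID_UNKNOWN / OPR_ID_UNSUPPORTED /
    // OPR_ID_DUPLICATE / OPR_VAL_INVALID sentinels declared above.
    int Bits =
        AMDGPU::DepCtr::encodeDepCtr("depctr_va_vdst", 0, UsedOprMask, STI);
    unsigned Defaults = AMDGPU::DepCtr::getDefaultDepCtrEncoding(STI);
    if (Bits < 0)
      return Defaults;
    // Clear the field's bits from the defaults, then install the new value.
    return (Defaults & ~UsedOprMask) | unsigned(Bits);
  }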
+//===----------------------------------------------------------------------===// +// hwreg +//===----------------------------------------------------------------------===// + +namespace Hwreg { + +int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI) { + int Idx = getOprIdx<const MCSubtargetInfo &>(Name, Opr, OPR_SIZE, STI); + return (Idx < 0) ? Idx : Opr[Idx].Encoding; } bool isValidHwreg(int64_t Id) { @@ -1063,7 +1291,8 @@ uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) { } StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) { - return isValidHwreg(Id, STI) ? IdSymbolic[Id] : ""; + int Idx = getOprIdx<const MCSubtargetInfo &>(Id, Opr, OPR_SIZE, STI); + return (Idx < 0) ? "" : Opr[Idx].Name; } void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) { @@ -1087,12 +1316,13 @@ struct ExpTgt { }; static constexpr ExpTgt ExpTgtInfo[] = { - {{"null"}, ET_NULL, ET_NULL_MAX_IDX}, - {{"mrtz"}, ET_MRTZ, ET_MRTZ_MAX_IDX}, - {{"prim"}, ET_PRIM, ET_PRIM_MAX_IDX}, - {{"mrt"}, ET_MRT0, ET_MRT_MAX_IDX}, - {{"pos"}, ET_POS0, ET_POS_MAX_IDX}, - {{"param"}, ET_PARAM0, ET_PARAM_MAX_IDX}, + {{"null"}, ET_NULL, ET_NULL_MAX_IDX}, + {{"mrtz"}, ET_MRTZ, ET_MRTZ_MAX_IDX}, + {{"prim"}, ET_PRIM, ET_PRIM_MAX_IDX}, + {{"mrt"}, ET_MRT0, ET_MRT_MAX_IDX}, + {{"pos"}, ET_POS0, ET_POS_MAX_IDX}, + {{"dual_src_blend"}, ET_DUAL_SRC_BLEND0, ET_DUAL_SRC_BLEND_MAX_IDX}, + {{"param"}, ET_PARAM0, ET_PARAM_MAX_IDX}, }; bool getTgtName(unsigned Id, StringRef &Name, int &Index) { @@ -1130,7 +1360,20 @@ unsigned getTgtId(const StringRef Name) { } bool isSupportedTgtId(unsigned Id, const MCSubtargetInfo &STI) { - return (Id != ET_POS4 && Id != ET_PRIM) || isGFX10Plus(STI); + switch (Id) { + case ET_NULL: + return !isGFX11Plus(STI); + case ET_POS4: + case ET_PRIM: + return isGFX10Plus(STI); + case ET_DUAL_SRC_BLEND0: + case ET_DUAL_SRC_BLEND1: + return isGFX11Plus(STI); + default: + if (Id >= ET_PARAM0 && Id <= ET_PARAM31) + return !isGFX11Plus(STI); + return true; + } } } // namespace Exp @@ -1196,27 +1439,44 @@ void decodeDfmtNfmt(unsigned Format, unsigned &Dfmt, unsigned &Nfmt) { Nfmt = (Format >> NFMT_SHIFT) & NFMT_MASK; } -int64_t getUnifiedFormat(const StringRef Name) { - for (int Id = UFMT_FIRST; Id <= UFMT_LAST; ++Id) { - if (Name == UfmtSymbolic[Id]) - return Id; +int64_t getUnifiedFormat(const StringRef Name, const MCSubtargetInfo &STI) { + if (isGFX11Plus(STI)) { + for (int Id = UfmtGFX11::UFMT_FIRST; Id <= UfmtGFX11::UFMT_LAST; ++Id) { + if (Name == UfmtSymbolicGFX11[Id]) + return Id; + } + } else { + for (int Id = UfmtGFX10::UFMT_FIRST; Id <= UfmtGFX10::UFMT_LAST; ++Id) { + if (Name == UfmtSymbolicGFX10[Id]) + return Id; + } } return UFMT_UNDEF; } -StringRef getUnifiedFormatName(unsigned Id) { - return isValidUnifiedFormat(Id) ? UfmtSymbolic[Id] : ""; +StringRef getUnifiedFormatName(unsigned Id, const MCSubtargetInfo &STI) { + if(isValidUnifiedFormat(Id, STI)) + return isGFX10(STI) ? UfmtSymbolicGFX10[Id] : UfmtSymbolicGFX11[Id]; + return ""; } -bool isValidUnifiedFormat(unsigned Id) { - return Id <= UFMT_LAST; +bool isValidUnifiedFormat(unsigned Id, const MCSubtargetInfo &STI) { + return isGFX10(STI) ? 
Id <= UfmtGFX10::UFMT_LAST : Id <= UfmtGFX11::UFMT_LAST; } -int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt) { +int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt, + const MCSubtargetInfo &STI) { int64_t Fmt = encodeDfmtNfmt(Dfmt, Nfmt); - for (int Id = UFMT_FIRST; Id <= UFMT_LAST; ++Id) { - if (Fmt == DfmtNfmt2UFmt[Id]) - return Id; + if (isGFX11Plus(STI)) { + for (int Id = UfmtGFX11::UFMT_FIRST; Id <= UfmtGFX11::UFMT_LAST; ++Id) { + if (Fmt == DfmtNfmt2UFmtGFX11[Id]) + return Id; + } + } else { + for (int Id = UfmtGFX10::UFMT_FIRST; Id <= UfmtGFX10::UFMT_LAST; ++Id) { + if (Fmt == DfmtNfmt2UFmtGFX10[Id]) + return Id; + } } return UFMT_UNDEF; } @@ -1239,40 +1499,22 @@ unsigned getDefaultFormatEncoding(const MCSubtargetInfo &STI) { namespace SendMsg { -int64_t getMsgId(const StringRef Name) { - for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) { - if (IdSymbolic[i] && Name == IdSymbolic[i]) - return i; - } - return ID_UNKNOWN_; +static uint64_t getMsgIdMask(const MCSubtargetInfo &STI) { + return isGFX11Plus(STI) ? ID_MASK_GFX11Plus_ : ID_MASK_PreGFX11_; } -bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict) { - if (Strict) { - switch (MsgId) { - case ID_SAVEWAVE: - return isVI(STI) || isGFX9Plus(STI); - case ID_STALL_WAVE_GEN: - case ID_HALT_WAVES: - case ID_ORDERED_PS_DONE: - case ID_GS_ALLOC_REQ: - case ID_GET_DOORBELL: - return isGFX9Plus(STI); - case ID_EARLY_PRIM_DEALLOC: - return isGFX9(STI); - case ID_GET_DDID: - return isGFX10Plus(STI); - default: - return 0 <= MsgId && MsgId < ID_GAPS_LAST_ && IdSymbolic[MsgId]; - } - } else { - return 0 <= MsgId && isUInt<ID_WIDTH_>(MsgId); - } +int64_t getMsgId(const StringRef Name, const MCSubtargetInfo &STI) { + int Idx = getOprIdx<const MCSubtargetInfo &>(Name, Msg, MSG_SIZE, STI); + return (Idx < 0) ? Idx : Msg[Idx].Encoding; +} + +bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI) { + return (MsgId & ~(getMsgIdMask(STI))) == 0; } -StringRef getMsgName(int64_t MsgId) { - assert(0 <= MsgId && MsgId < ID_GAPS_LAST_); - return IdSymbolic[MsgId]; +StringRef getMsgName(int64_t MsgId, const MCSubtargetInfo &STI) { + int Idx = getOprIdx<const MCSubtargetInfo &>(MsgId, Msg, MSG_SIZE, STI); + return (Idx < 0) ? "" : Msg[Idx].Name; } int64_t getMsgOpId(int64_t MsgId, const StringRef Name) { @@ -1289,26 +1531,27 @@ int64_t getMsgOpId(int64_t MsgId, const StringRef Name) { bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI, bool Strict) { - assert(isValidMsgId(MsgId, STI, Strict)); + assert(isValidMsgId(MsgId, STI)); if (!Strict) return 0 <= OpId && isUInt<OP_WIDTH_>(OpId); - switch(MsgId) - { - case ID_GS: - return (OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_) && OpId != OP_GS_NOP; - case ID_GS_DONE: - return OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_; - case ID_SYSMSG: + if (MsgId == ID_SYSMSG) return OP_SYS_FIRST_ <= OpId && OpId < OP_SYS_LAST_; - default: - return OpId == OP_NONE_; + if (!isGFX11Plus(STI)) { + switch (MsgId) { + case ID_GS_PreGFX11: + return (OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_) && OpId != OP_GS_NOP; + case ID_GS_DONE_PreGFX11: + return OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_; + } } + return OpId == OP_NONE_; } -StringRef getMsgOpName(int64_t MsgId, int64_t OpId) { - assert(msgRequiresOp(MsgId)); +StringRef getMsgOpName(int64_t MsgId, int64_t OpId, + const MCSubtargetInfo &STI) { + assert(msgRequiresOp(MsgId, STI)); return (MsgId == ID_SYSMSG)? 
OpSysSymbolic[OpId] : OpGsSymbolic[OpId]; } @@ -1319,42 +1562,48 @@ bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, if (!Strict) return 0 <= StreamId && isUInt<STREAM_ID_WIDTH_>(StreamId); - switch(MsgId) - { - case ID_GS: - return STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_; - case ID_GS_DONE: - return (OpId == OP_GS_NOP)? - (StreamId == STREAM_ID_NONE_) : - (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_); - default: - return StreamId == STREAM_ID_NONE_; + if (!isGFX11Plus(STI)) { + switch (MsgId) { + case ID_GS_PreGFX11: + return STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_; + case ID_GS_DONE_PreGFX11: + return (OpId == OP_GS_NOP) ? + (StreamId == STREAM_ID_NONE_) : + (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_); + } } + return StreamId == STREAM_ID_NONE_; } -bool msgRequiresOp(int64_t MsgId) { - return MsgId == ID_GS || MsgId == ID_GS_DONE || MsgId == ID_SYSMSG; +bool msgRequiresOp(int64_t MsgId, const MCSubtargetInfo &STI) { + return MsgId == ID_SYSMSG || + (!isGFX11Plus(STI) && + (MsgId == ID_GS_PreGFX11 || MsgId == ID_GS_DONE_PreGFX11)); } -bool msgSupportsStream(int64_t MsgId, int64_t OpId) { - return (MsgId == ID_GS || MsgId == ID_GS_DONE) && OpId != OP_GS_NOP; +bool msgSupportsStream(int64_t MsgId, int64_t OpId, + const MCSubtargetInfo &STI) { + return !isGFX11Plus(STI) && + (MsgId == ID_GS_PreGFX11 || MsgId == ID_GS_DONE_PreGFX11) && + OpId != OP_GS_NOP; } -void decodeMsg(unsigned Val, - uint16_t &MsgId, - uint16_t &OpId, - uint16_t &StreamId) { - MsgId = Val & ID_MASK_; - OpId = (Val & OP_MASK_) >> OP_SHIFT_; - StreamId = (Val & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_; +void decodeMsg(unsigned Val, uint16_t &MsgId, uint16_t &OpId, + uint16_t &StreamId, const MCSubtargetInfo &STI) { + MsgId = Val & getMsgIdMask(STI); + if (isGFX11Plus(STI)) { + OpId = 0; + StreamId = 0; + } else { + OpId = (Val & OP_MASK_) >> OP_SHIFT_; + StreamId = (Val & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_; + } } uint64_t encodeMsg(uint64_t MsgId, uint64_t OpId, uint64_t StreamId) { - return (MsgId << ID_SHIFT_) | - (OpId << OP_SHIFT_) | - (StreamId << STREAM_ID_SHIFT_); + return MsgId | (OpId << OP_SHIFT_) | (StreamId << STREAM_ID_SHIFT_); } } // namespace SendMsg @@ -1427,6 +1676,10 @@ bool isModuleEntryFunctionCC(CallingConv::ID CC) { } } +bool isKernelCC(const Function *Func) { + return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv()); +} + bool hasXNACK(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureXNACK]; } @@ -1448,7 +1701,8 @@ bool hasG16(const MCSubtargetInfo &STI) { } bool hasPackedD16(const MCSubtargetInfo &STI) { - return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]; + return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem] && !isCI(STI) && + !isSI(STI); } bool isSI(const MCSubtargetInfo &STI) { @@ -1467,6 +1721,18 @@ bool isGFX9(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; } +bool isGFX9_GFX10(const MCSubtargetInfo &STI) { + return isGFX9(STI) || isGFX10(STI); +} + +bool isGFX8_GFX9_GFX10(const MCSubtargetInfo &STI) { + return isVI(STI) || isGFX9(STI) || isGFX10(STI); +} + +bool isGFX8Plus(const MCSubtargetInfo &STI) { + return isVI(STI) || isGFX9Plus(STI); +} + bool isGFX9Plus(const MCSubtargetInfo &STI) { return isGFX9(STI) || isGFX10Plus(STI); } @@ -1475,7 +1741,29 @@ bool isGFX10(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX10]; } -bool isGFX10Plus(const MCSubtargetInfo &STI) { return isGFX10(STI); } +bool 
isGFX10Plus(const MCSubtargetInfo &STI) { + return isGFX10(STI) || isGFX11Plus(STI); +} + +bool isGFX11(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX11]; +} + +bool isGFX11Plus(const MCSubtargetInfo &STI) { + return isGFX11(STI); +} + +bool isNotGFX11Plus(const MCSubtargetInfo &STI) { + return !isGFX11Plus(STI); +} + +bool isNotGFX10Plus(const MCSubtargetInfo &STI) { + return isSI(STI) || isCI(STI) || isVI(STI) || isGFX9(STI); +} + +bool isGFX10Before1030(const MCSubtargetInfo &STI) { + return isGFX10(STI) && !AMDGPU::isGFX10_BEncoding(STI); +} bool isGCN3Encoding(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]; @@ -1497,10 +1785,29 @@ bool isGFX90A(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]; } +bool isGFX940(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX940Insts]; +} + bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch]; } +bool hasMAIInsts(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureMAIInsts]; +} + +bool hasVOPD(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureVOPD]; +} + +int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, + int32_t ArgNumVGPR) { + if (has90AInsts && ArgNumAGPR) + return alignTo(ArgNumVGPR, 4) + ArgNumAGPR; + return std::max(ArgNumVGPR, ArgNumAGPR); +} + bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0); @@ -1508,13 +1815,6 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { Reg == AMDGPU::SCC; } -bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { - for (MCRegAliasIterator R(Reg0, TRI, true); R.isValid(); ++R) { - if (*R == Reg1) return true; - } - return false; -} - #define MAP_REG2REG \ using namespace AMDGPU; \ switch(Reg) { \ @@ -1554,6 +1854,9 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { CASE_VI_GFX9PLUS(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \ CASE_VI_GFX9PLUS(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ CASE_VI_GFX9PLUS(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_GFXPRE11_GFX11PLUS(M0) \ + CASE_GFXPRE11_GFX11PLUS(SGPR_NULL) \ + CASE_GFXPRE11_GFX11PLUS_TO(SGPR_NULL64, SGPR_NULL) \ } #define CASE_CI_VI(node) \ @@ -1563,6 +1866,12 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { #define CASE_VI_GFX9PLUS(node) \ case node: return isGFX9Plus(STI) ? node##_gfx9plus : node##_vi; +#define CASE_GFXPRE11_GFX11PLUS(node) \ + case node: return isGFX11Plus(STI) ? node##_gfx11plus : node##_gfxpre11; + +#define CASE_GFXPRE11_GFX11PLUS_TO(node, result) \ + case node: return isGFX11Plus(STI) ? 
result##_gfx11plus : result##_gfxpre11; + unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { if (STI.getTargetTriple().getArch() == Triple::r600) return Reg; @@ -1571,9 +1880,13 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { #undef CASE_CI_VI #undef CASE_VI_GFX9PLUS +#undef CASE_GFXPRE11_GFX11PLUS +#undef CASE_GFXPRE11_GFX11PLUS_TO #define CASE_CI_VI(node) case node##_ci: case node##_vi: return node; #define CASE_VI_GFX9PLUS(node) case node##_vi: case node##_gfx9plus: return node; +#define CASE_GFXPRE11_GFX11PLUS(node) case node##_gfx11plus: case node##_gfxpre11: return node; +#define CASE_GFXPRE11_GFX11PLUS_TO(node, result) unsigned mc2PseudoReg(unsigned Reg) { MAP_REG2REG @@ -1581,6 +1894,8 @@ unsigned mc2PseudoReg(unsigned Reg) { #undef CASE_CI_VI #undef CASE_VI_GFX9PLUS +#undef CASE_GFXPRE11_GFX11PLUS +#undef CASE_GFXPRE11_GFX11PLUS_TO #undef MAP_REG2REG bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) { @@ -1934,7 +2249,7 @@ Optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, } unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST, bool Signed) { - // Address offset is 12-bit signed for GFX10, 13-bit for GFX9. + // Address offset is 12-bit signed for GFX10, 13-bit for GFX9 and GFX11+. if (AMDGPU::isGFX10(ST)) return Signed ? 12 : 11; @@ -2029,7 +2344,8 @@ const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr); #define GET_SourcesOfDivergence_IMPL #define GET_Gfx9BufferFormat_IMPL -#define GET_Gfx10PlusBufferFormat_IMPL +#define GET_Gfx10BufferFormat_IMPL +#define GET_Gfx11PlusBufferFormat_IMPL #include "AMDGPUGenSearchableTables.inc" } // end anonymous namespace @@ -2042,16 +2358,20 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, uint8_t NumFormat, const MCSubtargetInfo &STI) { - return isGFX10Plus(STI) - ? getGfx10PlusBufferFormatInfo(BitsPerComp, NumComponents, + return isGFX11Plus(STI) + ? getGfx11PlusBufferFormatInfo(BitsPerComp, NumComponents, NumFormat) - : getGfx9BufferFormatInfo(BitsPerComp, NumComponents, NumFormat); + : isGFX10(STI) ? getGfx10BufferFormatInfo(BitsPerComp, + NumComponents, NumFormat) + : getGfx9BufferFormatInfo(BitsPerComp, + NumComponents, NumFormat); } const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, const MCSubtargetInfo &STI) { - return isGFX10Plus(STI) ? getGfx10PlusBufferFormatInfo(Format) - : getGfx9BufferFormatInfo(Format); + return isGFX11Plus(STI) ? getGfx11PlusBufferFormatInfo(Format) + : isGFX10(STI) ? getGfx10BufferFormatInfo(Format) + : getGfx9BufferFormatInfo(Format); } } // namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 4516b511f3c8..dffeec10a14a 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -50,10 +50,19 @@ bool isHsaAbiVersion4(const MCSubtargetInfo *STI); /// \returns True if HSA OS ABI Version identification is 5, /// false otherwise. bool isHsaAbiVersion5(const MCSubtargetInfo *STI); -/// \returns True if HSA OS ABI Version identification is 3 or 4, +/// \returns True if HSA OS ABI Version identification is 3 and above, /// false otherwise. 
bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI); +/// \returns The offset of the multigrid_sync_arg argument from implicitarg_ptr +unsigned getMultigridSyncArgImplicitArgPosition(); + +/// \returns The offset of the hostcall pointer argument from implicitarg_ptr +unsigned getHostcallImplicitArgPosition(); + +/// \returns Code object version. +unsigned getAmdhsaCodeObjectVersion(); + struct GcnBufferFormatInfo { unsigned Format; unsigned BitsPerComp; @@ -62,12 +71,19 @@ struct GcnBufferFormatInfo { unsigned DataFormat; }; +struct MAIInstInfo { + uint16_t Opcode; + bool is_dgemm; + bool is_gfx940_xdl; +}; + #define GET_MIMGBaseOpcode_DECL #define GET_MIMGDim_DECL #define GET_MIMGEncoding_DECL #define GET_MIMGLZMapping_DECL #define GET_MIMGMIPMapping_DECL #define GET_MIMGBiASMapping_DECL +#define GET_MAIInstInfoTable_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { @@ -352,6 +368,11 @@ struct MIMGG16MappingInfo { LLVM_READONLY const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L); +struct WMMAOpcodeMappingInfo { + unsigned Opcode2Addr; + unsigned Opcode3Addr; +}; + LLVM_READONLY const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP); @@ -382,6 +403,7 @@ struct MIMGInfo { uint8_t MIMGEncoding; uint8_t VDataDwords; uint8_t VAddrDwords; + uint8_t VAddrOperands; }; LLVM_READONLY @@ -439,6 +461,16 @@ LLVM_READONLY bool getVOP3IsSingle(unsigned Opc); LLVM_READONLY +bool isVOPC64DPP(unsigned Opc); + +/// Returns true if MAI operation is a double precision GEMM. +LLVM_READONLY +bool getMAIIsDGEMM(unsigned Opc); + +LLVM_READONLY +bool getMAIIsGFX940XDL(unsigned Opc); + +LLVM_READONLY const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, uint8_t NumFormat, @@ -450,6 +482,12 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, LLVM_READONLY int getMCOpcode(uint16_t Opcode, unsigned Gen); +LLVM_READONLY +unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc); + +LLVM_READONLY +unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc); + void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const MCSubtargetInfo *STI); @@ -496,7 +534,7 @@ struct Waitcnt { unsigned LgkmCnt = ~0u; unsigned VsCnt = ~0u; - Waitcnt() {} + Waitcnt() = default; Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt) : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt), VsCnt(VsCnt) {} @@ -555,11 +593,14 @@ unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt); /// \p Lgkmcnt respectively. /// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows: -/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9 only) -/// \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14] (gfx9+ only) -/// \p Expcnt = \p Waitcnt[6:4] -/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10 only) -/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10+ only) +/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9) +/// \p Vmcnt = \p Waitcnt[15:14,3:0] (gfx9,10) +/// \p Vmcnt = \p Waitcnt[15:10] (gfx11+) +/// \p Expcnt = \p Waitcnt[6:4] (pre-gfx11) +/// \p Expcnt = \p Waitcnt[2:0] (gfx11+) +/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10) +/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10) +/// \p Lgkmcnt = \p Waitcnt[9:4] (gfx11+) void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt); @@ -581,12 +622,15 @@ unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt, /// \p Version. 
/// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows: -/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9 only) -/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9+ only) -/// Waitcnt[6:4] = \p Expcnt -/// Waitcnt[11:8] = \p Lgkmcnt (pre-gfx10 only) -/// Waitcnt[13:8] = \p Lgkmcnt (gfx10+ only) -/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9+ only) +/// Waitcnt[2:0] = \p Expcnt (gfx11+) +/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9) +/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9,10) +/// Waitcnt[6:4] = \p Expcnt (pre-gfx11) +/// Waitcnt[9:4] = \p Lgkmcnt (gfx11+) +/// Waitcnt[11:8] = \p Lgkmcnt (pre-gfx10) +/// Waitcnt[13:8] = \p Lgkmcnt (gfx10) +/// Waitcnt[15:10] = \p Vmcnt (gfx11+) +/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9,10) /// /// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given /// isa \p Version. @@ -598,10 +642,7 @@ unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded); namespace Hwreg { LLVM_READONLY -int64_t getHwregId(const StringRef Name); - -LLVM_READNONE -bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI); +int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI); LLVM_READNONE bool isValidHwreg(int64_t Id); @@ -622,6 +663,18 @@ void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width); } // namespace Hwreg +namespace DepCtr { + +int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI); +int encodeDepCtr(const StringRef Name, int64_t Val, unsigned &UsedOprMask, + const MCSubtargetInfo &STI); +bool isSymbolicDepCtrEncoding(unsigned Code, bool &HasNonDefaultVal, + const MCSubtargetInfo &STI); +bool decodeDepCtr(unsigned Code, int &Id, StringRef &Name, unsigned &Val, + bool &IsDefault, const MCSubtargetInfo &STI); + +} // namespace DepCtr + namespace Exp { bool getTgtName(unsigned Id, StringRef &Name, int &Index); @@ -653,13 +706,14 @@ bool isValidDfmtNfmt(unsigned Val, const MCSubtargetInfo &STI); bool isValidNfmt(unsigned Val, const MCSubtargetInfo &STI); -int64_t getUnifiedFormat(const StringRef Name); +int64_t getUnifiedFormat(const StringRef Name, const MCSubtargetInfo &STI); -StringRef getUnifiedFormatName(unsigned Id); +StringRef getUnifiedFormatName(unsigned Id, const MCSubtargetInfo &STI); -bool isValidUnifiedFormat(unsigned Val); +bool isValidUnifiedFormat(unsigned Val, const MCSubtargetInfo &STI); -int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt); +int64_t convertDfmtNfmt2Ufmt(unsigned Dfmt, unsigned Nfmt, + const MCSubtargetInfo &STI); bool isValidFormatEncoding(unsigned Val, const MCSubtargetInfo &STI); @@ -670,19 +724,19 @@ unsigned getDefaultFormatEncoding(const MCSubtargetInfo &STI); namespace SendMsg { LLVM_READONLY -int64_t getMsgId(const StringRef Name); +int64_t getMsgId(const StringRef Name, const MCSubtargetInfo &STI); LLVM_READONLY int64_t getMsgOpId(int64_t MsgId, const StringRef Name); LLVM_READNONE -StringRef getMsgName(int64_t MsgId); +StringRef getMsgName(int64_t MsgId, const MCSubtargetInfo &STI); LLVM_READNONE -StringRef getMsgOpName(int64_t MsgId, int64_t OpId); +StringRef getMsgOpName(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI); LLVM_READNONE -bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI, bool Strict = true); +bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI); LLVM_READNONE bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI, @@ -693,15 +747,13 @@ bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId, const MCSubtargetInfo &STI, bool Strict = true); LLVM_READNONE -bool msgRequiresOp(int64_t MsgId); 
+bool msgRequiresOp(int64_t MsgId, const MCSubtargetInfo &STI); LLVM_READNONE -bool msgSupportsStream(int64_t MsgId, int64_t OpId); +bool msgSupportsStream(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI); -void decodeMsg(unsigned Val, - uint16_t &MsgId, - uint16_t &OpId, - uint16_t &StreamId); +void decodeMsg(unsigned Val, uint16_t &MsgId, uint16_t &OpId, + uint16_t &StreamId, const MCSubtargetInfo &STI); LLVM_READNONE uint64_t encodeMsg(uint64_t MsgId, @@ -738,6 +790,8 @@ bool isEntryFunctionCC(CallingConv::ID CC); LLVM_READNONE bool isModuleEntryFunctionCC(CallingConv::ID CC); +bool isKernelCC(const Function *Func); + // FIXME: Remove this when calling conventions cleaned up LLVM_READNONE inline bool isKernel(CallingConv::ID CC) { @@ -761,22 +815,31 @@ bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); bool isGFX9(const MCSubtargetInfo &STI); +bool isGFX9_GFX10(const MCSubtargetInfo &STI); +bool isGFX8_GFX9_GFX10(const MCSubtargetInfo &STI); +bool isGFX8Plus(const MCSubtargetInfo &STI); bool isGFX9Plus(const MCSubtargetInfo &STI); bool isGFX10(const MCSubtargetInfo &STI); bool isGFX10Plus(const MCSubtargetInfo &STI); +bool isNotGFX10Plus(const MCSubtargetInfo &STI); +bool isGFX10Before1030(const MCSubtargetInfo &STI); +bool isGFX11(const MCSubtargetInfo &STI); +bool isGFX11Plus(const MCSubtargetInfo &STI); +bool isNotGFX11Plus(const MCSubtargetInfo &STI); bool isGCN3Encoding(const MCSubtargetInfo &STI); bool isGFX10_AEncoding(const MCSubtargetInfo &STI); bool isGFX10_BEncoding(const MCSubtargetInfo &STI); bool hasGFX10_3Insts(const MCSubtargetInfo &STI); bool isGFX90A(const MCSubtargetInfo &STI); +bool isGFX940(const MCSubtargetInfo &STI); bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI); +bool hasMAIInsts(const MCSubtargetInfo &STI); +bool hasVOPD(const MCSubtargetInfo &STI); +int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); -/// Is there any intersection between registers -bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI); - /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); @@ -931,7 +994,7 @@ inline bool isLegal64BitDPPControl(unsigned DC) { /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); -// Track defaults for fields in the MODE registser. +// Track defaults for fields in the MODE register. struct SIModeRegisterDefaults { /// Floating point opcodes that support exception flag gathering quiet and /// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10 diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h deleted file mode 100644 index 83ef68cc3f60..000000000000 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h +++ /dev/null @@ -1,38 +0,0 @@ -//===- AMDGPULDSUtils.h - LDS related helper functions -*- C++ -*----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// AMDGPU LDS related helper utility functions. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H -#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/IR/Constants.h" - -namespace llvm { - -class ConstantExpr; - -namespace AMDGPU { - -bool isKernelCC(const Function *Func); - -Align getAlign(DataLayout const &DL, const GlobalVariable *GV); - -std::vector<GlobalVariable *> findVariablesToLower(Module &M, - const Function *F = nullptr); - -/// Replace all uses of constant \p C with instructions in \p F. -void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F); -} // end namespace AMDGPU - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp index a83ff6667956..83d7cbdb183c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp @@ -1,33 +1,32 @@ -//===- AMDGPULDSUtils.cpp -------------------------------------------------===// +//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// AMDGPU LDS related helper utility functions. -// -//===----------------------------------------------------------------------===// -#include "AMDGPULDSUtils.h" +#include "AMDGPUMemoryUtils.h" #include "AMDGPU.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/DepthFirstIterator.h" +#include "AMDGPUBaseInfo.h" #include "llvm/ADT/SetVector.h" -#include "llvm/IR/Constants.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/ReplaceConstant.h" +#define DEBUG_TYPE "amdgpu-memory-utils" + using namespace llvm; namespace llvm { namespace AMDGPU { -bool isKernelCC(const Function *Func) { - return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv()); -} - Align getAlign(DataLayout const &DL, const GlobalVariable *GV) { return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL), GV->getValueType()); @@ -139,6 +138,83 @@ std::vector<GlobalVariable *> findVariablesToLower(Module &M, return LocalVars; } +bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) { + Instruction *DefInst = Def->getMemoryInst(); + + if (isa<FenceInst>(DefInst)) + return false; + + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) { + switch (II->getIntrinsicID()) { + case Intrinsic::amdgcn_s_barrier: + case Intrinsic::amdgcn_wave_barrier: + case Intrinsic::amdgcn_sched_barrier: + return false; + default: + break; + } + } + + // Ignore atomics not aliasing with the original load, any atomic is a + // universal MemoryDef from MSSA's point of view too, just like a fence. 
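+  // (E.g. an atomicrmw whose address AA can prove distinct from Ptr does not
+  // clobber the load we are scanning for.)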
+  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
+    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
+  };
+
+  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
+      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
+    return false;
+
+  return true;
+}
+
+bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
+                           AAResults *AA) {
+  MemorySSAWalker *Walker = MSSA->getWalker();
+  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
+  SmallSet<MemoryAccess *, 8> Visited;
+  MemoryLocation Loc(MemoryLocation::get(Load));
+
+  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
+
+  // Start with the nearest dominating clobbering access: it will be either
+  // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
+  // a MemoryPhi if several MemoryDefs can define this memory state. In that
+  // case add all Defs to the WorkList and continue going up, checking all
+  // the definitions of this memory location until the root. When all the
+  // defs are exhausted and we have reached the entry state, there is no
+  // clobber. Along the scan, ignore barriers and fences, which MemorySSA
+  // considers clobbers even though they do not really write any memory.
+  while (!WorkList.empty()) {
+    MemoryAccess *MA = WorkList.pop_back_val();
+    if (!Visited.insert(MA).second)
+      continue;
+
+    if (MSSA->isLiveOnEntryDef(MA))
+      continue;
+
+    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
+      LLVM_DEBUG(dbgs() << "  Def: " << *Def->getMemoryInst() << '\n');
+
+      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
+        LLVM_DEBUG(dbgs() << "      -> load is clobbered\n");
+        return true;
+      }
+
+      WorkList.push_back(
+          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
+      continue;
+    }
+
+    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
+    for (auto &Use : Phi->incoming_values())
+      WorkList.push_back(cast<MemoryAccess>(&Use));
+  }
+
+  LLVM_DEBUG(dbgs() << "      -> no clobber\n");
+  return false;
+}
+
 } // end namespace AMDGPU
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
new file mode 100644
index 000000000000..65ed02ca62de
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -0,0 +1,51 @@
+//===- AMDGPUMemoryUtils.h - Memory related helper functions -*- C++ -*----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
+
+#include <vector>
+
+namespace llvm {
+
+struct Align;
+class AAResults;
+class ConstantExpr;
+class DataLayout;
+class Function;
+class GlobalVariable;
+class LoadInst;
+class MemoryDef;
+class MemorySSA;
+class Module;
+class Value;
+
+namespace AMDGPU {
+
+Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
+
+std::vector<GlobalVariable *> findVariablesToLower(Module &M,
+                                                   const Function *F = nullptr);
+
+/// Replace all uses of constant \p C with instructions in \p F.
+void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
+
+/// Given a \p Def clobbering a load from \p Ptr according to MemorySSA, check
+/// whether this is actually a memory update, or an artificial clobber that
+/// only facilitates ordering constraints.
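+/// Fences and barrier intrinsics are never real clobbers here; atomics count
+/// only if they may alias \p Ptr.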
+bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA);
+
+/// Check if a \p Load is clobbered in its function.
+bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
+                           AAResults *AA);
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index f6b5975f1934..4ad93f7b0b68 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -209,6 +209,11 @@ void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, unsigned Val) {
   getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val);
 }

+// Set the number of used agprs in the metadata.
+void AMDGPUPALMetadata::setNumUsedAgprs(CallingConv::ID CC, unsigned Val) {
+  getHwStage(CC)[".agpr_count"] = Val;
+}
+
 // Set the number of used sgprs in the metadata. This is an optional advisory
 // record for logging etc; wave dispatch actually uses the rsrc1 register for
 // the shader stage to determine the number of sgprs to allocate.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 7fdd9a8429c1..a45a799e38a9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -69,6 +69,10 @@ public:
   // the shader stage to determine the number of vgprs to allocate.
   void setNumUsedVgprs(unsigned CC, unsigned Val);

+  // Set the number of used agprs in the metadata. This is an optional
+  // advisory record for logging etc.
+  void setNumUsedAgprs(unsigned CC, unsigned Val);
+
   // Set the number of used sgprs in the metadata. This is an optional advisory
   // record for logging etc; wave dispatch actually uses the rsrc1 register for
   // the shader stage to determine the number of sgprs to allocate.
diff --git a/llvm/lib/Target/AMDGPU/VIInstrFormats.td b/llvm/lib/Target/AMDGPU/VIInstrFormats.td
index bd65a495fa72..7393ef6c2a2d 100644
--- a/llvm/lib/Target/AMDGPU/VIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/VIInstrFormats.td
@@ -10,7 +10,7 @@
 //
 //===----------------------------------------------------------------------===//

-class EXPe_vi : EXPe {
+class EXPe_vi : EXPe_ComprVM {
   let Inst{31-26} = 0x31; //encoding
 }

diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
new file mode 100644
index 000000000000..c63fbbc241d9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -0,0 +1,180 @@
+//===-- VINTERPInstructions.td - VINTERP Instruction Definitions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VINTERP encoding +//===----------------------------------------------------------------------===// + +class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : Enc64 { + bits<8> vdst; + bits<4> src0_modifiers; + bits<9> src0; + bits<3> src1_modifiers; + bits<9> src1; + bits<3> src2_modifiers; + bits<9> src2; + bits<1> clamp; + bits<3> waitexp; + + let Inst{31-26} = 0x33; // VOP3P encoding + let Inst{25-24} = 0x1; // VINTERP sub-encoding + let Inst{23} = 0; // reserved + + let Inst{7-0} = vdst; + let Inst{10-8} = waitexp; + let Inst{11} = !if(P.HasOpSel, src0_modifiers{2}, 0); // op_sel(0) + let Inst{12} = !if(P.HasOpSel, src1_modifiers{2}, 0); // op_sel(1) + let Inst{13} = !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2) + let Inst{14} = !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel(3) + let Inst{15} = clamp; + let Inst{22-16} = op; + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{61} = src0_modifiers{0}; // neg(0) + let Inst{62} = src1_modifiers{0}; // neg(1) + let Inst{63} = src2_modifiers{0}; // neg(2) +} + +//===----------------------------------------------------------------------===// +// VOP3 VINTERP +//===----------------------------------------------------------------------===// + +class VINTERP_Pseudo <string OpName, VOPProfile P, list<dag> pattern = []> : + VOP3_Pseudo<OpName, P, pattern, 0, 0> { + let AsmMatchConverter = "cvtVINTERP"; + let mayRaiseFPException = 0; + + let VOP3_OPSEL = 1; + let VINTERP = 1; +} + +class VINTERP_Real <VOP_Pseudo ps, int EncodingFamily> : + VOP3_Real <ps, EncodingFamily> { + let VINTERP = 1; +} + +def VOP3_VINTERP_F32 : VOPProfile<[f32, f32, f32, f32]> { + let HasOpSel = 0; + let HasModifiers = 1; + + let Outs64 = (outs VGPR_32:$vdst); + let Ins64 = (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, + Src1Mod:$src1_modifiers, VRegSrc_32:$src1, + Src2Mod:$src2_modifiers, VRegSrc_32:$src2, + clampmod:$clamp, + wait_exp:$waitexp); + + let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$waitexp"; +} + +class VOP3_VINTERP_F16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> { + let HasOpSel = 1; + let HasModifiers = 1; + + let Outs64 = (outs VGPR_32:$vdst); + let Ins64 = (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, + Src1Mod:$src1_modifiers, VRegSrc_32:$src1, + Src2Mod:$src2_modifiers, VRegSrc_32:$src2, + clampmod:$clamp, op_sel0:$op_sel, + wait_exp:$waitexp); + + let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$op_sel$waitexp"; +} + +//===----------------------------------------------------------------------===// +// VINTERP Pseudo Instructions +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isGFX11Plus in { + +let Uses = [M0, EXEC, MODE] in { +def V_INTERP_P10_F32_inreg : VINTERP_Pseudo <"v_interp_p10_f32", VOP3_VINTERP_F32>; +def V_INTERP_P2_F32_inreg : VINTERP_Pseudo <"v_interp_p2_f32", VOP3_VINTERP_F32>; +def V_INTERP_P10_F16_F32_inreg : + VINTERP_Pseudo <"v_interp_p10_f16_f32", VOP3_VINTERP_F16<[f32, f32, f32, f32]>>; +def V_INTERP_P2_F16_F32_inreg : + VINTERP_Pseudo <"v_interp_p2_f16_f32", VOP3_VINTERP_F16<[f16, f32, f32, f32]>>; +} // Uses = [M0, EXEC, MODE] + +let Uses = [M0, EXEC] in { +def V_INTERP_P10_RTZ_F16_F32_inreg : + VINTERP_Pseudo <"v_interp_p10_rtz_f16_f32", 
VOP3_VINTERP_F16<[f32, f32, f32, f32]>>; +def V_INTERP_P2_RTZ_F16_F32_inreg : + VINTERP_Pseudo <"v_interp_p2_rtz_f16_f32", VOP3_VINTERP_F16<[f16, f32, f32, f32]>>; +} // Uses = [M0, EXEC] + +} // SubtargetPredicate = isGFX11Plus + +class VInterpF32Pat <SDPatternOperator op, Instruction inst> : GCNPat < + (f32 (op + (VINTERPMods f32:$src0, i32:$src0_modifiers), + (VINTERPMods f32:$src1, i32:$src1_modifiers), + (VINTERPMods f32:$src2, i32:$src2_modifiers))), + (inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + 0, /* clamp */ + 7) /* wait_exp */ +>; + +def VINTERP_OPSEL { + int LOW = 0; + int HIGH = 0xa; +} + +class VInterpF16Pat <SDPatternOperator op, Instruction inst, + ValueType dst_type, bit high, + list<ComplexPattern> pat> : GCNPat < + (dst_type (op + (pat[0] f32:$src0, i32:$src0_modifiers), + (pat[1] f32:$src1, i32:$src1_modifiers), + (pat[2] f32:$src2, i32:$src2_modifiers), + !if(high, (i1 -1), (i1 0)))), + (inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + 0, /* clamp */ + /* op_sel = 0 */ + 7) /* wait_exp */ +>; + +multiclass VInterpF16Pat <SDPatternOperator op, Instruction inst, + ValueType dst_type, list<ComplexPattern> high_pat> { + def : VInterpF16Pat<op, inst, dst_type, 0, + [VINTERPMods, VINTERPMods, VINTERPMods]>; + def : VInterpF16Pat<op, inst, dst_type, 1, high_pat>; +} + +def : VInterpF32Pat<int_amdgcn_interp_inreg_p10, V_INTERP_P10_F32_inreg>; +def : VInterpF32Pat<int_amdgcn_interp_inreg_p2, V_INTERP_P2_F32_inreg>; +defm : VInterpF16Pat<int_amdgcn_interp_inreg_p10_f16, + V_INTERP_P10_F16_F32_inreg, f32, + [VINTERPModsHi, VINTERPMods, VINTERPModsHi]>; +defm : VInterpF16Pat<int_amdgcn_interp_inreg_p2_f16, + V_INTERP_P2_F16_F32_inreg, f16, + [VINTERPModsHi, VINTERPMods, VINTERPMods]>; + +//===----------------------------------------------------------------------===// +// VINTERP Real Instructions +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" in { + multiclass VINTERP_Real_gfx11 <bits<7> op> { + def _gfx11 : + VINTERP_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX11>, + VINTERPe_gfx11<op, !cast<VOP3_Pseudo>(NAME).Pfl>; + } +} + +defm V_INTERP_P10_F32_inreg : VINTERP_Real_gfx11<0x000>; +defm V_INTERP_P2_F32_inreg : VINTERP_Real_gfx11<0x001>; +defm V_INTERP_P10_F16_F32_inreg : VINTERP_Real_gfx11<0x002>; +defm V_INTERP_P2_F16_F32_inreg : VINTERP_Real_gfx11<0x003>; +defm V_INTERP_P10_RTZ_F16_F32_inreg : VINTERP_Real_gfx11<0x004>; +defm V_INTERP_P2_RTZ_F16_F32_inreg : VINTERP_Real_gfx11<0x005>; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 48548d8b6722..1d374a9f90ba 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -59,9 +59,9 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On let AsmVariantName = AMDGPUAsmVariants.Default; } -class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> : +class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemonic > : VOP_Real <ps>, - InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, + InstSI <ps.OutOperandList, ps.InOperandList, real_name # ps.AsmOperands, []>, SIMCInstr <ps.PseudoInstr, EncodingFamily> { let VALU = 1; @@ -110,13 +110,18 @@ class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { } multiclass VOP1Inst <string opName, VOPProfile P, - 
SDPatternOperator node = null_frag> { + SDPatternOperator node = null_frag, int VOPDOp = -1> { // We only want to set this on the basic, non-SDWA or DPP forms. - defvar should_mov_imm = !eq(opName, "v_mov_b32"); + defvar should_mov_imm = !or(!eq(opName, "v_mov_b32"), + !eq(opName, "v_mov_b64")); let isMoveImm = should_mov_imm in { - def _e32 : VOP1_Pseudo <opName, P>; - def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>; + if !eq(VOPDOp, -1) then + def _e32 : VOP1_Pseudo <opName, P>; + else + // Only for V_MOV_B32 + def _e32 : VOP1_Pseudo <opName, P>, VOPD_Component<VOPDOp, "v_mov_b32">; + def _e64 : VOP3InstBase <opName, P, node>; } foreach _ = BoolToList<P.HasExtSDWA>.ret in @@ -125,6 +130,11 @@ multiclass VOP1Inst <string opName, VOPProfile P, foreach _ = BoolToList<P.HasExtDPP>.ret in def _dpp : VOP1_DPP_Pseudo <opName, P>; + let SubtargetPredicate = isGFX11Plus in { + foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in + def _e64_dpp : VOP3_DPP_Pseudo <opName, P>; + } // End SubtargetPredicate = isGFX11Plus + def : MnemonicAlias<opName#"_e32", opName>, LetDummies; def : MnemonicAlias<opName#"_e64", opName>, LetDummies; @@ -141,7 +151,9 @@ class VOPProfileI2F<ValueType dstVt, ValueType srcVt> : VOPProfile<[dstVt, srcVt, untyped, untyped]> { let Ins64 = (ins Src0RC64:$src0, clampmod:$clamp, omod:$omod); + let InsVOP3Base = (ins Src0DPP:$src0, clampmod:$clamp, omod:$omod); let Asm64 = "$vdst, $src0$clamp$omod"; + let AsmVOP3DPPBase = Asm64; let HasModifiers = 0; let HasClamp = 1; @@ -151,6 +163,12 @@ def VOP1_F64_I32 : VOPProfileI2F <f64, i32>; def VOP1_F32_I32 : VOPProfileI2F <f32, i32>; def VOP1_F16_I16 : VOPProfileI2F <f16, i16>; +def VOP_NOP_PROFILE : VOPProfile <[untyped, untyped, untyped, untyped]> { + let HasExtVOP3DPP = 0; +} + +// OMod clears exceptions when set. OMod was always an operand, but it's +// now explicitly set.
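+// (For reference, omod is the standard VOP3 output-modifier field: 0 = none, +// 1 = multiply by 2, 2 = multiply by 4, 3 = divide by 2; "set" here +// presumably means any non-zero value.)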
class VOP_SPECIAL_OMOD_PROF<ValueType dstVt, ValueType srcVt> : VOPProfile<[dstVt, srcVt, untyped, untyped]> { @@ -165,11 +183,21 @@ def VOP_I16_F16_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i16, f16>; //===----------------------------------------------------------------------===// let VOPAsmPrefer32Bit = 1 in { -defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>; +defm V_NOP : VOP1Inst <"v_nop", VOP_NOP_PROFILE>; +} + +def VOPProfile_MOV : VOPProfile <[i32, i32, untyped, untyped]> { + let InsVOPDX = (ins Src0RC32:$src0X); + let InsVOPDXDeferred = (ins VSrc_f32_Deferred:$src0X); + let InsVOPDY = (ins Src0RC32:$src0Y); + let InsVOPDYDeferred = (ins VSrc_f32_Deferred:$src0Y); } let isReMaterializable = 1, isAsCheapAsAMove = 1 in { -defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>; +defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOPProfile_MOV, null_frag, 0x8>; + +let SubtargetPredicate = isGFX940Plus in +defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>; } // End isMoveImm = 1 // FIXME: Specify SchedRW for READFIRSTLANE_B32 @@ -282,7 +310,7 @@ defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>; } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; -defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, bitreverse>; +defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, DivergentUnaryFrag<bitreverse>>; defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>; defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>; defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>; @@ -472,7 +500,7 @@ let SubtargetPredicate = isGFX9Only in { } // End SubtargetPredicate = isGFX9Only let SubtargetPredicate = isGFX10Plus in { - defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NONE>; + defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NO_EXT<VOP_NONE>>; let Uses = [M0] in { defm V_MOVRELSD_2_B32 : @@ -498,6 +526,17 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1 let isAsCheapAsAMove = 1; } +let SubtargetPredicate = isGFX11Plus in { + // Restrict src0 to be VGPR + def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS, + getVOP1Pat64<int_amdgcn_permlane64, + VOP_MOVRELS>.ret, + /*VOP1Only=*/ 1>; + defm V_NOT_B16 : VOP1Inst<"v_not_b16", VOP_I16_I16>; + defm V_CVT_I32_I16 : VOP1Inst<"v_cvt_i32_i16", VOP_I32_I16>; + defm V_CVT_U32_U16 : VOP1Inst<"v_cvt_u32_u16", VOP_I32_I16>; +} // End SubtargetPredicate = isGFX11Plus + //===----------------------------------------------------------------------===// // Target-specific instruction encodings. //===----------------------------------------------------------------------===// @@ -517,9 +556,9 @@ class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP1 let Inst{31-25} = 0x3f; } -class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl> : +class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, int subtarget, VOPProfile p = ps.Pfl> : VOP1_DPP<op, ps, p, 1>, - SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10> { + SIMCInstr <ps.PseudoInstr, subtarget> { let AssemblerPredicate = HasDPP16; let SubtargetPredicate = HasDPP16; } @@ -539,10 +578,112 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : } //===----------------------------------------------------------------------===// +// GFX11.
+//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { + multiclass VOP1Only_Real_gfx11<bits<9> op> { + let IsSingle = 1 in + def _gfx11 : + VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.GFX11>, + VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>; + } + multiclass VOP1_Real_e32_gfx11<bits<9> op, string opName = NAME> { + defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); + def _e32_gfx11 : + VOP1_Real<ps, SIEncodingFamily.GFX11>, + VOP1e<op{7-0}, ps.Pfl>; + } + multiclass VOP1_Real_e32_with_name_gfx11<bits<9> op, string opName, + string asmName> { + defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); + let AsmString = asmName # ps.AsmOperands in { + defm NAME : VOP1_Real_e32_gfx11<op, opName>, + MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>; + } + } + multiclass VOP1_Real_e64_gfx11<bits<9> op> { + def _e64_gfx11 : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX11>, + VOP3e_gfx11<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; + } + multiclass VOP1_Real_dpp_gfx11<bits<9> op, string opName = NAME> { + defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); + def _dpp_gfx11 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX11> { + let DecoderNamespace = "DPPGFX11"; + } + } + multiclass VOP1_Real_dpp_with_name_gfx11<bits<9> op, string opName, + string asmName> { + defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); + let AsmString = asmName # ps.Pfl.AsmDPP16, DecoderNamespace = "DPPGFX11" in { + defm NAME : VOP1_Real_dpp_gfx11<op, opName>, + MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>; + } + } + multiclass VOP1_Real_dpp8_gfx11<bits<9> op, string opName = NAME> { + defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); + def _dpp8_gfx11 : VOP1_DPP8<op{7-0}, ps> { + let DecoderNamespace = "DPP8GFX11"; + } + } + multiclass VOP1_Real_dpp8_with_name_gfx11<bits<9> op, string opName, + string asmName> { + defvar ps = !cast<VOP1_Pseudo>(opName#"_e32"); + let AsmString = asmName # ps.Pfl.AsmDPP8, DecoderNamespace = "DPP8GFX11" in { + defm NAME : VOP1_Real_dpp8_gfx11<op, opName>, + MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>; + } + } +} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" + +multiclass VOP1_Realtriple_e64_gfx11<bits<9> op> { + defm NAME : VOP3_Realtriple_gfx11<{0, 1, 1, op{6-0}}, /*isSingle=*/ 0, NAME>; +} +multiclass VOP1_Realtriple_e64_with_name_gfx11<bits<9> op, string opName, + string asmName> { + defm NAME : VOP3_Realtriple_with_name_gfx11<{0, 1, 1, op{6-0}}, opName, + asmName>; +} + +multiclass VOP1_Real_FULL_gfx11<bits<9> op> : + VOP1_Real_e32_gfx11<op>, VOP1_Realtriple_e64_gfx11<op>, + VOP1_Real_dpp_gfx11<op>, VOP1_Real_dpp8_gfx11<op>; + +multiclass VOP1_Real_NO_VOP3_with_name_gfx11<bits<9> op, string opName, + string asmName> : + VOP1_Real_e32_with_name_gfx11<op, opName, asmName>, + VOP1_Real_dpp_with_name_gfx11<op, opName, asmName>, + VOP1_Real_dpp8_with_name_gfx11<op, opName, asmName>; + +multiclass VOP1_Real_FULL_with_name_gfx11<bits<9> op, string opName, + string asmName> : + VOP1_Real_NO_VOP3_with_name_gfx11<op, opName, asmName>, + VOP1_Realtriple_e64_with_name_gfx11<op, opName, asmName>; + +multiclass VOP1_Real_NO_DPP_gfx11<bits<9> op> : + VOP1_Real_e32_gfx11<op>, VOP1_Real_e64_gfx11<op>; + +defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11<0x00c, + "V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">; +defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11<0x00d, + 
"V_CVT_FLR_I32_F32", "v_cvt_floor_i32_f32">; +defm V_CLZ_I32_U32 : VOP1_Real_FULL_with_name_gfx11<0x039, + "V_FFBH_U32", "v_clz_i32_u32">; +defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11<0x03a, + "V_FFBL_B32", "v_ctz_i32_b32">; +defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11<0x03b, + "V_FFBH_I32", "v_cls_i32">; +defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11<0x067>; +defm V_NOT_B16 : VOP1_Real_FULL_gfx11<0x069>; +defm V_CVT_I32_I16 : VOP1_Real_FULL_gfx11<0x06a>; +defm V_CVT_U32_U16 : VOP1_Real_FULL_gfx11<0x06b>; + +//===----------------------------------------------------------------------===// // GFX10. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass VOP1Only_Real_gfx10<bits<9> op> { def _gfx10 : VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.GFX10>, @@ -567,50 +708,59 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } } multiclass VOP1_Real_dpp_gfx10<bits<9> op> { - foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in - def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")> { + foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in + def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX10> { let DecoderNamespace = "SDWA10"; } } multiclass VOP1_Real_dpp8_gfx10<bits<9> op> { - foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> { let DecoderNamespace = "DPP8"; } } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" multiclass VOP1_Real_gfx10<bits<9> op> : VOP1_Real_e32_gfx10<op>, VOP1_Real_e64_gfx10<op>, VOP1_Real_sdwa_gfx10<op>, VOP1_Real_dpp_gfx10<op>, VOP1_Real_dpp8_gfx10<op>; -defm V_PIPEFLUSH : VOP1_Real_gfx10<0x01b>; -defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10<0x048>; -defm V_CVT_F16_U16 : VOP1_Real_gfx10<0x050>; -defm V_CVT_F16_I16 : VOP1_Real_gfx10<0x051>; -defm V_CVT_U16_F16 : VOP1_Real_gfx10<0x052>; -defm V_CVT_I16_F16 : VOP1_Real_gfx10<0x053>; -defm V_RCP_F16 : VOP1_Real_gfx10<0x054>; -defm V_SQRT_F16 : VOP1_Real_gfx10<0x055>; -defm V_RSQ_F16 : VOP1_Real_gfx10<0x056>; -defm V_LOG_F16 : VOP1_Real_gfx10<0x057>; -defm V_EXP_F16 : VOP1_Real_gfx10<0x058>; -defm V_FREXP_MANT_F16 : VOP1_Real_gfx10<0x059>; -defm V_FREXP_EXP_I16_F16 : VOP1_Real_gfx10<0x05a>; -defm V_FLOOR_F16 : VOP1_Real_gfx10<0x05b>; -defm V_CEIL_F16 : VOP1_Real_gfx10<0x05c>; -defm V_TRUNC_F16 : VOP1_Real_gfx10<0x05d>; -defm V_RNDNE_F16 : VOP1_Real_gfx10<0x05e>; -defm V_FRACT_F16 : VOP1_Real_gfx10<0x05f>; -defm V_SIN_F16 : VOP1_Real_gfx10<0x060>; -defm V_COS_F16 : VOP1_Real_gfx10<0x061>; -defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10<0x062>; -defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10<0x063>; -defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10<0x064>; +multiclass VOP1_Real_gfx10_FULL_gfx11<bits<9> op> : + VOP1_Real_gfx10<op>, VOP1_Real_FULL_gfx11<op>; + +multiclass VOP1_Real_gfx10_NO_DPP_gfx11<bits<9> op> : + VOP1_Real_gfx10<op>, VOP1_Real_NO_DPP_gfx11<op>; -defm V_SWAP_B32 : VOP1Only_Real_gfx10<0x065>; -defm V_SWAPREL_B32 : VOP1Only_Real_gfx10<0x068>; +multiclass VOP1Only_Real_gfx10_gfx11<bits<9> op> : + VOP1Only_Real_gfx10<op>, 
VOP1Only_Real_gfx11<op>; + +defm V_PIPEFLUSH : VOP1_Real_gfx10_NO_DPP_gfx11<0x01b>; +defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10_FULL_gfx11<0x048>; +defm V_CVT_F16_U16 : VOP1_Real_gfx10_FULL_gfx11<0x050>; +defm V_CVT_F16_I16 : VOP1_Real_gfx10_FULL_gfx11<0x051>; +defm V_CVT_U16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x052>; +defm V_CVT_I16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x053>; +defm V_RCP_F16 : VOP1_Real_gfx10_FULL_gfx11<0x054>; +defm V_SQRT_F16 : VOP1_Real_gfx10_FULL_gfx11<0x055>; +defm V_RSQ_F16 : VOP1_Real_gfx10_FULL_gfx11<0x056>; +defm V_LOG_F16 : VOP1_Real_gfx10_FULL_gfx11<0x057>; +defm V_EXP_F16 : VOP1_Real_gfx10_FULL_gfx11<0x058>; +defm V_FREXP_MANT_F16 : VOP1_Real_gfx10_FULL_gfx11<0x059>; +defm V_FREXP_EXP_I16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05a>; +defm V_FLOOR_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05b>; +defm V_CEIL_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05c>; +defm V_TRUNC_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05d>; +defm V_RNDNE_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05e>; +defm V_FRACT_F16 : VOP1_Real_gfx10_FULL_gfx11<0x05f>; +defm V_SIN_F16 : VOP1_Real_gfx10_FULL_gfx11<0x060>; +defm V_COS_F16 : VOP1_Real_gfx10_FULL_gfx11<0x061>; +defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10_FULL_gfx11<0x062>; +defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x063>; +defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10_FULL_gfx11<0x064>; + +defm V_SWAP_B32 : VOP1Only_Real_gfx10_gfx11<0x065>; +defm V_SWAPREL_B32 : VOP1Only_Real_gfx10_gfx11<0x068>; //===----------------------------------------------------------------------===// // GFX7, GFX10. @@ -635,16 +785,19 @@ multiclass VOP1_Real_gfx7<bits<9> op> : multiclass VOP1_Real_gfx7_gfx10<bits<9> op> : VOP1_Real_gfx7<op>, VOP1_Real_gfx10<op>; +multiclass VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<bits<9> op> : + VOP1_Real_gfx7_gfx10<op>, VOP1_Real_NO_DPP_gfx11<op>; + defm V_LOG_LEGACY_F32 : VOP1_Real_gfx7<0x045>; defm V_EXP_LEGACY_F32 : VOP1_Real_gfx7<0x046>; -defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10<0x017>; -defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10<0x018>; -defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10<0x019>; -defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10<0x01a>; +defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x017>; +defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x018>; +defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x019>; +defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x01a>; //===----------------------------------------------------------------------===// -// GFX6, GFX7, GFX10. +// GFX6, GFX7, GFX10, GFX11. 
//===----------------------------------------------------------------------===// let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { @@ -666,65 +819,71 @@ multiclass VOP1_Real_gfx6_gfx7<bits<9> op> : multiclass VOP1_Real_gfx6_gfx7_gfx10<bits<9> op> : VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10<op>; -defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>; -defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>; -defm V_RCP_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x029>; -defm V_RSQ_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x02c>; -defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>; -defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>; -defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>; +multiclass VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<bits<9> op> : + VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_FULL_gfx11<op>; -defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10<0x000>; -defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x001>; -defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x003>; -defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x004>; -defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x005>; -defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x006>; -defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x007>; -defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x008>; -defm V_CVT_F16_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00a>; -defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10<0x00b>; +multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<bits<9> op> : + VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_NO_DPP_gfx11<op>; + +defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>; +defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>; +defm V_RCP_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x029>; +defm V_RSQ_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x02c>; +defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>; +defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>; +defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>; + +defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x000>; +defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x001>; +defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x003>; +defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x004>; +defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x005>; +defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x006>; +defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x007>; +defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x008>; +defm V_CVT_F16_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x00a>; +defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x00b>; defm V_CVT_RPI_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00c>; defm V_CVT_FLR_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00d>; -defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10<0x00e>; -defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x00f>; -defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x010>; -defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10<0x011>; -defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10<0x012>; -defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10<0x013>; -defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10<0x014>; -defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x015>; -defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x016>; -defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x020>; -defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x021>; -defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x022>; -defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x023>; -defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x024>; -defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x025>; -defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x027>; -defm V_RCP_F32 : 
VOP1_Real_gfx6_gfx7_gfx10<0x02a>; -defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02b>; -defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x02e>; -defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x02f>; -defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x031>; -defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x033>; -defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x034>; -defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x035>; -defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x036>; -defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x037>; -defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x038>; +defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x00e>; +defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x00f>; +defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x010>; +defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x011>; +defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x012>; +defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x013>; +defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x014>; +defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x015>; +defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x016>; +defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x020>; +defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x021>; +defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x022>; +defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x023>; +defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x024>; +defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x025>; +defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x027>; +defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02a>; +defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02b>; +defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02e>; +defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x02f>; +defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x031>; +defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x033>; +defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x034>; +defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x035>; +defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x036>; +defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x037>; +defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x038>; defm V_FFBH_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x039>; defm V_FFBL_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x03a>; defm V_FFBH_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x03b>; -defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03c>; -defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03d>; -defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10<0x03e>; -defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x03f>; -defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x040>; +defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03c>; +defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03d>; +defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03e>; +defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x03f>; +defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x040>; defm V_CLREXCP : VOP1_Real_gfx6_gfx7_gfx10<0x041>; -defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x042>; -defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x043>; -defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x044>; +defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x042>; +defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x043>; +defm V_MOVRELSD_B32 : 
VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x044>; //===----------------------------------------------------------------------===// // GFX8, GFX9 (VI). @@ -949,14 +1108,29 @@ multiclass VOP1_Real_gfx9 <bits<10> op> { defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; +let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in +defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>; + //===----------------------------------------------------------------------===// // GFX10 //===----------------------------------------------------------------------===// -let OtherPredicates = [isGFX10Plus] in { +let OtherPredicates = [isGFX10Only] in { def : GCNPat < (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)), (V_MOV_B32_dpp8_gfx10 VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp8), (i32 DPP8Mode.FI_0)) >; -} // End OtherPredicates = [isGFX10Plus] +} // End OtherPredicates = [isGFX10Only] + +//===----------------------------------------------------------------------===// +// GFX11 +//===----------------------------------------------------------------------===// + +let OtherPredicates = [isGFX11Only] in { +def : GCNPat < + (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)), + (V_MOV_B32_dpp8_gfx11 VGPR_32:$src, VGPR_32:$src, + (as_i32timm $dpp8), (i32 DPP8Mode.FI_0)) +>; +} // End OtherPredicates = [isGFX11Only] diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index b9ff814a4dc5..1485a1e63129 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -80,9 +80,9 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf let AsmVariantName = AMDGPUAsmVariants.Default; } -class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> : +class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemonic> : VOP_Real <ps>, - InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, + InstSI <ps.OutOperandList, ps.InOperandList, real_name # ps.AsmOperands, []>, SIMCInstr <ps.PseudoInstr, EncodingFamily> { let VALU = 1; @@ -140,15 +140,26 @@ multiclass VOP2Inst_e32<string opName, Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; } // End renamedInGFX9 = GFX9Renamed } - +multiclass + VOP2Inst_e32_VOPD<string opName, VOPProfile P, bits<5> VOPDOp, + string VOPDName, SDPatternOperator node = null_frag, + string revOp = opName, bit GFX9Renamed = 0> { + defm NAME : VOP2Inst_e32<opName, P, node, revOp, GFX9Renamed>, + VOPD_Component<VOPDOp, VOPDName>; +} multiclass VOP2Inst_e64<string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName, bit GFX9Renamed = 0> { let renamedInGFX9 = GFX9Renamed in { - def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, + def _e64 : VOP3InstBase <opName, P, node, 1>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; + + let SubtargetPredicate = isGFX11Plus in { + foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in + def _e64_dpp : VOP3_DPP_Pseudo <opName, P>; + } // End SubtargetPredicate = isGFX11Plus } // End renamedInGFX9 = GFX9Renamed } @@ -175,6 +186,22 @@ multiclass VOP2Inst<string opName, } } +multiclass VOP2Inst_VOPD<string opName, + VOPProfile P, + bits<5> VOPDOp, + string VOPDName, + SDPatternOperator node = null_frag, + string revOp = opName, + bit GFX9Renamed = 0> : + VOP2Inst_e32_VOPD<opName, P, VOPDOp, VOPDName, node, revOp, GFX9Renamed>, + VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>, + VOP2Inst_sdwa<opName, P, GFX9Renamed> { + let renamedInGFX9 = GFX9Renamed in { + foreach _ = 
BoolToList<P.HasExtDPP>.ret in + def _dpp : VOP2_DPP_Pseudo <opName, P>; + } +} + multiclass VOP2bInst <string opName, VOPProfile P, SDPatternOperator node = null_frag, @@ -195,10 +222,15 @@ multiclass VOP2bInst <string opName, } foreach _ = BoolToList<P.HasExtDPP>.ret in def _dpp : VOP2_DPP_Pseudo <opName, P>; - } + } // End Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] - def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, + def _e64 : VOP3InstBase <opName, P, node, 1>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; + + let SubtargetPredicate = isGFX11Plus in { + foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in + def _e64_dpp : VOP3_DPP_Pseudo <opName, P>; + } // End SubtargetPredicate = isGFX11Plus } } } @@ -220,16 +252,19 @@ multiclass VOP2bInstAliases<VOP2_Pseudo ps, VOP2_Real inst, string OpName> { } } -multiclass VOP2eInst <string opName, - VOPProfile P, - SDPatternOperator node = null_frag, - string revOp = opName, - bit useSGPRInput = !eq(P.NumSrcArgs, 3)> { +multiclass + VOP2eInst_Base<string opName, VOPProfile P, bits<5> VOPDOp, string VOPDName, + SDPatternOperator node, string revOp, bit useSGPRInput> { let SchedRW = [Write32Bit] in { let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in { - def _e32 : VOP2_Pseudo <opName, P>, - Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; + if !eq(VOPDOp, -1) then + def _e32 : VOP2_Pseudo <opName, P>, + Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; + else + def _e32 : VOP2_Pseudo <opName, P>, + Commutable_REV<revOp#"_e32", !eq(revOp, opName)>, + VOPD_Component<VOPDOp, VOPDName>; foreach _ = BoolToList<P.HasExtSDWA>.ret in def _sdwa : VOP2_SDWA_Pseudo <opName, P> { @@ -240,13 +275,29 @@ multiclass VOP2eInst <string opName, def _dpp : VOP2_DPP_Pseudo <opName, P>; } - def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, + def _e64 : VOP3InstBase <opName, P, node, 1>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)> { let isReMaterializable = 1; } + + let SubtargetPredicate = isGFX11Plus in { + foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in + def _e64_dpp : VOP3_DPP_Pseudo <opName, P>; + } // End SubtargetPredicate = isGFX11Plus } } +multiclass + VOP2eInst<string opName, VOPProfile P, SDPatternOperator node = null_frag, + string revOp = opName, bit useSGPRInput = !eq(P.NumSrcArgs, 3)> + : VOP2eInst_Base<opName, P, -1, "", node, revOp, useSGPRInput>; + +multiclass + VOP2eInst_VOPD<string opName, VOPProfile P, bits<5> VOPDOp, string VOPDName, + SDPatternOperator node = null_frag, string revOp = opName, + bit useSGPRInput = !eq(P.NumSrcArgs, 3)> + : VOP2eInst_Base<opName, P, VOPDOp, VOPDName, node, revOp, useSGPRInput>; + class VOP2eInstAlias <VOP2_Pseudo ps, Instruction inst, string opnd = ""> : InstAlias <ps.OpName#" "#ps.Pfl.Asm32#", "#opnd, (inst ps.Pfl.DstRC:$vdst, ps.Pfl.Src0RC32:$src0, @@ -267,12 +318,24 @@ multiclass VOP2eInstAliases<VOP2_Pseudo ps, VOP2_Real inst> { } } -class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { +class VOP_MADK_Base<ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { + string AsmVOPDXDeferred = ?; +} + +class VOP_MADAK <ValueType vt> : VOP_MADK_Base<vt> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = !if(!eq(vt.Size, 32), (ins VSrc_f32_Deferred:$src0, VGPR_32:$src1, ImmOpType:$imm), (ins VSrc_f16_Deferred:$src0, VGPR_32:$src1, ImmOpType:$imm)); + field dag InsVOPDX = (ins VSrc_f32_Deferred:$src0X, VGPR_32:$vsrc1X, ImmOpType:$imm); + // Note that both src0X and imm are deferred + let InsVOPDXDeferred = (ins 
VSrc_f32_Deferred:$src0X, VGPR_32:$vsrc1X, ImmOpType:$immDeferred); + field dag InsVOPDY = (ins VSrc_f32_Deferred:$src0Y, VGPR_32:$vsrc1Y, ImmOpType:$imm); + field string Asm32 = "$vdst, $src0, $src1, $imm"; + field string AsmVOPDX = "$vdstX, $src0X, $vsrc1X, $imm"; + let AsmVOPDXDeferred = "$vdstX, $src0X, $vsrc1X, $immDeferred"; + field string AsmVOPDY = "$vdstY, $src0Y, $vsrc1Y, $imm"; field bit HasExt = 0; let IsSingle = 1; } @@ -280,10 +343,17 @@ class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { def VOP_MADAK_F16 : VOP_MADAK <f16>; def VOP_MADAK_F32 : VOP_MADAK <f32>; -class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { +class VOP_MADMK <ValueType vt> : VOP_MADK_Base<vt> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = (ins VSrc_f32_Deferred:$src0, ImmOpType:$imm, VGPR_32:$src1); + field dag InsVOPDX = (ins VSrc_f32_Deferred:$src0X, ImmOpType:$imm, VGPR_32:$vsrc1X); + let InsVOPDXDeferred = (ins VSrc_f32_Deferred:$src0X, ImmOpType:$immDeferred, VGPR_32:$vsrc1X); + field dag InsVOPDY = (ins VSrc_f32_Deferred:$src0Y, ImmOpType:$imm, VGPR_32:$vsrc1Y); + field string Asm32 = "$vdst, $src0, $imm, $src1"; + field string AsmVOPDX = "$vdstX, $src0X, $imm, $vsrc1X"; + let AsmVOPDXDeferred = "$vdstX, $src0X, $immDeferred, $vsrc1X"; + field string AsmVOPDY = "$vdstY, $src0Y, $imm, $vsrc1Y"; field bit HasExt = 0; let IsSingle = 1; } @@ -308,6 +378,10 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + let InsVOP3Base = getIns64<Src0VOP3DPP, Src1RC64, RegisterOperand<VGPR_32>, 3, + 0, HasModifiers, HasModifiers, HasOMod, + Src0Mod, Src1Mod, Src2Mod>.ret; + let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, getVregSrcForVT<Src2VT>.ret:$src2, // stub argument @@ -330,6 +404,7 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v let HasExt = 1; let HasExtDPP = 1; + let HasExt32BitDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 0; let TieRegDPP = "$src2"; @@ -337,9 +412,9 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v def VOP_MAC_F16 : VOP_MAC <f16>; def VOP_MAC_F32 : VOP_MAC <f32>; -let HasExtDPP = 0 in +let HasExtDPP = 0, HasExt32BitDPP = 0 in def VOP_MAC_LEGACY_F32 : VOP_MAC <f32>; -let HasExtSDWA = 0, HasExt64BitDPP = 1 in +let HasExtSDWA = 0, HasExt32BitDPP = 0, HasExt64BitDPP = 1 in def VOP_MAC_F64 : VOP_MAC <f64>; class VOP_DOT_ACC<ValueType vt0, ValueType vt1> : VOP_MAC<vt0, vt1> { @@ -355,6 +430,7 @@ def VOP_DOT_ACC_F32_V2F16 : VOP_DOT_ACC<f32, v2f16> { } def VOP_DOT_ACC_I32_I32 : VOP_DOT_ACC<i32, i32> { + let HasExtVOP3DPP = 0; let HasSrc0Mods = 1; let HasSrc1Mods = 1; } @@ -368,13 +444,27 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], 0, /*EnableClamp let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let AsmDPP8 = "$vdst, vcc, $src0, $src1 $dpp8$fi"; let AsmDPP16 = AsmDPP#"$fi"; + let AsmVOP3DPPBase = Asm64; + let InsDPP = (ins DstRCDPP:$old, + Src0DPP:$src0, + Src1DPP:$src1, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + let InsDPP8 = (ins DstRCDPP:$old, + Src0DPP:$src0, + Src1DPP:$src1, + dpp8:$dpp8, FI:$fi); let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); + let 
OutsVOP3DPP = Outs64; + let OutsVOP3DPP8 = Outs64; } // Write out to vcc or arbitrary SGPR and read in from vcc or // arbitrary SGPR. def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*/1> { + let HasSrc2Mods = 0; let Asm32 = "$vdst, vcc, $src0, $src1, vcc"; let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; @@ -384,6 +474,9 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=* let AsmDPP16 = AsmDPP#"$fi"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); + let AsmVOP3DPPBase = Asm64; + let OutsVOP3DPP = Outs64; + let OutsVOP3DPP8 = Outs64; // Suppress src2 implied by type since the 32-bit encoding uses an // implicit VCC use. @@ -401,15 +494,20 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=* dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + let InsDPP8 = (ins DstRCDPP:$old, + Src0DPP:$src0, + Src1DPP:$src1, + dpp8:$dpp8, FI:$fi); let HasExt = 1; let HasExtDPP = 1; + let HasExt32BitDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 1; } // Read in from vcc or arbitrary SGPR. -def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/1> { +class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT, /*EnableF32SrcMods=*/1> { let Asm32 = "$vdst, $src0, $src1"; let Asm64 = "$vdst, $src0_modifiers, $src1_modifiers, $src2"; let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; @@ -417,6 +515,7 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/ let AsmDPP = "$vdst, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let AsmDPP8 = "$vdst, $src0, $src1, vcc $dpp8$fi"; let AsmDPP16 = AsmDPP#"$fi"; + let AsmVOP3DPPBase = Asm64; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst); @@ -437,14 +536,22 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableF32SrcMods=*/ dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + let InsDPP8 = (ins DstRCDPP:$old, + Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + Src1ModDPP:$src1_modifiers, Src1DPP:$src1, + dpp8:$dpp8, FI:$fi); let HasExt = 1; let HasExtDPP = 1; + let HasExt32BitDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 1; } -def VOP_READLANE : VOPProfile<[i32, i32, i32]> { +def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]>; +def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>; + +def VOP_READLANE : VOPProfile<[i32, i32, i32, untyped]> { let Outs32 = (outs SReg_32:$vdst); let Outs64 = Outs32; let Ins32 = (ins VRegOrLds_32:$src0, SCSrc_b32:$src1); @@ -454,6 +561,7 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> { let HasExt = 0; let HasExtDPP = 0; + let HasExt32BitDPP = 0; let HasExt64BitDPP = 0; let HasExtSDWA = 0; let HasExtSDWA9 = 0; @@ -471,6 +579,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { let HasExt = 0; let HasExtDPP = 0; + let HasExt32BitDPP = 0; let HasExt64BitDPP = 0; let HasExtSDWA = 0; let HasExtSDWA9 = 0; @@ -480,31 +589,33 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { // VOP2 Instructions //===----------------------------------------------------------------------===// -defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>; +let 
SubtargetPredicate = isGFX11Plus in +defm V_CNDMASK_B16 : VOP2eInst <"v_cndmask_b16", VOP2e_I16_I16_I16_I1>; +defm V_CNDMASK_B32 : VOP2eInst_VOPD <"v_cndmask_b32", VOP2e_I32_I32_I32_I1, 0x9, "v_cndmask_b32">; let SubtargetPredicate = HasMadMacF32Insts, isReMaterializable = 1 in def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>; let isCommutable = 1 in { let isReMaterializable = 1 in { -defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, any_fadd>; -defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, any_fsub>; -defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">; -defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>; -defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, any_fmul>; +defm V_ADD_F32 : VOP2Inst_VOPD <"v_add_f32", VOP_F32_F32_F32, 0x4, "v_add_f32", any_fadd>; +defm V_SUB_F32 : VOP2Inst_VOPD <"v_sub_f32", VOP_F32_F32_F32, 0x5, "v_sub_f32", any_fsub>; +defm V_SUBREV_F32 : VOP2Inst_VOPD <"v_subrev_f32", VOP_F32_F32_F32, 0x6, "v_subrev_f32", null_frag, "v_sub_f32">; +defm V_MUL_LEGACY_F32 : VOP2Inst_VOPD <"v_mul_legacy_f32", VOP_F32_F32_F32, 0x7, "v_mul_dx9_zero_f32", AMDGPUfmul_legacy>; +defm V_MUL_F32 : VOP2Inst_VOPD <"v_mul_f32", VOP_F32_F32_F32, 0x3, "v_mul_f32", any_fmul>; defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32_ARITH, AMDGPUmul_i24>; defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>; defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32_ARITH, AMDGPUmul_u24>; defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>; -defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>; -defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>; +defm V_MIN_F32 : VOP2Inst_VOPD <"v_min_f32", VOP_F32_F32_F32, 0xb, "v_min_f32", fminnum_like>; +defm V_MAX_F32 : VOP2Inst_VOPD <"v_max_f32", VOP_F32_F32_F32, 0xa, "v_max_f32", fmaxnum_like>; defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>; defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>; defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>; defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umax>; defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, clshr_rev_32, "v_lshr_b32">; defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, cashr_rev_32, "v_ashr_i32">; -defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, clshl_rev_32, "v_lshl_b32">; -defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>; +defm V_LSHLREV_B32 : VOP2Inst_VOPD <"v_lshlrev_b32", VOP_I32_I32_I32, 0x11, "v_lshlrev_b32", clshl_rev_32, "v_lshl_b32">; +defm V_AND_B32 : VOP2Inst_VOPD <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, 0x12, "v_and_b32", and>; defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>; defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>; } // End isReMaterializable = 1 @@ -536,7 +647,7 @@ defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_f let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in { -defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_add_u32", 1>; +defm V_ADD_U32 : VOP2Inst_VOPD <"v_add_u32", VOP_I32_I32_I32_ARITH, 0x10, "v_add_nc_u32", null_frag, "v_add_u32", 1>; defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>; defm 
V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>; } @@ -555,20 +666,20 @@ def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, } // End isConvergent = 1 let isReMaterializable = 1 in { -defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>; -defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, add_ctpop>; -defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>; -defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>; -defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>; +defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>; +defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32, add_ctpop>; +defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>; +defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>; +defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>; let ReadsModeReg = 0, mayRaiseFPException = 0 in { -defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_i16_f32>; -defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_u16_f32>; +defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_i16_f32>; +defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_u16_f32>; } -defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_V2F16_F32_F32>, AMDGPUpkrtz_f16_f32>; -defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_u16_u32>; -defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_i16_i32>; +defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_V2F16_F32_F32, AMDGPUpkrtz_f16_f32>; +defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_V2I16_I32_I32, AMDGPUpk_u16_u32>; +defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_V2I16_I32_I32, AMDGPUpk_i16_i32>; let SubtargetPredicate = isGFX6GFX7 in { @@ -641,8 +752,9 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>; def : divergent_i64_BinOp <or, V_OR_B32_e64>; def : divergent_i64_BinOp <xor, V_XOR_B32_e64>; -let SubtargetPredicate = Has16BitInsts in { +let SubtargetPredicate = Has16BitInsts in { +let isReMaterializable = 1 in { let FPDPRounding = 1 in { def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; @@ -664,9 +776,7 @@ def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; } } // End FPDPRounding = 1 -defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>; -defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>; -defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">; + defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>; defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>; defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>; @@ -675,12 +785,19 @@ defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16, smax>; defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16, umin>; defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16, smin>; -let Constraints = "$vdst = $src2", 
DisableEncoding="$src2", - isConvertibleToThreeAddress = 1 in { -defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; +let SubtargetPredicate = isGFX8GFX9 in { + defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>; + defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>; + defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">; } } // End isCommutable = 1 +} // End isReMaterializable = 1 +// FIXME: Missing FPDPRounding +let Constraints = "$vdst = $src2", DisableEncoding="$src2", + isConvertibleToThreeAddress = 1, isCommutable = 1 in { +defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; +} } // End SubtargetPredicate = Has16BitInsts let SubtargetPredicate = HasDLInsts in { @@ -722,7 +839,7 @@ let Constraints = "$vdst = $src2", DisableEncoding = "$src2", isConvertibleToThreeAddress = 1, isCommutable = 1 in -defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>; +defm V_FMAC_F32 : VOP2Inst_VOPD <"v_fmac_f32", VOP_MAC_F32, 0x0, "v_fmac_f32">; } // End SubtargetPredicate = HasDLInsts @@ -750,7 +867,7 @@ let Constraints = "$vdst = $src2", isCommutable = 1, IsDOT = 1 in { let SubtargetPredicate = HasDot5Insts in - defm V_DOT2C_F32_F16 : VOP2Inst<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>; + defm V_DOT2C_F32_F16 : VOP2Inst_VOPD<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16, 0xc, "v_dot2acc_f32_f16">; let SubtargetPredicate = HasDot6Insts in defm V_DOT4C_I32_I8 : VOP2Inst<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>; @@ -788,20 +905,20 @@ let AddedComplexity = 30 in { } // End AddedComplexity = 30 let SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1 in { -def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">; +def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">, VOPD_Component<0x2, "v_fmamk_f32">; let isCommutable = 1 in -def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">; +def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">, VOPD_Component<0x1, "v_fmaak_f32">; } let SubtargetPredicate = isGFX10Plus in { -let FPDPRounding = 1 in { +let FPDPRounding = 1, isReMaterializable = 1 in { def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">; let isCommutable = 1 in def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">; -} // End FPDPRounding = 1 +} // End FPDPRounding = 1, isReMaterializable = 1 let Constraints = "$vdst = $src2", DisableEncoding="$src2", @@ -857,7 +974,7 @@ def : GCNPat < >; } -let Predicates = [Has16BitInsts] in { +let Predicates = [Has16BitInsts, isGFX8GFX9] in { // Undo sub x, c -> add x, -c canonicalization since c is more likely // an inline immediate than -c. 
@@ -867,9 +984,6 @@ def : GCNPat< (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1) >; - -let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { - def : GCNPat< (i32 (zext (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)))), (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1) @@ -885,7 +999,10 @@ defm : Arithmetic_i16_0Hi_Pats<umax, V_MAX_U16_e64>; defm : Arithmetic_i16_0Hi_Pats<clshl_rev_16, V_LSHLREV_B16_e64>; defm : Arithmetic_i16_0Hi_Pats<clshr_rev_16, V_LSHRREV_B16_e64>; defm : Arithmetic_i16_0Hi_Pats<cashr_rev_16, V_ASHRREV_I16_e64>; -} // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9] + +} // End Predicates = [Has16BitInsts, isGFX8GFX9] + +let Predicates = [Has16BitInsts] in { def : ZExt_i16_i1_Pat<zext>; def : ZExt_i16_i1_Pat<anyext>; @@ -917,8 +1034,16 @@ def : VOPBinOpClampPat<uaddsat, V_ADD_U16_e64, i16>; def : VOPBinOpClampPat<usubsat, V_SUB_U16_e64, i16>; } +let SubtargetPredicate = isGFX11Plus in { + let isCommutable = 1 in { + defm V_AND_B16 : VOP2Inst <"v_and_b16", VOP_I16_I16_I16, and>; + defm V_OR_B16 : VOP2Inst <"v_or_b16", VOP_I16_I16_I16, or>; + defm V_XOR_B16 : VOP2Inst <"v_xor_b16", VOP_I16_I16_I16, xor>; + } // End isCommutable = 1 +} // End SubtargetPredicate = isGFX11Plus + //===----------------------------------------------------------------------===// -// Target-specific instruction encodings. +// DPP Encodings //===----------------------------------------------------------------------===// class VOP2_DPP<bits<6> op, VOP2_DPP_Pseudo ps, @@ -947,10 +1072,10 @@ class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, let OtherPredicates = ps.OtherPredicates; } -class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, +class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, int subtarget, string opName = ps.OpName, VOPProfile p = ps.Pfl> : Base_VOP2_DPP16<op, ps, opName, p>, - SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10>; + SIMCInstr <ps.PseudoInstr, subtarget>; class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps, VOPProfile p = ps.Pfl> : @@ -973,10 +1098,253 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps, } //===----------------------------------------------------------------------===// +// GFX11. 
+//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { + //===------------------------------- VOP2 -------------------------------===// + multiclass VOP2Only_Real_MADK_gfx11<bits<6> op> { + def _gfx11 : + VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.GFX11>, + VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>; + } + multiclass VOP2_Real_e32_gfx11<bits<6> op> { + def _e32_gfx11 : + VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX11>, + VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>; + } + multiclass VOP2Only_Real_e32_gfx11<bits<6> op> { + let IsSingle = 1 in + defm NAME: VOP2_Real_e32_gfx11<op>; + } + multiclass VOP2_Real_e64_gfx11<bits<6> op> { + def _e64_gfx11 : + VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX11>, + VOP3e_gfx11<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; + } + multiclass VOP2_Real_dpp_gfx11<bits<6> op> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx11 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX11> { + let DecoderNamespace = "DPPGFX11"; + } + } + multiclass VOP2_Real_dpp8_gfx11<bits<6> op> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp8_gfx11 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> { + let DecoderNamespace = "DPP8GFX11"; + } + } + + //===------------------------- VOP2 (with name) -------------------------===// + multiclass VOP2_Real_e32_with_name_gfx11<bits<6> op, string opName, + string asmName, bit single = 0> { + defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); + def _e32_gfx11 : + VOP2_Real<ps, SIEncodingFamily.GFX11, asmName>, + VOP2e<op{5-0}, ps.Pfl>, + MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]> { + let AsmString = asmName # ps.AsmOperands; + let IsSingle = single; + } + } + multiclass VOP2_Real_e64_with_name_gfx11<bits<6> op, string opName, + string asmName> { + defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); + def _e64_gfx11 : + VOP3_Real<ps, SIEncodingFamily.GFX11>, + VOP3e_gfx11<{0, 1, 0, 0, op{5-0}}, ps.Pfl>, + MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]> { + let AsmString = asmName # ps.AsmOperands; + } + } + + multiclass VOP2_Real_dpp_with_name_gfx11<bits<6> op, string opName, + string asmName> { + defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); + foreach _ = BoolToList<ps.Pfl.HasExtDPP>.ret in + def _dpp_gfx11 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), + SIEncodingFamily.GFX11> { + let AsmString = asmName # ps.Pfl.AsmDPP16; + let DecoderNamespace = "DPPGFX11"; + } + } + multiclass VOP2_Real_dpp8_with_name_gfx11<bits<6> op, string opName, + string asmName> { + defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); + foreach _ = BoolToList<ps.Pfl.HasExtDPP>.ret in + def _dpp8_gfx11 : VOP2_DPP8<op, ps> { + let AsmString = asmName # ps.Pfl.AsmDPP8; + let DecoderNamespace = "DPP8GFX11"; + } + } + + //===------------------------------ VOP2be ------------------------------===// + multiclass VOP2be_Real_e32_gfx11<bits<6> op, string opName, string asmName> { + defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); + def _e32_gfx11 : + VOP2_Real<ps, SIEncodingFamily.GFX11>, + VOP2e<op{5-0}, ps.Pfl> { + let AsmString = asmName # !subst(", vcc", "", ps.AsmOperands); + } + } + multiclass VOP2be_Real_dpp_gfx11<bits<6> op, string opName, string asmName> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx11 : + 
VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX11, asmName> { + string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst(", vcc", "", AsmDPP); + let DecoderNamespace = "DPPGFX11"; + } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_w32_gfx11 : + Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { + string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP); + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_w64_gfx11 : + Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { + string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; + let AsmString = asmName # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + multiclass VOP2be_Real_dpp8_gfx11<bits<6> op, string opName, string asmName> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp8_gfx11 : + VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { + string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst(", vcc", "", AsmDPP8); + let DecoderNamespace = "DPP8GFX11"; + } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp8_w32_gfx11 : + VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { + string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8); + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp8_w64_gfx11 : + VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { + string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; + let AsmString = asmName # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + +} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" + +// We don't want to override separate decoderNamespaces within these +multiclass VOP2_Realtriple_e64_gfx11<bits<6> op> { + defm NAME : VOP3_Realtriple_gfx11<{0, 1, 0, 0, op{5-0}}, /*isSingle=*/ 0, NAME> ; +} +multiclass VOP2_Realtriple_e64_with_name_gfx11<bits<6> op, string opName, + string asmName> { + defm NAME : VOP3_Realtriple_with_name_gfx11<{0, 1, 0, 0, op{5-0}}, opName, asmName> ; +} + +multiclass VOP2be_Real_gfx11<bits<6> op, string opName, string asmName> : + VOP2be_Real_e32_gfx11<op, opName, asmName>, + VOP3be_Realtriple_gfx11<{0, 1, 0, 0, op{5-0}}, /*isSingle=*/ 0, opName, asmName>, + VOP2be_Real_dpp_gfx11<op, opName, asmName>, + VOP2be_Real_dpp8_gfx11<op, opName, asmName>; + +// Only for CNDMASK +multiclass VOP2e_Real_gfx11<bits<6> op, string opName, string asmName> : + VOP2_Real_e32_gfx11<op>, + VOP2_Realtriple_e64_gfx11<op>, + VOP2be_Real_dpp_gfx11<op, opName, asmName>, + VOP2be_Real_dpp8_gfx11<op, opName, asmName>; + +multiclass VOP2Only_Real_gfx11<bits<6> op> : + VOP2Only_Real_e32_gfx11<op>, + VOP2_Real_dpp_gfx11<op>, + VOP2_Real_dpp8_gfx11<op>; + +multiclass VOP2_Real_NO_VOP3_gfx11<bits<6> op> : + VOP2_Real_e32_gfx11<op>, VOP2_Real_dpp_gfx11<op>, VOP2_Real_dpp8_gfx11<op>; + +multiclass VOP2_Real_FULL_gfx11<bits<6> op> : + VOP2_Realtriple_e64_gfx11<op>, VOP2_Real_NO_VOP3_gfx11<op>; + +multiclass VOP2_Real_NO_VOP3_with_name_gfx11<bits<6> op, string opName, + string asmName, bit 
isSingle = 0> : + VOP2_Real_e32_with_name_gfx11<op, opName, asmName, isSingle>, + VOP2_Real_dpp_with_name_gfx11<op, opName, asmName>, + VOP2_Real_dpp8_with_name_gfx11<op, opName, asmName>; + +multiclass VOP2_Real_FULL_with_name_gfx11<bits<6> op, string opName, + string asmName> : + VOP2_Realtriple_e64_with_name_gfx11<op, opName, asmName>, + VOP2_Real_NO_VOP3_with_name_gfx11<op, opName, asmName>; + +multiclass VOP2_Real_NO_DPP_gfx11<bits<6> op> : + VOP2_Real_e32_gfx11<op>, VOP2_Real_e64_gfx11<op>; + +multiclass VOP2_Real_NO_DPP_with_name_gfx11<bits<6> op, string opName, + string asmName> : + VOP2_Real_e32_with_name_gfx11<op, opName, asmName>, + VOP2_Real_e64_with_name_gfx11<op, opName, asmName>; + +defm V_CNDMASK_B32 : VOP2e_Real_gfx11<0x001, "V_CNDMASK_B32", + "v_cndmask_b32">; +defm V_DOT2ACC_F32_F16 : VOP2_Real_NO_VOP3_with_name_gfx11<0x002, + "V_DOT2C_F32_F16", "v_dot2acc_f32_f16", 1>; +defm V_FMAC_DX9_ZERO_F32 : VOP2_Real_NO_DPP_with_name_gfx11<0x006, + "V_FMAC_LEGACY_F32", "v_fmac_dx9_zero_f32">; +defm V_MUL_DX9_ZERO_F32 : VOP2_Real_FULL_with_name_gfx11<0x007, + "V_MUL_LEGACY_F32", "v_mul_dx9_zero_f32">; +defm V_LSHLREV_B32 : VOP2_Real_FULL_gfx11<0x018>; +defm V_LSHRREV_B32 : VOP2_Real_FULL_gfx11<0x019>; +defm V_ASHRREV_I32 : VOP2_Real_FULL_gfx11<0x01a>; +defm V_ADD_CO_CI_U32 : + VOP2be_Real_gfx11<0x020, "V_ADDC_U32", "v_add_co_ci_u32">; +defm V_SUB_CO_CI_U32 : + VOP2be_Real_gfx11<0x021, "V_SUBB_U32", "v_sub_co_ci_u32">; +defm V_SUBREV_CO_CI_U32 : + VOP2be_Real_gfx11<0x022, "V_SUBBREV_U32", "v_subrev_co_ci_u32">; + +defm V_CVT_PK_RTZ_F16_F32 : VOP2_Real_FULL_with_name_gfx11<0x02f, + "V_CVT_PKRTZ_F16_F32", "v_cvt_pk_rtz_f16_f32">; +defm V_PK_FMAC_F16 : VOP2Only_Real_gfx11<0x03c>; + +// VOP3 only. +defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11<0x25d>; +defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11<0x31c>; +defm V_BFM_B32 : VOP3Only_Realtriple_gfx11<0x31d>; +defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11<0x31e>; +defm V_MBCNT_LO_U32_B32 : VOP3Only_Realtriple_gfx11<0x31f>; +defm V_MBCNT_HI_U32_B32 : VOP3Only_Realtriple_gfx11<0x320>; +defm V_CVT_PKNORM_I16_F32 : VOP3Only_Realtriple_gfx11<0x321>; +defm V_CVT_PKNORM_U16_F32 : VOP3Only_Realtriple_gfx11<0x322>; +defm V_CVT_PK_U16_U32 : VOP3Only_Realtriple_gfx11<0x323>; +defm V_CVT_PK_I16_I32 : VOP3Only_Realtriple_gfx11<0x324>; +defm V_ADD_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x300>; +defm V_SUB_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x301>; +defm V_SUBREV_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x302>; + +let SubtargetPredicate = isGFX11Plus in { + defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx11>; + + defm : VOP2bInstAliases< + V_ADDC_U32_e32, V_ADD_CO_CI_U32_e32_gfx11, "v_add_co_ci_u32">; + defm : VOP2bInstAliases< + V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx11, "v_sub_co_ci_u32">; + defm : VOP2bInstAliases< + V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx11, "v_subrev_co_ci_u32">; +} // End SubtargetPredicate = isGFX11Plus + +//===----------------------------------------------------------------------===// // GFX10. 
//===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { //===------------------------------- VOP2 -------------------------------===// multiclass VOP2Only_Real_MADK_gfx10<bits<6> op> { def _gfx10 : @@ -1011,13 +1379,13 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } } multiclass VOP2_Real_dpp_gfx10<bits<6> op> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in - def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in + def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX10> { let DecoderNamespace = "SDWA10"; } } multiclass VOP2_Real_dpp8_gfx10<bits<6> op> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> { let DecoderNamespace = "DPP8"; } @@ -1056,15 +1424,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } multiclass VOP2_Real_dpp_gfx10_with_name<bits<6> op, string opName, string asmName> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in - def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp")> { + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in + def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX10> { VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP16; } } multiclass VOP2_Real_dpp8_gfx10_with_name<bits<6> op, string opName, string asmName> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP8; @@ -1122,14 +1490,14 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } } multiclass VOP2be_Real_dpp_gfx10<bits<6> op, string opName, string asmName> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp_gfx10 : - VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { + VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX10, asmName> { string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; let AsmString = asmName # !subst(", vcc", "", AsmDPP); let DecoderNamespace = "SDWA10"; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp_w32_gfx10 : Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; @@ -1137,7 +1505,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in def 
_dpp_w64_gfx10 : Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; @@ -1147,14 +1515,14 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { } } multiclass VOP2be_Real_dpp8_gfx10<bits<6> op, string opName, string asmName> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; let AsmString = asmName # !subst(", vcc", "", AsmDPP8); let DecoderNamespace = "DPP8"; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp8_w32_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; @@ -1162,7 +1530,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in def _dpp8_w64_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; @@ -1189,7 +1557,10 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let IsSingle = 1; } } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" + +multiclass VOP2Only_Real_MADK_gfx10_gfx11<bits<6> op> : + VOP2Only_Real_MADK_gfx10<op>, VOP2Only_Real_MADK_gfx11<op>; multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> : VOP2be_Real_e32_gfx10<op, opName, asmName>, @@ -1209,7 +1580,10 @@ multiclass VOP2_Real_gfx10<bits<6> op> : VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>, VOP2_Real_sdwa_gfx10<op>, VOP2_Real_dpp_gfx10<op>, VOP2_Real_dpp8_gfx10<op>; -multiclass VOP2_Real_gfx10_with_name<bits<6> op, string opName, +multiclass VOP2_Real_gfx10_gfx11<bits<6> op> : + VOP2_Real_gfx10<op>, VOP2_Real_FULL_gfx11<op>; + +multiclass VOP2_Real_with_name_gfx10<bits<6> op, string opName, string asmName> : VOP2_Real_e32_gfx10_with_name<op, opName, asmName>, VOP2_Real_e64_gfx10_with_name<op, opName, asmName>, @@ -1217,36 +1591,41 @@ multiclass VOP2_Real_gfx10_with_name<bits<6> op, string opName, VOP2_Real_dpp_gfx10_with_name<op, opName, asmName>, VOP2_Real_dpp8_gfx10_with_name<op, opName, asmName>; +multiclass VOP2_Real_with_name_gfx10_gfx11<bits<6> op, string opName, + string asmName> : + VOP2_Real_with_name_gfx10<op, opName, asmName>, + VOP2_Real_FULL_with_name_gfx11<op, opName, asmName>; + // NB: Same opcode as v_mac_legacy_f32 let DecoderNamespace = "GFX10_B" in defm V_FMAC_LEGACY_F32 : VOP2_Real_gfx10<0x006>; -defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>; -defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>; -defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>; -defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10<0x02d>; -defm V_ADD_F16 : VOP2_Real_gfx10<0x032>; -defm V_SUB_F16 : VOP2_Real_gfx10<0x033>; -defm V_SUBREV_F16 : VOP2_Real_gfx10<0x034>; -defm V_MUL_F16 : VOP2_Real_gfx10<0x035>; -defm V_FMAC_F16 : VOP2_Real_gfx10<0x036>; -defm V_FMAMK_F16 : VOP2Only_Real_MADK_gfx10<0x037>; -defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10<0x038>; -defm 
V_MAX_F16 : VOP2_Real_gfx10<0x039>; -defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>; -defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; +defm V_XNOR_B32 : VOP2_Real_gfx10_gfx11<0x01e>; +defm V_FMAC_F32 : VOP2_Real_gfx10_gfx11<0x02b>; +defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10_gfx11<0x02c>; +defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10_gfx11<0x02d>; +defm V_ADD_F16 : VOP2_Real_gfx10_gfx11<0x032>; +defm V_SUB_F16 : VOP2_Real_gfx10_gfx11<0x033>; +defm V_SUBREV_F16 : VOP2_Real_gfx10_gfx11<0x034>; +defm V_MUL_F16 : VOP2_Real_gfx10_gfx11<0x035>; +defm V_FMAC_F16 : VOP2_Real_gfx10_gfx11<0x036>; +defm V_FMAMK_F16 : VOP2Only_Real_MADK_gfx10_gfx11<0x037>; +defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10_gfx11<0x038>; +defm V_MAX_F16 : VOP2_Real_gfx10_gfx11<0x039>; +defm V_MIN_F16 : VOP2_Real_gfx10_gfx11<0x03a>; +defm V_LDEXP_F16 : VOP2_Real_gfx10_gfx11<0x03b>; let IsSingle = 1 in { -defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; + defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; } // VOP2 no carry-in, carry-out. defm V_ADD_NC_U32 : - VOP2_Real_gfx10_with_name<0x025, "V_ADD_U32", "v_add_nc_u32">; + VOP2_Real_with_name_gfx10_gfx11<0x025, "V_ADD_U32", "v_add_nc_u32">; defm V_SUB_NC_U32 : - VOP2_Real_gfx10_with_name<0x026, "V_SUB_U32", "v_sub_nc_u32">; + VOP2_Real_with_name_gfx10_gfx11<0x026, "V_SUB_U32", "v_sub_nc_u32">; defm V_SUBREV_NC_U32 : - VOP2_Real_gfx10_with_name<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">; + VOP2_Real_with_name_gfx10_gfx11<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">; // VOP2 carry-in, carry-out. defm V_ADD_CO_CI_U32 : @@ -1275,7 +1654,7 @@ defm V_ADD_CO_U32 : VOP3beOnly_Real_gfx10<0x30f>; defm V_SUB_CO_U32 : VOP3beOnly_Real_gfx10<0x310>; defm V_SUBREV_CO_U32 : VOP3beOnly_Real_gfx10<0x319>; -let SubtargetPredicate = isGFX10Plus in { +let SubtargetPredicate = isGFX10Only in { defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx10>; defm : VOP2bInstAliases< @@ -1284,10 +1663,10 @@ let SubtargetPredicate = isGFX10Plus in { V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx10, "v_sub_co_ci_u32">; defm : VOP2bInstAliases< V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx10, "v_subrev_co_ci_u32">; -} // End SubtargetPredicate = isGFX10Plus +} // End SubtargetPredicate = isGFX10Only //===----------------------------------------------------------------------===// -// GFX6, GFX7, GFX10. 
+// GFX6, GFX7, GFX10, GFX11 //===----------------------------------------------------------------------===// class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : @@ -1338,6 +1717,9 @@ multiclass VOP2_Real_gfx6_gfx7<bits<6> op> : multiclass VOP2_Real_gfx6_gfx7_gfx10<bits<6> op> : VOP2_Real_gfx6_gfx7<op>, VOP2_Real_gfx10<op>; +multiclass VOP2_Real_gfx6_gfx7_gfx10_gfx11<bits<6> op> : + VOP2_Real_gfx6_gfx7_gfx10<op>, VOP2_Real_FULL_gfx11<op>; + multiclass VOP2be_Real_gfx6_gfx7<bits<6> op> : VOP2_Real_e32_gfx6_gfx7<op>, VOP2be_Real_e64_gfx6_gfx7<op>; @@ -1398,28 +1780,28 @@ let SubtargetPredicate = isGFX6GFX7 in { def : VOP2e64InstAlias<V_SUBREV_CO_U32_e64, V_SUBREV_I32_e64_gfx6_gfx7>; } // End SubtargetPredicate = isGFX6GFX7 -defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x003>; -defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x004>; -defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x005>; +defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x003>; +defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x004>; +defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x005>; defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>; defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>; -defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x008>; -defm V_MUL_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x009>; -defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10<0x00a>; -defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00b>; -defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10<0x00c>; -defm V_MIN_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x00f>; -defm V_MAX_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x010>; -defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x011>; -defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x012>; -defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x013>; -defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10<0x014>; +defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x008>; +defm V_MUL_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x009>; +defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00a>; +defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00b>; +defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00c>; +defm V_MIN_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00f>; +defm V_MAX_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x010>; +defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x011>; +defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x012>; +defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x013>; +defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x014>; defm V_LSHRREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x016>; defm V_ASHRREV_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x018>; defm V_LSHLREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01a>; -defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01b>; -defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01c>; -defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01d>; +defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01b>; +defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01c>; +defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01d>; defm V_MAC_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x01f>; defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x02f>; defm V_MADMK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x020>; @@ -1436,6 +1818,13 @@ multiclass VOP2_Real_MADK_vi <bits<6> op> { VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>; } +multiclass VOP2_Real_MADK_gfx940 <bits<6> op> { + def _gfx940 : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.GFX940>, + VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl> { + let DecoderNamespace = "GFX9"; + } +} + multiclass VOP2_Real_e32_vi <bits<6> op> { def _e32_vi : 
VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>, @@ -1736,6 +2125,11 @@ let SubtargetPredicate = isGFX90APlus in { } } // End SubtargetPredicate = isGFX90APlus +let SubtargetPredicate = HasFmaakFmamkF32Insts in { +defm V_FMAMK_F32 : VOP2_Real_MADK_gfx940 <0x17>; +defm V_FMAAK_F32 : VOP2_Real_MADK_gfx940 <0x18>; +} + multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> { def _dpp_vi : VOP2_DPP<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>; } diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 494e3aeb6d55..dddd0aacc140 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -6,191 +6,25 @@ // //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// VOP3 Classes -//===----------------------------------------------------------------------===// - -class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { - dag src0 = !if(P.HasOMod, - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); - - list<dag> ret3 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret (P.Src0VT src0), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))]; - - list<dag> ret2 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret (P.Src0VT src0), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))]; - - list<dag> ret1 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret (P.Src0VT src0)))]; - - list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, - !if(!eq(P.NumSrcArgs, 2), ret2, - ret1)); -} - -class getVOP3PModPat<VOPProfile P, SDPatternOperator node, bit HasExplicitClamp> { - dag src0_dag = (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)); - dag src1_dag = (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)); - dag src2_dag = (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers)); - dag clamp_dag = (i1 timm:$clamp); - - list<dag> ret3 = [(set P.DstVT:$vdst, - !if(HasExplicitClamp, - (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag, clamp_dag), - (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag)))]; - - list<dag> ret2 = [(set P.DstVT:$vdst, - !if(HasExplicitClamp, - (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, clamp_dag), - (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag)))]; - - list<dag> ret1 = [(set P.DstVT:$vdst, - !if(HasExplicitClamp, - (DivergentFragOrOp<node, P>.ret src0_dag, clamp_dag), - (DivergentFragOrOp<node, P>.ret src0_dag)))]; - - list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, - !if(!eq(P.NumSrcArgs, 2), ret2, - ret1)); -} - -class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> { - list<dag> ret3 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)), - (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))]; - - list<dag> ret2 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)), - (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))]; - - list<dag> ret1 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))))]; - - list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, - 
!if(!eq(P.NumSrcArgs, 2), ret2, - ret1)); -} - -class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> { - list<dag> ret3 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers), - (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))), - (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))]; - - list<dag> ret2 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers)), - (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))), - (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))]; - - list<dag> ret1 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))))]; - - list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, - !if(!eq(P.NumSrcArgs, 2), ret2, - ret1)); -} - -class getVOP3Pat<VOPProfile P, SDPatternOperator node> { - list<dag> ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))]; - list<dag> ret2 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret P.Src0VT:$src0, P.Src1VT:$src1))]; - list<dag> ret1 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret P.Src0VT:$src0))]; - list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, - !if(!eq(P.NumSrcArgs, 2), ret2, - ret1)); -} - -class getVOP3ClampPat<VOPProfile P, SDPatternOperator node> { - list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i1:$clamp))]; - list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, i1:$clamp))]; - list<dag> ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, i1:$clamp))]; - list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, - !if(!eq(P.NumSrcArgs, 2), ret2, - ret1)); -} - -class getVOP3MAIPat<VOPProfile P, SDPatternOperator node> { - list<dag> ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, - timm:$cbsz, timm:$abid, timm:$blgp))]; -} - -// Consistently gives instructions a _e64 suffix. -multiclass VOP3Inst_Pseudo_Wrapper<string opName, VOPProfile P, list<dag> pattern = []> { - def _e64 : VOP3_Pseudo<opName, P, pattern>; -} - -class VOP3InstBase<string OpName, VOPProfile P, SDPatternOperator node = null_frag> : - VOP3_Pseudo<OpName, P, - !if(P.HasOpSel, - !if(P.HasModifiers, - getVOP3OpSelModPat<P, node>.ret, - getVOP3OpSelPat<P, node>.ret), - !if(P.HasModifiers, - getVOP3ModPat<P, node>.ret, - !if(P.HasIntClamp, - getVOP3ClampPat<P, node>.ret, - !if (P.IsMAI, - getVOP3MAIPat<P, node>.ret, - getVOP3Pat<P, node>.ret)))), - 0, P.HasOpSel> { - - let IntClamp = P.HasIntClamp; - let AsmMatchConverter = - !if(P.HasOpSel, - "cvtVOP3OpSel", - !if(!or(P.HasModifiers, P.HasOMod, P.HasIntClamp), - "cvtVOP3", - "")); -} - -multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> { - def _e64 : VOP3InstBase<OpName, P, node>; -} - // Special case for v_div_fmas_{f32|f64}, since it seems to be the // only VOP instruction that implicitly reads VCC. 
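As context for the special-case profiles that follow: v_div_fmas computes an FMA whose result is conditionally scaled by a large power of two, with the condition taken from VCC; that implicit VCC read is what these profiles have to model. A minimal sketch of the per-lane behaviour as I read the ISA documentation (illustrative only, not code from this patch):

#include <cmath>

// Hypothetical model of v_div_fmas_f32: the fused result is scaled by
// 2^32 when the lane's VCC bit is set (2^64 for the f64 variant, per my
// reading of the ISA docs). The conditional scale is what forces the
// implicit VCC read mentioned above.
float div_fmas_f32(float s0, float s1, float s2, bool vcc) {
  float r = std::fma(s0, s1, s2);
  return vcc ? std::ldexp(r, 32) : r;
}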
let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in { def VOP_F32_F32_F32_F32_VCC : VOPProfile<[f32, f32, f32, f32]> { let Outs64 = (outs DstRC.RegClass:$vdst); + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; } def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> { let Outs64 = (outs DstRC.RegClass:$vdst); } } -class VOP3Features<bit Clamp, bit OpSel, bit Packed, bit MAI> { - bit HasClamp = Clamp; - bit HasOpSel = OpSel; - bit IsPacked = Packed; - bit IsMAI = MAI; -} - -def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>; -def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>; -def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>; -def VOP3_PACKED : VOP3Features<1, 1, 1, 0>; -def VOP3_MAI : VOP3Features<0, 0, 0, 1>; - -class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> { - - let HasClamp = !if(Features.HasClamp, 1, P.HasClamp); - let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel); - let IsMAI = !if(Features.IsMAI, 1, P.IsMAI); - let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); - - let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers)); - let IsSingle = 1; -} - class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod"; let IsSingle = 1; + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; } def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>; @@ -198,12 +32,22 @@ def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>; def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let HasClamp = 1; - let IsSingle = 1; + let IsSingle = 1; let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp"; } +class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> { + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; +} + +def DIV_FIXUP_F32_PROF : VOP3_Profile<VOP_F32_F32_F32_F32> { + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; +} + //===----------------------------------------------------------------------===// // VOP3 INTERP //===----------------------------------------------------------------------===// @@ -304,10 +148,10 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_l } // End SchedRW = [WriteDoubleAdd] let SchedRW = [WriteIntMul] in { -defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, DivergentBinFrag<mul>>; -defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>; -defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>; -defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>; +defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", V_MUL_PROF<VOP_I32_I32_I32>, DivergentBinFrag<mul>>; +defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF<VOP_I32_I32_I32>, mulhu>; +defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF<VOP_I32_I32_I32>>; +defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs>; } // End SchedRW = [WriteIntMul] } // End isReMaterializable = 1 @@ -367,7 +211,7 @@ let isCommutable = 1 in { } // End isCommutable = 1 defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>; -defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>; +defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdiv_fixup>; let SchedRW = [WriteDoubleAdd], 
FPDPRounding = 1 in { defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>; @@ -419,9 +263,9 @@ def : GCNPat< >; let isReMaterializable = 1 in { -let SubtargetPredicate = isGFX6GFX7GFX10 in { +let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; -} // End SubtargetPredicate = isGFX6GFX7GFX10 +} // End SubtargetPredicate = isGFX6GFX7GFX10Plus let SchedRW = [Write32Bit] in { let SubtargetPredicate = isGFX8Plus in { @@ -430,21 +274,30 @@ defm V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMD } // End SchedRW = [Write32Bit] } // End isReMaterializable = 1 -let SubtargetPredicate = isGFX7Plus in { +def VOPProfileMQSAD : VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP> { + let HasModifiers = 0; +} +let SubtargetPredicate = isGFX7Plus in { let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>; -defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP>>; +defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>; } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] +} // End SubtargetPredicate = isGFX7Plus let isCommutable = 1 in { let SchedRW = [WriteIntMul, WriteSALU] in { +let SubtargetPredicate = isGFX7GFX8GFX9GFX10 in { defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; +} +let SubtargetPredicate = isGFX11Only, Constraints = "@earlyclobber $vdst" in { +defm V_MAD_U64_U32_gfx11 : VOP3Inst <"v_mad_u64_u32_gfx11", VOP3b_I64_I1_I32_I32_I64>; +defm V_MAD_I64_I32_gfx11 : VOP3Inst <"v_mad_i64_i32_gfx11", VOP3b_I64_I1_I32_I32_I64>; +} // End SubtargetPredicate = isGFX11Only, Constraints = "@earlyclobber $vdst" } // End SchedRW = [WriteIntMul, WriteSALU] } // End isCommutable = 1 -} // End SubtargetPredicate = isGFX7Plus let FPDPRounding = 1 in { let Predicates = [Has16BitInsts, isGFX8Only] in { @@ -557,7 +410,7 @@ defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9_e64>; } // End Predicates = [Has16BitInsts, isGFX10Plus] -class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag< +class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag< (ops node:$x, node:$y, node:$z), // When the inner operation is used multiple times, selecting 3-op // instructions may still be beneficial -- if the other users can be @@ -587,7 +440,9 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag< return true; }]> { let PredicateCodeUsesOperands = 1; +} +class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : ThreeOpFragSDAG<op1, op2> { // The divergence predicate is irrelevant in GlobalISel, as we have // proper register bank checks. We just need to verify the constant // bus restriction when all the sources are considered. 
@@ -609,6 +464,23 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag< }]; } +def shl_0_to_4 : PatFrag< + (ops node:$src0, node:$src1), (shl node:$src0, node:$src1), + [{ + if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + return C->getZExtValue() <= 4; + } + return false; + }]> { + let GISelPredicateCode = [{ + int64_t Imm = 0; + if (!mi_match(MI.getOperand(2).getReg(), MRI, m_ICst(Imm)) && + !mi_match(MI.getOperand(2).getReg(), MRI, m_Copy(m_ICst(Imm)))) + return false; + return (uint64_t)Imm <= 4; + }]; +} + let SubtargetPredicate = isGFX9Plus in { let isCommutable = 1, isReMaterializable = 1 in { defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; @@ -649,6 +521,10 @@ defm V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; } // End isReMaterializable = 1 +// V_LSHL_ADD_U64: D0.u64 = (S0.u64 << S1.u[2:0]) + S2.u64 +// src0 is shifted left by 0-4 (use "0" to get ADD_U64). +let SubtargetPredicate = isGFX940Plus in +defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I64>>; class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat < // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions. @@ -664,6 +540,12 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>; def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>; def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>; +let SubtargetPredicate = isGFX940Plus in +def : GCNPat< + (ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2), + (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) +>; + def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>; def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>; @@ -688,6 +570,33 @@ def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>; def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>; } // End SubtargetPredicate = isGFX9Plus +// FIXME: GlobalISel in general does not handle instructions with 2 results, +// so it cannot use these patterns. +multiclass IMAD32_Pats <VOP3_Pseudo inst> { + def : GCNPat < + (ThreeOpFrag<mul, add> i32:$src0, i32:$src1, i32:$src2), + (EXTRACT_SUBREG (inst $src0, $src1, + (REG_SEQUENCE SReg_64, // Use scalar and let it be legalized + $src2, sub0, + (i32 (IMPLICIT_DEF)), sub1), + 0 /* clamp */), + sub0) + >; + // Immediate src2 in the pattern above will not fold because it would be partially + // undef. Hence define a specialized pattern for this case. + // FIXME: GlobalISel pattern exporter fails to export a pattern like this and asserts, + // so make it SDAG only.
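To make the IMAD32_Pats trick above concrete: v_mad_u64_u32 produces the full 64-bit product-plus-addend, and the pattern recovers a plain 32-bit mul+add by feeding src2 in as the low half of a 64-bit value (high half undef) and keeping only sub0 of the result. A sketch of why that is sound (my illustration, not code from this patch):

#include <cstdint>

// v_mad_u64_u32 computes roughly {hi, lo} = zext(s0) * zext(s1) + s2_64.
// The low 32 bits of that sum do not depend on the undefined high half of
// s2_64, so extracting sub0 yields exactly the 32-bit mul+add.
uint32_t imad32(uint32_t s0, uint32_t s1, uint32_t s2) {
  uint64_t full = (uint64_t)s0 * s1 + s2;  // high half of s2_64 taken as 0
  return (uint32_t)full;                   // EXTRACT_SUBREG ..., sub0
}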
+ def : GCNPat < + (ThreeOpFragSDAG<mul, add> i32:$src0, i32:$src1, (i32 imm:$src2)), + (EXTRACT_SUBREG (inst $src0, $src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0) + >; +} + +let SubtargetPredicate = isGFX9GFX10 in // exclude pre-GFX9 where it was slow +defm : IMAD32_Pats<V_MAD_U64_U32_e64>; +let SubtargetPredicate = isGFX11Only in +defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>; + def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> { let Src0RC64 = VRegSrc_32; let Src1RC64 = SCSrc_b32; @@ -697,6 +606,8 @@ def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3 IntOpSelMods:$src2_modifiers, SCSrc_b32:$src2, VGPR_32:$vdst_in, op_sel0:$op_sel); let HasClamp = 0; + let HasExtVOP3DPP = 0; + let HasExtDPP = 0; } class PermlanePat<SDPatternOperator permlane, @@ -753,6 +664,20 @@ let SubtargetPredicate = isGFX10Plus in { def : PermlaneDiscardVDstIn< BoundControlOrFetchInvalidPermlane<int_amdgcn_permlanex16>, V_PERMLANEX16_B32_e64>; + + defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>; + defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>; + + def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_e64>; + def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_e64>; + + // Undo sub x, c -> add x, -c canonicalization since c is more likely + // an inline immediate than -c. + def : GCNPat< + (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)), + (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0) + >; + } // End SubtargetPredicate = isGFX10Plus class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat< @@ -773,6 +698,36 @@ def : DivFmasPat<f32, V_DIV_FMAS_F32_e64, VCC_LO>; def : DivFmasPat<f64, V_DIV_FMAS_F64_e64, VCC_LO>; } +class VOP3_DOT_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile<P, Features> { + // FIXME VOP3 DPP versions are unsupported + let HasExtVOP3DPP = 0; + let HasClamp = 0; + let HasOMod = 0; + let InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64, + NumSrcArgs, HasClamp, HasOMod, + !if(isFloatType<Src0VT>.ret, FPVRegInputMods, IntOpSelMods), + !if(isFloatType<Src1VT>.ret, FPVRegInputMods, IntOpSelMods), + !if(isFloatType<Src2VT>.ret, FPVRegInputMods, IntOpSelMods)>.ret; +} + +let SubtargetPredicate = isGFX11Plus in { + defm V_MAXMIN_F32 : VOP3Inst<"v_maxmin_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; + defm V_MINMAX_F32 : VOP3Inst<"v_minmax_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; + defm V_MAXMIN_F16 : VOP3Inst<"v_maxmin_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>; + defm V_MINMAX_F16 : VOP3Inst<"v_minmax_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>; + defm V_MAXMIN_U32 : VOP3Inst<"v_maxmin_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + defm V_MINMAX_U32 : VOP3Inst<"v_minmax_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + defm V_MAXMIN_I32 : VOP3Inst<"v_maxmin_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + defm V_MINMAX_I32 : VOP3Inst<"v_minmax_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + defm V_CVT_PK_I16_F32 : VOP3Inst<"v_cvt_pk_i16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>; + defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>; +} // End SubtargetPredicate = isGFX11Plus + +let SubtargetPredicate = HasDot8Insts in { + defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>, int_amdgcn_fdot2_f16_f16>; + defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>, 
int_amdgcn_fdot2_bf16_bf16>; +} + //===----------------------------------------------------------------------===// // Integer Clamp Patterns //===----------------------------------------------------------------------===// @@ -813,16 +768,137 @@ def : IntClampPat<V_MQSAD_PK_U16_U8_e64, int_amdgcn_mqsad_pk_u16_u8>; def : IntClampPat<V_QSAD_PK_U16_U8_e64, int_amdgcn_qsad_pk_u16_u8>; def : IntClampPat<V_MQSAD_U32_U8_e64, int_amdgcn_mqsad_u32_u8>; - //===----------------------------------------------------------------------===// // Target-specific instruction encodings. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// +// GFX11. +//===----------------------------------------------------------------------===// + +defm V_FMA_DX9_ZERO_F32 : VOP3_Real_with_name_gfx11<0x209, "V_FMA_LEGACY_F32", "v_fma_dx9_zero_f32">; +defm V_MAD_I32_I24 : VOP3_Realtriple_gfx11<0x20a>; +defm V_MAD_U32_U24 : VOP3_Realtriple_gfx11<0x20b>; +defm V_CUBEID_F32 : VOP3_Realtriple_gfx11<0x20c>; +defm V_CUBESC_F32 : VOP3_Realtriple_gfx11<0x20d>; +defm V_CUBETC_F32 : VOP3_Realtriple_gfx11<0x20e>; +defm V_CUBEMA_F32 : VOP3_Realtriple_gfx11<0x20f>; +defm V_BFE_U32 : VOP3_Realtriple_gfx11<0x210>; +defm V_BFE_I32 : VOP3_Realtriple_gfx11<0x211>; +defm V_BFI_B32 : VOP3_Realtriple_gfx11<0x212>; +defm V_FMA_F32 : VOP3_Realtriple_gfx11<0x213>; +defm V_FMA_F64 : VOP3_Real_Base_gfx11<0x214>; +defm V_LERP_U8 : VOP3_Realtriple_gfx11<0x215>; +defm V_ALIGNBIT_B32 : VOP3_Realtriple_gfx11<0x216>; +defm V_ALIGNBYTE_B32 : VOP3_Realtriple_gfx11<0x217>; +defm V_MULLIT_F32 : VOP3_Realtriple_gfx11<0x218>; +defm V_MIN3_F32 : VOP3_Realtriple_gfx11<0x219>; +defm V_MIN3_I32 : VOP3_Realtriple_gfx11<0x21a>; +defm V_MIN3_U32 : VOP3_Realtriple_gfx11<0x21b>; +defm V_MAX3_F32 : VOP3_Realtriple_gfx11<0x21c>; +defm V_MAX3_I32 : VOP3_Realtriple_gfx11<0x21d>; +defm V_MAX3_U32 : VOP3_Realtriple_gfx11<0x21e>; +defm V_MED3_F32 : VOP3_Realtriple_gfx11<0x21f>; +defm V_MED3_I32 : VOP3_Realtriple_gfx11<0x220>; +defm V_MED3_U32 : VOP3_Realtriple_gfx11<0x221>; +defm V_SAD_U8 : VOP3_Realtriple_gfx11<0x222>; +defm V_SAD_HI_U8 : VOP3_Realtriple_gfx11<0x223>; +defm V_SAD_U16 : VOP3_Realtriple_gfx11<0x224>; +defm V_SAD_U32 : VOP3_Realtriple_gfx11<0x225>; +defm V_CVT_PK_U8_F32 : VOP3_Realtriple_gfx11<0x226>; +defm V_DIV_FIXUP_F32 : VOP3_Real_Base_gfx11<0x227>; +defm V_DIV_FIXUP_F64 : VOP3_Real_Base_gfx11<0x228>; +defm V_DIV_FMAS_F32 : VOP3_Real_Base_gfx11<0x237>; +defm V_DIV_FMAS_F64 : VOP3_Real_Base_gfx11<0x238>; +defm V_MSAD_U8 : VOP3_Realtriple_gfx11<0x239>; +defm V_QSAD_PK_U16_U8 : VOP3_Real_Base_gfx11<0x23a>; +defm V_MQSAD_PK_U16_U8 : VOP3_Real_Base_gfx11<0x23b>; +defm V_MQSAD_U32_U8 : VOP3_Real_Base_gfx11<0x23d>; +defm V_XOR3_B32 : VOP3_Realtriple_gfx11<0x240>; +defm V_MAD_U16 : VOP3_Realtriple_with_name_gfx11<0x241, "V_MAD_U16_gfx9", "v_mad_u16">; +defm V_PERM_B32 : VOP3_Realtriple_gfx11<0x244>; +defm V_XAD_U32 : VOP3_Realtriple_gfx11<0x245>; +defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11<0x246>; +defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11<0x247>; +defm V_FMA_F16 : VOP3_Realtriple_with_name_gfx11<0x248, "V_FMA_F16_gfx9", "v_fma_f16">; +defm V_MIN3_F16 : VOP3_Realtriple_gfx11<0x249>; +defm V_MIN3_I16 : VOP3_Realtriple_gfx11<0x24a>; +defm V_MIN3_U16 : VOP3_Realtriple_gfx11<0x24b>; +defm V_MAX3_F16 : VOP3_Realtriple_gfx11<0x24c>; +defm V_MAX3_I16 : VOP3_Realtriple_gfx11<0x24d>; +defm V_MAX3_U16 : VOP3_Realtriple_gfx11<0x24e>; +defm V_MED3_F16 : 
VOP3_Realtriple_gfx11<0x24f>; +defm V_MED3_I16 : VOP3_Realtriple_gfx11<0x250>; +defm V_MED3_U16 : VOP3_Realtriple_gfx11<0x251>; +defm V_MAD_I16 : VOP3_Realtriple_with_name_gfx11<0x253, "V_MAD_I16_gfx9", "v_mad_i16">; +defm V_DIV_FIXUP_F16 : VOP3_Realtriple_with_name_gfx11<0x254, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">; +defm V_ADD3_U32 : VOP3_Realtriple_gfx11<0x255>; +defm V_LSHL_OR_B32 : VOP3_Realtriple_gfx11<0x256>; +defm V_AND_OR_B32 : VOP3_Realtriple_gfx11<0x257>; +defm V_OR3_B32 : VOP3_Realtriple_gfx11<0x258>; +defm V_MAD_U32_U16 : VOP3_Realtriple_gfx11<0x259>; +defm V_MAD_I32_I16 : VOP3_Realtriple_gfx11<0x25a>; +defm V_PERMLANE16_B32 : VOP3_Real_Base_gfx11<0x25b>; +defm V_PERMLANEX16_B32 : VOP3_Real_Base_gfx11<0x25c>; +defm V_MAXMIN_F32 : VOP3_Realtriple_gfx11<0x25e>; +defm V_MINMAX_F32 : VOP3_Realtriple_gfx11<0x25f>; +defm V_MAXMIN_F16 : VOP3_Realtriple_gfx11<0x260>; +defm V_MINMAX_F16 : VOP3_Realtriple_gfx11<0x261>; +defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11<0x262>; +defm V_MINMAX_U32 : VOP3_Realtriple_gfx11<0x263>; +defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11<0x264>; +defm V_MINMAX_I32 : VOP3_Realtriple_gfx11<0x265>; +// FIXME VOP3 DPP Dot instructions are unsupported +defm V_DOT2_F16_F16 : VOP3_Real_Base_gfx11<0x266>; +defm V_DOT2_BF16_BF16 : VOP3_Real_Base_gfx11<0x267>; +defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">; +defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">; +defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">; +defm V_MAD_I64_I32_gfx11 : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_gfx11", "v_mad_i64_i32">; +defm V_ADD_NC_U16 : VOP3Only_Realtriple_gfx11<0x303>; +defm V_SUB_NC_U16 : VOP3Only_Realtriple_gfx11<0x304>; +defm V_MUL_LO_U16 : VOP3Only_Realtriple_gfx11<0x305>; +defm V_CVT_PK_I16_F32 : VOP3_Realtriple_gfx11<0x306>; +defm V_CVT_PK_U16_F32 : VOP3_Realtriple_gfx11<0x307>; +defm V_MAX_U16 : VOP3Only_Realtriple_gfx11<0x309>; +defm V_MAX_I16 : VOP3Only_Realtriple_gfx11<0x30a>; +defm V_MIN_U16 : VOP3Only_Realtriple_gfx11<0x30b>; +defm V_MIN_I16 : VOP3Only_Realtriple_gfx11<0x30c>; +defm V_ADD_NC_I16 : VOP3_Realtriple_with_name_gfx11<0x30d, "V_ADD_I16", "v_add_nc_i16">; +defm V_SUB_NC_I16 : VOP3_Realtriple_with_name_gfx11<0x30e, "V_SUB_I16", "v_sub_nc_i16">; +defm V_PACK_B32_F16 : VOP3_Realtriple_gfx11<0x311>; +defm V_CVT_PK_NORM_I16_F16 : VOP3_Realtriple_with_name_gfx11<0x312, "V_CVT_PKNORM_I16_F16" , "v_cvt_pk_norm_i16_f16" >; +defm V_CVT_PK_NORM_U16_F16 : VOP3_Realtriple_with_name_gfx11<0x313, "V_CVT_PKNORM_U16_F16" , "v_cvt_pk_norm_u16_f16" >; +defm V_SUB_NC_I32 : VOP3_Realtriple_with_name_gfx11<0x325, "V_SUB_I32", "v_sub_nc_i32">; +defm V_ADD_NC_I32 : VOP3_Realtriple_with_name_gfx11<0x326, "V_ADD_I32", "v_add_nc_i32">; +defm V_ADD_F64 : VOP3_Real_Base_gfx11<0x327>; +defm V_MUL_F64 : VOP3_Real_Base_gfx11<0x328>; +defm V_MIN_F64 : VOP3_Real_Base_gfx11<0x329>; +defm V_MAX_F64 : VOP3_Real_Base_gfx11<0x32a>; +defm V_LDEXP_F64 : VOP3_Real_Base_gfx11<0x32b>; +defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11<0x32c>; +defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11<0x32d>; +defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11<0x32e>; +defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11<0x32f>; +defm V_LSHLREV_B16 : VOP3Only_Realtriple_gfx11<0x338>; +defm V_LSHRREV_B16 : VOP3Only_Realtriple_gfx11<0x339>; +defm V_ASHRREV_I16 : VOP3Only_Realtriple_gfx11<0x33a>; +defm V_LSHLREV_B64 : VOP3_Real_Base_gfx11<0x33c>; +defm V_LSHRREV_B64 : VOP3_Real_Base_gfx11<0x33d>; +defm V_ASHRREV_I64 : 
VOP3_Real_Base_gfx11<0x33e>; +defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx11<0x360>; // Pseudo in VOP2 +let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { + defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx11<0x361>; // Pseudo in VOP2 +} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) +defm V_AND_B16 : VOP3Only_Realtriple_gfx11<0x362>; +defm V_OR_B16 : VOP3Only_Realtriple_gfx11<0x363>; +defm V_XOR_B16 : VOP3Only_Realtriple_gfx11<0x364>; + +//===----------------------------------------------------------------------===// // GFX10. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass VOP3_Real_gfx10<bits<10> op> { def _gfx10 : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>, @@ -867,7 +943,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { let AsmString = asmName # ps.AsmOperands; } } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; @@ -935,10 +1011,11 @@ defm V_MAD_I16 : defm V_DIV_FIXUP_F16 : VOP3OpSel_Real_gfx10_with_name<0x35f, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">; +defm V_ADD_NC_U16 : VOP3OpSel_Real_gfx10<0x303>; +defm V_SUB_NC_U16 : VOP3OpSel_Real_gfx10<0x304>; + // FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these // (they do not support SDWA or DPP). -defm V_ADD_NC_U16 : VOP3_Real_gfx10_with_name<0x303, "V_ADD_U16", "v_add_nc_u16">; -defm V_SUB_NC_U16 : VOP3_Real_gfx10_with_name<0x304, "V_SUB_U16", "v_sub_nc_u16">; defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16", "v_mul_lo_u16">; defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16", "v_lshrrev_b16">; defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16", "v_ashrrev_i16">; @@ -1273,3 +1350,5 @@ defm V_MAD_I32_I16 : VOP3OpSel_Real_gfx9 <0x1f2>; defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx9 <0x299>; defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>; + +defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 707475ceccee..59ce532af59b 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -10,19 +10,33 @@ // VOP3P Classes //===----------------------------------------------------------------------===// +class VOP3P_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR, + bit HasDPP = 0> : VOP3_Profile<P, Features> { + let IsVOP3P = 1; + let HasExtVOP3DPP = HasDPP; + // We do not want to print src modifiers for vop3p because the bits are + // overloaded in meaning and the logic in printOperandAndFPInputMods is + // wrong for vop3p + let AsmVOP3DPPBase = AsmVOP3P; +} + // Used for FMA_MIX* and MAD_MIX* insts // Their operands are only sort of f16 operands. Depending on // op_sel_hi, these may be interpreted as f32. The inline immediate // values are really f16 converted to f32, so we treat these as f16 // operands. 
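For readers unfamiliar with the mix instructions referenced above: each 32-bit source can be consumed either whole, as f32, or as one f16 half selected by the opsel bits, and inline immediates are f16 values converted to f32. A rough model of the source selection (bit naming here is illustrative, not the encoding):

#include <cstdint>

// Returns the raw bits one mix-instruction source contributes: the whole
// register when it is read as f32, or the selected 16-bit half when it is
// read as f16 (the f16->f32 conversion before the FMA is elided).
uint32_t readMixSrcBits(uint32_t reg, bool isF16, bool useHiHalf) {
  if (!isF16)
    return reg;
  return useHiHalf ? (reg >> 16) : (reg & 0xffff);
}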
class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR, - bit useTiedOutput = 0> : VOP3_Profile<P, Features> { + bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> { bit UseTiedOutput = useTiedOutput; dag srcs = (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0, FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + dag dpp_srcs = + (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, + FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, + FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); // FIXME: clampmod0 misbehaves with the non-default vdst_in // following it. For now workaround this by requiring clamp @@ -35,19 +49,27 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR, // We use Ins64 because that is the one which populates InOperandList // due to the logic in class VOP3_Pseudo let Ins64 = !con(srcs, mods); + let InsVOP3Base = !con(dpp_srcs, mods); let Asm64 = "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp"; + let AsmVOP3DPPBase = Asm64; } multiclass VOP3PInst<string OpName, VOPProfile P, - SDPatternOperator node = null_frag, bit HasExplicitClamp = 0> { + SDPatternOperator node = null_frag, bit IsDOT = 0> { def NAME : VOP3P_Pseudo<OpName, P, !if (P.HasModifiers, - getVOP3PModPat<P, node, HasExplicitClamp>.ret, + getVOP3PModPat<P, node, IsDOT, IsDOT>.ret, getVOP3Pat<P, node>.ret)>; + let SubtargetPredicate = isGFX11Plus in { + if P.HasExtVOP3DPP then + def _dpp : VOP3_DPP_Pseudo<OpName, P> { + let VOP3P = 1; + let PseudoInstr = OpName #"_dpp"; + } + } // end SubtargetPredicate = isGFX11Plus } - // Non-packed instructions that use the VOP3P encoding. // VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed. multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> { @@ -55,37 +77,47 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> { let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", ""); let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", ""); } + let SubtargetPredicate = isGFX11Plus in { + if P.HasExtVOP3DPP then + def _dpp : VOP3_DPP_Pseudo<OpName, P> { + let VOP3P = 1; + let PseudoInstr = OpName#"_dpp"; + let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", ""); + let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", ""); + } + } // end SubtargetPredicate = isGFX11Plus } +let isReMaterializable = 1 in { let isCommutable = 1 in { -defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; -defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; +defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; +defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; let FPDPRounding = 1 in { -defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>; -defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>; -defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>; +defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>; +defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>; +defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>; } // End FPDPRounding = 1 -defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>; -defm 
V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>; +defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>; +defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>; -defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>; -defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; -defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>; +defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, add>; +defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>>; +defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, mul>; -defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>; -defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>; -defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>; -defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>; +defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, smin>; +defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umin>; +defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, smax>; +defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umax>; } -defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; -defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>; - -defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, clshl_rev_16>; -defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, cashr_rev_16>; -defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, clshr_rev_16>; +defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>>; +defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, sub>; +defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, clshl_rev_16>; +defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, cashr_rev_16>; +defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, clshr_rev_16>; +} // End isReMaterializable = 1 let SubtargetPredicate = HasVOP3PInsts in { @@ -178,6 +210,7 @@ let SubtargetPredicate = HasMadMixInsts in { // Size of src arguments (16/32) is controlled by op_sel. // For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi. 
let isCommutable = 1, mayRaiseFPException = 0 in { +let isReMaterializable = 1 in defm V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>; let FPDPRounding = 1 in { @@ -197,6 +230,8 @@ defm : MadFmaMixPats<fmad, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>; // Essentially the same as the mad_mix versions let SubtargetPredicate = HasFmaMixInsts in { let isCommutable = 1 in { + +let isReMaterializable = 1 in defm V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>; let FPDPRounding = 1 in { @@ -297,34 +332,63 @@ let IsDOT = 1 in { let SubtargetPredicate = HasDot2Insts in { defm V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", - VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>; + VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>; defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", - VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>; + VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>; } // End SubtargetPredicate = HasDot2Insts let SubtargetPredicate = HasDot7Insts in { defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", - VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, + VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR, /*HasDPP*/ 1>, AMDGPUfdot2, 1/*ExplicitClamp*/>; defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", - VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>; + VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>; defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", - VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>; + VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>; } // End SubtargetPredicate = HasDot7Insts let SubtargetPredicate = HasDot1Insts in { defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", - VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>; + VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>; defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", - VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>; + VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>; } // End SubtargetPredicate = HasDot1Insts + +let SubtargetPredicate = HasDot8Insts in { + +defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16", + VOP3P_Profile<VOP_F32_V2I16_V2I16_F32, VOP3_REGULAR, /*HasDPP*/ 1>, + int_amdgcn_fdot2_f32_bf16, 1>; + +} // End SubtargetPredicate = HasDot8Insts + } // End let IsDOT = 1 +multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> { + let IsDOT = 1 in + defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, + null_frag, 1>; + // Dot-iu instructions consider input as signed if imod neg bits are set. Thus + // Dot-iu Intrinsics have extra operands and require separate codegen pattern. 
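The comment above is the key to the extra operands: for the dot-iu instructions the per-source neg modifier bit reinterprets that packed operand as signed, so each source is independently signed or unsigned, and the intrinsic carries one signedness flag per source. A sketch of v_dot4_i32_iu8 under that reading (illustrative only):

#include <cstdint>

// Each byte of a and b is sign- or zero-extended according to its
// source's signedness bit before the multiply-accumulate; clamping of
// the final result is elided.
int32_t dot4_iu8(uint32_t a, bool aSigned, uint32_t b, bool bSigned,
                 int32_t acc) {
  int64_t sum = acc;
  for (int i = 0; i < 4; ++i) {
    uint8_t ab = (a >> (8 * i)) & 0xff;
    uint8_t bb = (b >> (8 * i)) & 0xff;
    int32_t av = aSigned ? (int32_t)(int8_t)ab : (int32_t)ab;
    int32_t bv = bSigned ? (int32_t)(int8_t)bb : (int32_t)bb;
    sum += (int64_t)av * bv;
  }
  return (int32_t)sum;
}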
+ def : GCNPat < (intrinsic_node (DotIUVOP3PMods i32:$src0_mods), i32:$src0, + (DotIUVOP3PMods i32:$src1_mods), i32:$src1, + i32:$src2, (i1 timm:$clamp)), + (!cast<Instruction>(NAME) $src0_mods, i32:$src0, + $src1_mods, i32:$src1, + (i32 8), i32:$src2, i1:$clamp) + >; +} + +let SubtargetPredicate = HasDot8Insts in { +defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>; +defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>; +} // End SubtargetPredicate = HasDot8Insts + def : UDot2Pat<V_DOT2_U32_U16>; def : SDot2Pat<V_DOT2_I32_I16>; @@ -365,18 +429,18 @@ def VDst_256 : VOPDstOperand<VReg_256>; def VDst_512 : VOPDstOperand<VReg_512>; def VDst_1024 : VOPDstOperand<VReg_1024>; -def VOPProfileAccRead : VOP3_Profile<VOP_I32_I32, VOP3_MAI> { +def VOPProfileAccRead : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> { let Src0RC64 = ARegSrc_32; } -def VOPProfileAccWrite : VOP3_Profile<VOP_I32_I32, VOP3_MAI> { +def VOPProfileAccWrite : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> { let DstRC = ADst_32; - let Src0RC64 = VISrc_b32; + let Src0RC64 = VCSrc_b32; } class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC, RegisterOperand SrcABRC = AVSrc_32> - : VOP3_Profile<P, VOP3_MAI> { + : VOP3P_Profile<P, VOP3_MAI> { let DstRC = _DstRC; let Src0RC64 = SrcABRC; let Src1RC64 = SrcABRC; @@ -387,15 +451,27 @@ class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC let HasOMod = 0; let HasModifiers = 0; let Asm64 = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp"; + let AsmVOP3DPPBase = Asm64; let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp); + let InsVOP3Base = Ins64; // Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs. // We then create two versions of the instruction: with tied dst and src2 - // and with the eralyclobber flag on the dst. This is strciter than the + // and with the earlyclobber flag on the dst. This is stricter than the // actual HW restriction. In particular earlyclobber also affects src0 and // src1 allocation which is not required. 
bit NoDstOverlap = !gt(DstVT.Size, 128); } +class VOPProfileSMFMAC<VOPProfile P, RegisterOperand _DstRC, + RegisterOperand _SrcARC, RegisterOperand _SrcBRC> + : VOPProfileMAI<P, _DstRC, _DstRC, _SrcARC> { + let Src1RC64 = _SrcBRC; + let Src2VT = DstVT; + let Asm64 = " $vdst, $src0, $src1, $idx$cbsz$abid"; + let Outs64 = (outs DstRC:$vdst); + let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, VRegSrc_32:$idx, cbsz:$cbsz, abid:$abid, Src2RC64:$src2); +} + def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, AISrc_128_f32, ADst_128>; def VOPProfileMAI_F32_F32_X16 : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, AISrc_512_f32, ADst_512>; def VOPProfileMAI_F32_F32_X32 : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32, AISrc_1024_f32, ADst_1024>; @@ -413,6 +489,10 @@ def VOPProfileMAI_F32_V4I16_X16 : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, A def VOPProfileMAI_F32_V4I16_X32 : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>; def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, AISrc_256_f64, ADst_256, AVSrc_64>; def VOPProfileMAI_F64_4X4X4F64 : VOPProfileMAI<VOP_F64_F64_F64_F64, AISrc_64_f64, ADst_64, AVSrc_64>; +def VOPProfileMAI_I32_I64_X16 : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, AISrc_128_b32, ADst_128, AVSrc_64>; +def VOPProfileMAI_I32_I64_X32 : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, AISrc_512_b32, ADst_512, AVSrc_64>; +def VOPProfileMAI_F32_V2F32_X16 : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>; +def VOPProfileMAI_F32_V2F32_X32 : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>; def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, VISrc_128_f32, VDst_128>; def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, VISrc_512_f32, VDst_512>; @@ -431,12 +511,37 @@ def VOPProfileMAI_F32_V4I16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F def VOPProfileMAI_F32_V4I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>; def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, VISrc_256_f64, VDst_256, AVSrc_64>; def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI<VOP_F64_F64_F64_F64, VISrc_64_f64, VDst_64, AVSrc_64>; +def VOPProfileMAI_I32_I64_X16_VCD : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, VISrc_128_b32, VDst_128, AVSrc_64>; +def VOPProfileMAI_I32_I64_X32_VCD : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, VISrc_512_b32, VDst_512, AVSrc_64>; +def VOPProfileMAI_F32_V2F32_X16_VCD : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>; +def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>; + +def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128, AVSrc_64, AVSrc_128>; +def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>; +def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128, AVSrc_64, AVSrc_128>; +def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>; +def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>; +def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>; class MFMATable <bit is_mac, string Name> { bit IsMac = is_mac; string FMAOp = Name; } +class 
MAIFrag<SDPatternOperator Op, code pred> : PatFrag < + (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$abid, node:$blgp), + (Op $src0, $src1, $src2, $cbsz, $abid, $blgp), + pred +>; + +let GISelPredicateCode = [{ return MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }] in +class AgprMAIFrag<SDPatternOperator Op> : + MAIFrag<Op, [{ return MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }]>; + +let GISelPredicateCode = [{ return !MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }] in +class VgprMAIFrag<SDPatternOperator Op> : + MAIFrag<Op, [{ return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }]>; + let Predicates = [HasMAIInsts] in { let isAsCheapAsAMove = 1, isReMaterializable = 1 in { @@ -446,47 +551,62 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1 in { } // End isMoveImm = 1 } // End isAsCheapAsAMove = 1, isReMaterializable = 1 +class MAIInst<string OpName, VOPProfile P, SDPatternOperator node> + : VOP3InstBase<OpName, P, node> { + Instruction Opcode = !cast<Instruction>(NAME); + bit is_dgemm = 0; + bit is_gfx940_xdl = 0; +} + multiclass MAIInst<string OpName, string P, SDPatternOperator node, bit NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap> { let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in { - defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), !if(NoDstOverlap, null_frag, node)>, - MFMATable<0, NAME # "_e64">; + def _e64 : MAIInst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), + !if(NoDstOverlap, null_frag, AgprMAIFrag<node>)>, + MFMATable<0, NAME # "_e64">; let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in - defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>, - MFMATable<0, NAME # "_vgprcd_e64">; + def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"), + !if(NoDstOverlap, null_frag, VgprMAIFrag<node>)>, + MFMATable<0, NAME # "_vgprcd_e64">; } foreach _ = BoolToList<NoDstOverlap>.ret in { let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""), isConvertibleToThreeAddress = NoDstOverlap, Mnemonic = OpName in { - defm "_mac" : VOP3Inst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>, - MFMATable<1, NAME # "_e64">; + def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), AgprMAIFrag<node>>, + MFMATable<1, NAME # "_e64">; let SubtargetPredicate = isGFX90APlus in - defm _mac_vgprcd : VOP3Inst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>, - MFMATable<1, NAME # "_vgprcd_e64">; + def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"), + VgprMAIFrag<node>>, + MFMATable<1, NAME # "_vgprcd_e64">; } } } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 } defm V_MFMA_F32_4X4X1F32 : MAIInst<"v_mfma_f32_4x4x1f32", "F32_F32_X4", int_amdgcn_mfma_f32_4x4x1f32>; -defm V_MFMA_F32_4X4X4F16 : MAIInst<"v_mfma_f32_4x4x4f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_4x4x4f16>; -defm V_MFMA_I32_4X4X4I8 : MAIInst<"v_mfma_i32_4x4x4i8", "I32_I32_X4", int_amdgcn_mfma_i32_4x4x4i8>; defm V_MFMA_F32_16X16X1F32 : MAIInst<"v_mfma_f32_16x16x1f32", "F32_F32_X16", int_amdgcn_mfma_f32_16x16x1f32>; defm V_MFMA_F32_16X16X4F32 : MAIInst<"v_mfma_f32_16x16x4f32", "F32_F32_X4", int_amdgcn_mfma_f32_16x16x4f32>; +defm 
V_MFMA_F32_32X32X1F32 : MAIInst<"v_mfma_f32_32x32x1f32", "F32_F32_X32", int_amdgcn_mfma_f32_32x32x1f32>; +defm V_MFMA_F32_32X32X2F32 : MAIInst<"v_mfma_f32_32x32x2f32", "F32_F32_X16", int_amdgcn_mfma_f32_32x32x2f32>; + +let is_gfx940_xdl = 1 in { +defm V_MFMA_F32_4X4X4F16 : MAIInst<"v_mfma_f32_4x4x4f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_4x4x4f16>; +defm V_MFMA_I32_4X4X4I8 : MAIInst<"v_mfma_i32_4x4x4i8", "I32_I32_X4", int_amdgcn_mfma_i32_4x4x4i8>; defm V_MFMA_F32_16X16X4F16 : MAIInst<"v_mfma_f32_16x16x4f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_16x16x4f16>; defm V_MFMA_F32_16X16X16F16 : MAIInst<"v_mfma_f32_16x16x16f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_16x16x16f16>; defm V_MFMA_I32_16X16X4I8 : MAIInst<"v_mfma_i32_16x16x4i8", "I32_I32_X16", int_amdgcn_mfma_i32_16x16x4i8>; -defm V_MFMA_F32_32X32X1F32 : MAIInst<"v_mfma_f32_32x32x1f32", "F32_F32_X32", int_amdgcn_mfma_f32_32x32x1f32>; -defm V_MFMA_F32_32X32X2F32 : MAIInst<"v_mfma_f32_32x32x2f32", "F32_F32_X16", int_amdgcn_mfma_f32_32x32x2f32>; defm V_MFMA_F32_32X32X4F16 : MAIInst<"v_mfma_f32_32x32x4f16", "F32_V4F16_X32", int_amdgcn_mfma_f32_32x32x4f16>; defm V_MFMA_F32_32X32X8F16 : MAIInst<"v_mfma_f32_32x32x8f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_32x32x8f16>; defm V_MFMA_I32_32X32X4I8 : MAIInst<"v_mfma_i32_32x32x4i8", "I32_I32_X32", int_amdgcn_mfma_i32_32x32x4i8>; +} + +let Predicates = [isGFX908orGFX90A] in { defm V_MFMA_I32_16X16X16I8 : MAIInst<"v_mfma_i32_16x16x16i8", "I32_I32_X4", int_amdgcn_mfma_i32_16x16x16i8>; defm V_MFMA_I32_32X32X8I8 : MAIInst<"v_mfma_i32_32x32x8i8", "I32_I32_X16", int_amdgcn_mfma_i32_32x32x8i8>; defm V_MFMA_F32_4X4X2BF16 : MAIInst<"v_mfma_f32_4x4x2bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_4x4x2bf16>; @@ -494,34 +614,314 @@ defm V_MFMA_F32_16X16X2BF16 : MAIInst<"v_mfma_f32_16x16x2bf16", "F32_V2I16_X16", defm V_MFMA_F32_16X16X8BF16 : MAIInst<"v_mfma_f32_16x16x8bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_16x16x8bf16>; defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2I16_X32", int_amdgcn_mfma_f32_32x32x2bf16>; defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_32x32x4bf16>; +} } // End SubtargetPredicate = HasMAIInsts let Predicates = [isGFX90APlus] in { + let is_gfx940_xdl = 1 in { defm V_MFMA_F32_32X32X4BF16_1K : MAIInst<"v_mfma_f32_32x32x4bf16_1k", "F32_V4I16_X32", int_amdgcn_mfma_f32_32x32x4bf16_1k>; defm V_MFMA_F32_16X16X4BF16_1K : MAIInst<"v_mfma_f32_16x16x4bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_16x16x4bf16_1k>; defm V_MFMA_F32_4X4X4BF16_1K : MAIInst<"v_mfma_f32_4x4x4bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_4x4x4bf16_1k>; defm V_MFMA_F32_32X32X8BF16_1K : MAIInst<"v_mfma_f32_32x32x8bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_32x32x8bf16_1k>; defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_16x16x16bf16_1k>; + } + let is_dgemm = 1 in { defm V_MFMA_F64_16X16X4F64 : MAIInst<"v_mfma_f64_16x16x4f64", "F64_16X16X4F64", int_amdgcn_mfma_f64_16x16x4f64>; defm V_MFMA_F64_4X4X4F64 : MAIInst<"v_mfma_f64_4x4x4f64", "F64_4X4X4F64", int_amdgcn_mfma_f64_4x4x4f64>; + } } // End Predicates = [isGFX90APlus] -let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in { - defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>; - defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>; - defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, 
VOP3_PACKED>, any_fadd>; - defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>; +let Predicates = [isGFX940Plus], is_gfx940_xdl = 1 in { + defm V_MFMA_I32_32X32X16I8 : MAIInst<"v_mfma_i32_32x32x16i8", "I32_I64_X32", int_amdgcn_mfma_i32_32x32x16_i8>; + defm V_MFMA_I32_16X16X32I8 : MAIInst<"v_mfma_i32_16x16x32i8", "I32_I64_X16", int_amdgcn_mfma_i32_16x16x32_i8>; + defm V_MFMA_F32_16X16X8XF32 : MAIInst<"v_mfma_f32_16x16x8xf32", "F32_V2F32_X16", int_amdgcn_mfma_f32_16x16x8_xf32>; + defm V_MFMA_F32_32X32X4XF32 : MAIInst<"v_mfma_f32_32x32x4xf32", "F32_V2F32_X32", int_amdgcn_mfma_f32_32x32x4_xf32>; +} // End Predicates = [isGFX940Plus], is_gfx940_xdl = 1 + +multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> { + let Constraints = "$vdst = $src2", DisableEncoding = "$src2", + isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1, is_gfx940_xdl = 1 in { + def _e64 : MAIInst<OpName, !cast<VOPProfileSMFMAC>("VOPProfileSMFMAC_" # P), node>; + } +} + +let SubtargetPredicate = isGFX940Plus in { +defm V_SMFMAC_F32_16X16X32_F16 : SMFMACInst<"v_smfmac_f32_16x16x32_f16", "F32_16X16X32_F16", int_amdgcn_smfmac_f32_16x16x32_f16>; +defm V_SMFMAC_F32_32X32X16_F16 : SMFMACInst<"v_smfmac_f32_32x32x16_f16", "F32_32X32X16_F16", int_amdgcn_smfmac_f32_32x32x16_f16>; +defm V_SMFMAC_F32_16X16X32_BF16 : SMFMACInst<"v_smfmac_f32_16x16x32_bf16", "F32_16X16X32_I16", int_amdgcn_smfmac_f32_16x16x32_bf16>; +defm V_SMFMAC_F32_32X32X16_BF16 : SMFMACInst<"v_smfmac_f32_32x32x16_bf16", "F32_32X32X16_I16", int_amdgcn_smfmac_f32_32x32x16_bf16>; +defm V_SMFMAC_I32_16X16X64_I8 : SMFMACInst<"v_smfmac_i32_16x16x64_i8", "I32_16X16X64_I8", int_amdgcn_smfmac_i32_16x16x64_i8>; +defm V_SMFMAC_I32_32X32X32_I8 : SMFMACInst<"v_smfmac_i32_32x32x32_i8", "I32_32X32X32_I8", int_amdgcn_smfmac_i32_32x32x32_i8>; +} + +def MAIInstInfoTable : GenericTable { + let FilterClass = "MAIInst"; + let CppTypeName = "MAIInstInfo"; + let Fields = [ + "Opcode", "is_dgemm", "is_gfx940_xdl" + ]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getMAIInstInfoHelper"; +} + +let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1, isReMaterializable = 1 in { + defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>; + defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>; + defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>; + defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>; } // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">; def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">; +class VOPProfileWMMA<VOPProfile P, string Suffix, RegisterOperand _Src01RC64, bit _HasClamp, bit _HasOpSel> : VOP3P_Profile<P> { + let DstRC = !if(!eq(Suffix, "_w32"), VDst_256, VDst_128); + let Src0RC64 = _Src01RC64; + let Src1RC64 = _Src01RC64; + let Src2RC64 = !if(!eq(Suffix, "_w32"), VISrc_256_f64, VISrc_128_f32); + let HasClamp = _HasClamp; + let HasOpSel = _HasOpSel; + let IsPacked = 1; + let IsWMMA = 1; +} + +def VOP_V8F32_V16F16_V16F16_V8F32 : VOPProfile <[v8f32, v16f16, v16f16, v8f32]>; +def VOP_V8F32_V16I16_V16I16_V8F32 : VOPProfile <[v8f32, v16i16, v16i16, v8f32]>; +def VOP_V16F16_V16F16_V16F16_V16F16 : VOPProfile <[v16f16, v16f16, v16f16, v16f16]>; +def VOP_V16I16_V16I16_V16I16_V16I16 : VOPProfile <[v16i16, 
v16i16, v16i16, v16i16]>; +def VOP_V8I32_V4I32_V4I32_V8I32 : VOPProfile <[v8i32, v4i32, v4i32, v8i32]>; +def VOP_V8I32_V2I32_V2I32_V8I32 : VOPProfile <[v8i32, v2i32, v2i32, v8i32]>; + +def VOP_V4F32_V16F16_V16F16_V4F32 : VOPProfile <[v4f32, v16f16, v16f16, v4f32]>; +def VOP_V4F32_V16I16_V16I16_V4F32 : VOPProfile <[v4f32, v16i16, v16i16, v4f32]>; +def VOP_V8F16_V16F16_V16F16_V8F16 : VOPProfile <[v8f16, v16f16, v16f16, v8f16]>; +def VOP_V8I16_V16I16_V16I16_V8I16 : VOPProfile <[v8i16, v16i16, v16i16, v8i16]>; +def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile <[v4i32, v4i32, v4i32, v4i32]>; +def VOP_V4I32_V2I32_V2I32_V4I32 : VOPProfile <[v4i32, v2i32, v2i32, v4i32]>; + + +class WMMAType <bits<2> val> { + bit hasClamp = val{0}; + bit hasOpsel = val{1}; +} + +def WMMARegular : WMMAType<0b00>; +def WMMAUIClamp : WMMAType<0b01>; +def WMMAOpSel : WMMAType<0b10>; + +class WMMARegularPat<Instruction Inst, SDPatternOperator node, VOPProfile P> : + GCNPat < (P.DstVT (node + (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)), + (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers)) + )), + (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, $src2_modifiers, P.Src2VT:$src2)) +>; + +class WMMAOpSelPat<Instruction Inst, SDPatternOperator node, VOPProfile P> : + GCNPat < (P.DstVT (node + (P.Src0VT P.Src0VT:$src0), + (P.Src1VT P.Src1VT:$src1), + (P.Src2VT P.Src2VT:$src2), (WMMAOpSelVOP3PMods i32:$src2_modifiers) + )), + (P.DstVT (Inst (i32 8), P.Src0VT:$src0, (i32 8), P.Src1VT:$src1, i32:$src2_modifiers, P.Src2VT:$src2)) +>; + +class WMMAUIClampPat<Instruction Inst, SDPatternOperator node, VOPProfile P> : + GCNPat < (P.DstVT (node + (DotIUVOP3PMods i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0), + (DotIUVOP3PMods i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1), + (P.Src2VT P.Src2VT:$src2), (i1 timm:$clamp) + )), + (P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp)) +>; + +class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> { + Instruction Opcode2Addr = TwoAddr; + Instruction Opcode3Addr = ThreeAddr; + Predicate WaveSizePredicate; +} + +def WMMAOpcode : GenericEnum { + let FilterClass = "VOP3P_Pseudo"; +} + +class WMMAMappingTable : GenericTable { + let FilterClass = "WMMAOpcodeMapping"; + let CppTypeName = "WMMAOpcodeMappingInfo"; + let Fields = ["Opcode2Addr", "Opcode3Addr"]; + string TypeOf_Opcode2Addr = "WMMAOpcode"; + string TypeOf_Opcode3Addr = "WMMAOpcode"; +} + +def WMMAOpcode2AddrMappingTable : WMMAMappingTable { + let PrimaryKey = ["Opcode2Addr"]; + let PrimaryKeyName = "getWMMAMappingInfoFrom2AddrOpcode"; +} + +def WMMAOpcode3AddrMappingTable : WMMAMappingTable { + let PrimaryKey = ["Opcode3Addr"]; + let PrimaryKeyName = "getWMMAMappingInfoFrom3AddrOpcode"; +} + +// The WMMA instruction has extra constraints: +// Matrices A and B cannot overlap with D. C cannot partially overlap with D, +// but it is OK for them to be the same (which is a typical case). +// +// We implement it as follows: +// 1) Map the intrinsic to the pseudo where D is tied to C ($vdst = $src2). +// 2) The pass twoaddressinstruction checks if src2 is live and if that is the case +// it converts the default pseudo to the pseudo where src2 is not the same as vdst. +// 3) @earlyclobber on the destination satisfies the constraint during RA. 
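+// A minimal sketch of the C++ interface the searchable-table backend is
+// expected to emit for the mapping tables above (names taken from the table
+// definitions; the exact emitted shape may differ):
+//   struct WMMAOpcodeMappingInfo { WMMAOpcode Opcode2Addr; WMMAOpcode Opcode3Addr; };
+//   const WMMAOpcodeMappingInfo *getWMMAMappingInfoFrom2AddrOpcode(WMMAOpcode Opcode2Addr);
+//   const WMMAOpcodeMappingInfo *getWMMAMappingInfoFrom3AddrOpcode(WMMAOpcode Opcode3Addr);
+// This lets the two-address-instruction pass map a tied _twoaddr pseudo to
+// its @earlyclobber _threeaddr twin when $src2 is still live, per step 2
+// above.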
+ +multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node = null_frag, RegisterOperand _Src01RC64 = VRegSrc_256, WMMAType Type> { + + defvar WMMAConstraints2Addr = "@earlyclobber $vdst,$vdst = $src2"; + defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; + + defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>; + if !eq(Suffix, "_w32") then { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in { + def _twoaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>; + } + let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in { + def _threeaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>; + } + } + def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w32), + !cast<Instruction>(NAME # _threeaddr_w32)>; + } else if !eq(Suffix, "_w64") then { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in { + def _twoaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>; + } + let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in { + def _threeaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>; + } + } + def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w64), + !cast<Instruction>(NAME # _threeaddr_w64)>; + } + + if !eq(Type, WMMAOpSel) then { + def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>; + } else if !eq(Type, WMMAUIClamp) then { + def : WMMAUIClampPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>; + } else { + def : WMMARegularPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>; + } +} + + +let WaveSizePredicate = isWave32 in { + defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>; + defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>; + defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>; + defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>; + defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>; + defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>; +} + +let WaveSizePredicate = isWave64 in { + defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>; + defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>; + defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>; + defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>; + defm 
V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>; + defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>; + +} + //===----------------------------------------------------------------------===// // Begin Real Encodings //===----------------------------------------------------------------------===// +class VOP3P_DPP16<bits<7> op, VOP_DPP_Pseudo ps, int subtarget, + string opName = ps.OpName> + : VOP3P_DPP<op, opName, ps.Pfl, 1>, SIMCInstr<ps.PseudoInstr, subtarget> { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let AssemblerPredicate = HasDPP16; + let SubtargetPredicate = HasDPP16; + let OtherPredicates = ps.OtherPredicates; +} + +class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName> + : VOP3P_DPP8<op, opName, ps.Pfl> { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let OtherPredicates = ps.OtherPredicates; +} + +//===----------------------------------------------------------------------===// +// GFX11. +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Plus, + DecoderNamespace = "GFX11" in { + + multiclass VOP3P_Real_gfx11<bits<7> op, string backing_ps_name = NAME, + string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> { + def _gfx11 : VOP3P_Real<!cast<VOP3P_Pseudo>(backing_ps_name), + SIEncodingFamily.GFX11, asmName>, + VOP3Pe_gfx11<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>; + } + + multiclass VOP3P_Real_dpp_gfx11<bits<7> op, string backing_ps_name = NAME, + string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> { + defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name); + def _dpp_gfx11 + : VOP3P_DPP16<op, !cast<VOP_DPP_Pseudo>(backing_ps_name #"_dpp"), + SIEncodingFamily.GFX11> { + let AsmString = asmName #ps.Pfl.AsmVOP3DPP16; + let DecoderNamespace = "DPPGFX11"; + } + } + + multiclass VOP3P_Real_dpp8_gfx11<bits<7> op, string backing_ps_name = NAME, + string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> { + defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name); + def _dpp8_gfx11 : VOP3P_DPP8_Base<op, ps> { + let AsmString = asmName #ps.Pfl.AsmVOP3DPP8; + let DecoderNamespace = "DPP8GFX11"; + } + } + + multiclass VOP3P_Realtriple_gfx11<bits<7> op, string backing_ps_name = NAME, + string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> + : VOP3P_Real_gfx11<op, backing_ps_name, asmName>, + VOP3P_Real_dpp_gfx11<op, backing_ps_name, asmName>, + VOP3P_Real_dpp8_gfx11<op, backing_ps_name, asmName>; +} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" + +defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11 <0x16>; +defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11 <0x18>; +defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11 <0x1a>; + +multiclass VOP3P_Real_WMMA <bits<7> op> { + let WaveSizePredicate = isWave32, DecoderNamespace = "GFX11" in { + defm _twoaddr_w32 : VOP3P_Real_gfx11 <op>; + } + let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX11" in { + defm _twoaddr_w64 : VOP3P_Real_gfx11 <op>; + } +} + +defm V_WMMA_F32_16X16X16_F16 : VOP3P_Real_WMMA <0x040>; +defm V_WMMA_F32_16X16X16_BF16 : VOP3P_Real_WMMA <0x041>; +defm V_WMMA_F16_16X16X16_F16 : VOP3P_Real_WMMA <0x042>; +defm V_WMMA_BF16_16X16X16_BF16 : VOP3P_Real_WMMA <0x043>; +defm V_WMMA_I32_16X16X16_IU8 : 
VOP3P_Real_WMMA <0x044>; +defm V_WMMA_I32_16X16X16_IU4 : VOP3P_Real_WMMA <0x045>; + //===----------------------------------------------------------------------===// // GFX8 (VI) //===----------------------------------------------------------------------===// @@ -557,15 +957,64 @@ multiclass VOP3P_Real_MFMA_gfx90a<bits<7> op> { VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64").Pfl, 0>; } // End AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" } +} + +multiclass VOP3P_Real_MFMA_gfx940_aliases<string NameFrom, string NameTo, string Op, + VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(Op # "_e64"), + VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(Op # "_vgprcd" # "_e64"), + VOPProfile Pfl_ACD = PS_ACD.Pfl, + VOPProfile Pfl_VCD = PS_VCD.Pfl> { + let Predicates = [isGFX940Plus] in { + foreach _ = BoolToList<!ne(NameFrom, NameTo)>.ret in { + def : InstAlias <NameTo # " " # PS_ACD.AsmOperands, + (!cast<VOP3P_Real>(Op # "_gfx940_acd") Pfl_ACD.DstRC:$vdst, + Pfl_ACD.Src0RC64:$src0, Pfl_ACD.Src1RC64:$src1, Pfl_ACD.Src2RC64:$src2, + cbsz:$cbsz, abid:$abid, blgp:$blgp)>, PredicateControl; + def : InstAlias <NameTo # " " # PS_VCD.AsmOperands, + (!cast<VOP3P_Real>(Op # "_gfx940_vcd") Pfl_VCD.DstRC:$vdst, + Pfl_VCD.Src0RC64:$src0, Pfl_VCD.Src1RC64:$src1, Pfl_VCD.Src2RC64:$src2, + cbsz:$cbsz, abid:$abid, blgp:$blgp)>, PredicateControl; + } + } // End Predicates = [isGFX940Plus] +} + +multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic, + VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"), + VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> { + let SubtargetPredicate = isGFX940Plus, + AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9", + AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in { + def _gfx940_acd : VOP3P_Real<PS_ACD, SIEncodingFamily.GFX940>, + VOP3Pe_MAI <op, PS_ACD.Pfl, 1>; + + def _gfx940_vcd : VOP3P_Real<PS_VCD, SIEncodingFamily.GFX940>, + VOP3Pe_MAI <op, PS_VCD.Pfl, 0>; + } // End AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" -multiclass VOP3P_Real_MFMA<bits<7> op> : - VOP3P_Real_MFMA_gfx90a <op> { + defm : VOP3P_Real_MFMA_gfx940_aliases<Name, PS_ACD.Mnemonic, NAME>; + + foreach _ = BoolToList<!ne(!subst("_1k", "", PS_ACD.Mnemonic), PS_ACD.Mnemonic)>.ret in + defm : VOP3P_Real_MFMA_gfx940_aliases<Name, !subst("_1k", "", PS_ACD.Mnemonic), NAME>; +} + +multiclass VOP3P_Real_MFMA<bits<7> op, string GFX940Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> : + VOP3P_Real_MFMA_gfx90a <op>, + VOP3P_Real_MFMA_gfx940 <op, GFX940Name> { def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> { let AssemblerPredicate = HasMAIInsts; let DecoderNamespace = "GFX8"; + let Constraints = ""; } } + +multiclass VOP3P_Real_SMFMAC<bits<7> op, string alias> { + def _gfx940 : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, + VOP3Pe_SMFMAC <op> { + let AssemblerPredicate = isGFX940Plus; + let DecoderNamespace = "GFX8"; + } + def : MnemonicAlias<alias, !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic>; } defm V_PK_MAD_I16 : VOP3P_Real_vi <0x00>; @@ -634,19 +1083,21 @@ let SubtargetPredicate = HasMAIInsts in { defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x58>; defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x59>; -defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MFMA <0x40>; -defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MFMA <0x41>; -defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MFMA <0x42>; -defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MFMA <0x44>; 
-defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MFMA <0x45>; -defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MFMA <0x48>; -defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MFMA <0x49>; -defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MFMA <0x4a>; -defm V_MFMA_F32_32X32X8F16 : VOP3P_Real_MFMA <0x4c>; -defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MFMA <0x4d>; -defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MFMA <0x50>; -defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MFMA <0x51>; -defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MFMA <0x52>; +defm V_MFMA_F32_32X32X1F32 : VOP3P_Real_MFMA <0x40, "v_mfma_f32_32x32x1_2b_f32">; +defm V_MFMA_F32_16X16X1F32 : VOP3P_Real_MFMA <0x41, "v_mfma_f32_16x16x1_4b_f32">; +defm V_MFMA_F32_4X4X1F32 : VOP3P_Real_MFMA <0x42, "v_mfma_f32_4x4x1_16b_f32">; +defm V_MFMA_F32_32X32X2F32 : VOP3P_Real_MFMA <0x44, "v_mfma_f32_32x32x2_f32">; +defm V_MFMA_F32_16X16X4F32 : VOP3P_Real_MFMA <0x45, "v_mfma_f32_16x16x4_f32">; +defm V_MFMA_F32_32X32X4F16 : VOP3P_Real_MFMA <0x48, "v_mfma_f32_32x32x4_2b_f16">; +defm V_MFMA_F32_16X16X4F16 : VOP3P_Real_MFMA <0x49, "v_mfma_f32_16x16x4_4b_f16">; +defm V_MFMA_F32_4X4X4F16 : VOP3P_Real_MFMA <0x4a, "v_mfma_f32_4x4x4_16b_f16">; +defm V_MFMA_F32_32X32X8F16 : VOP3P_Real_MFMA <0x4c, "v_mfma_f32_32x32x8_f16">; +defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MFMA <0x4d, "v_mfma_f32_16x16x16_f16">; +defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MFMA <0x50, "v_mfma_i32_32x32x4_2b_i8">; +defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MFMA <0x51, "v_mfma_i32_16x16x4_4b_i8">; +defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MFMA <0x52, "v_mfma_i32_4x4x4_16b_i8">; + +let SubtargetPredicate = isGFX908orGFX90A in { defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MFMA <0x55>; defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MFMA <0x54>; defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MFMA <0x68>; @@ -654,6 +1105,7 @@ defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MFMA <0x69>; defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MFMA <0x6b>; defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MFMA <0x6c>; defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MFMA <0x6d>; +} } // End SubtargetPredicate = HasMAIInsts @@ -665,6 +1117,27 @@ defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>; defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx90a <0x6e>; defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>; +defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">; +defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">; +defm V_MFMA_F32_16X16X8XF32 : VOP3P_Real_MFMA_gfx940 <0x3e, "v_mfma_f32_16x16x8_xf32">; +defm V_MFMA_F32_32X32X4XF32 : VOP3P_Real_MFMA_gfx940 <0x3f, "v_mfma_f32_32x32x4_xf32">; + +defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5d, "v_mfma_f32_32x32x4_2b_bf16">; +defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5e, "v_mfma_f32_16x16x4_4b_bf16">; +defm V_MFMA_F32_4X4X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5f, "v_mfma_f32_4x4x4_16b_bf16">; +defm V_MFMA_F32_32X32X8BF16_1K : VOP3P_Real_MFMA_gfx940 <0x60, "v_mfma_f32_32x32x8_bf16">; +defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx940 <0x61, "v_mfma_f32_16x16x16_bf16">; + +defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx940 <0x6e, "v_mfma_f64_16x16x4_f64">; +defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx940 <0x6f, "v_mfma_f64_4x4x4_4b_f64">; + +defm V_SMFMAC_F32_16X16X32_F16 : VOP3P_Real_SMFMAC <0x62, "v_smfmac_f32_16x16x32f16">; +defm V_SMFMAC_F32_32X32X16_F16 : VOP3P_Real_SMFMAC <0x64, "v_smfmac_f32_32x32x16f16">; +defm V_SMFMAC_F32_16X16X32_BF16 : VOP3P_Real_SMFMAC <0x66, "v_smfmac_f32_16x16x32bf16">; +defm V_SMFMAC_F32_32X32X16_BF16 : VOP3P_Real_SMFMAC <0x68, 
"v_smfmac_f32_32x32x16bf16">; +defm V_SMFMAC_I32_16X16X64_I8 : VOP3P_Real_SMFMAC <0x6a, "v_smfmac_i32_16x16x64i8">; +defm V_SMFMAC_I32_32X32X32_I8 : VOP3P_Real_SMFMAC <0x6c, "v_smfmac_i32_32x32x32i8">; + let SubtargetPredicate = HasPackedFP32Ops in { defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>; defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>; @@ -676,35 +1149,41 @@ let SubtargetPredicate = HasPackedFP32Ops in { // GFX10. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1 in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 in { multiclass VOP3P_Real_gfx10<bits<7> op> { def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>, VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>; } -} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1 +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 + +multiclass VOP3P_Real_gfx10_gfx11<bits<7> op> + : VOP3P_Real_gfx10<op>, VOP3P_Real_gfx11<op>; + +multiclass VOP3P_Real_gfx10_gfx11_Triple<bits<7> op> + : VOP3P_Real_gfx10<op>, VOP3P_Realtriple_gfx11<op>; -defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x00>; -defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x01>; -defm V_PK_ADD_I16 : VOP3P_Real_gfx10<0x02>; -defm V_PK_SUB_I16 : VOP3P_Real_gfx10<0x03>; -defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x04>; -defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x05>; -defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x06>; -defm V_PK_MAX_I16 : VOP3P_Real_gfx10<0x07>; -defm V_PK_MIN_I16 : VOP3P_Real_gfx10<0x08>; -defm V_PK_MAD_U16 : VOP3P_Real_gfx10<0x09>; -defm V_PK_ADD_U16 : VOP3P_Real_gfx10<0x0a>; -defm V_PK_SUB_U16 : VOP3P_Real_gfx10<0x0b>; -defm V_PK_MAX_U16 : VOP3P_Real_gfx10<0x0c>; -defm V_PK_MIN_U16 : VOP3P_Real_gfx10<0x0d>; -defm V_PK_FMA_F16 : VOP3P_Real_gfx10<0x0e>; -defm V_PK_ADD_F16 : VOP3P_Real_gfx10<0x0f>; -defm V_PK_MUL_F16 : VOP3P_Real_gfx10<0x10>; -defm V_PK_MIN_F16 : VOP3P_Real_gfx10<0x11>; -defm V_PK_MAX_F16 : VOP3P_Real_gfx10<0x12>; -defm V_FMA_MIX_F32 : VOP3P_Real_gfx10<0x20>; -defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10<0x21>; -defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x22>; +defm V_PK_MAD_I16 : VOP3P_Real_gfx10_gfx11<0x00>; +defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10_gfx11<0x01>; +defm V_PK_ADD_I16 : VOP3P_Real_gfx10_gfx11<0x02>; +defm V_PK_SUB_I16 : VOP3P_Real_gfx10_gfx11<0x03>; +defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10_gfx11<0x04>; +defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10_gfx11<0x05>; +defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10_gfx11<0x06>; +defm V_PK_MAX_I16 : VOP3P_Real_gfx10_gfx11<0x07>; +defm V_PK_MIN_I16 : VOP3P_Real_gfx10_gfx11<0x08>; +defm V_PK_MAD_U16 : VOP3P_Real_gfx10_gfx11<0x09>; +defm V_PK_ADD_U16 : VOP3P_Real_gfx10_gfx11<0x0a>; +defm V_PK_SUB_U16 : VOP3P_Real_gfx10_gfx11<0x0b>; +defm V_PK_MAX_U16 : VOP3P_Real_gfx10_gfx11<0x0c>; +defm V_PK_MIN_U16 : VOP3P_Real_gfx10_gfx11<0x0d>; +defm V_PK_FMA_F16 : VOP3P_Real_gfx10_gfx11<0x0e>; +defm V_PK_ADD_F16 : VOP3P_Real_gfx10_gfx11<0x0f>; +defm V_PK_MUL_F16 : VOP3P_Real_gfx10_gfx11<0x10>; +defm V_PK_MIN_F16 : VOP3P_Real_gfx10_gfx11<0x11>; +defm V_PK_MAX_F16 : VOP3P_Real_gfx10_gfx11<0x12>; +defm V_FMA_MIX_F32 : VOP3P_Real_gfx10_gfx11_Triple <0x20>; +defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x21>; +defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x22>; let SubtargetPredicate = HasDot2Insts in { @@ -715,9 +1194,9 @@ defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>; let SubtargetPredicate = HasDot7Insts in { -defm 
V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>; -defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x17>; -defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x19>; +defm V_DOT2_F32_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x13>; +defm V_DOT4_U32_U8 : VOP3P_Real_gfx10_gfx11 <0x17>; +defm V_DOT8_U32_U4 : VOP3P_Real_gfx10_gfx11 <0x19>; } // End SubtargetPredicate = HasDot7Insts diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index c0cc91029d11..eb6c54a45263 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -49,12 +49,36 @@ class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> { // an explicit $dst. class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, vt1, untyped]> { + // We want to exclude instructions with 64bit operands + let HasExtDPP = getHasVOP3DPP<DstVT, Src0VT, Src1VT, Src2VT>.ret; let Asm32 = "$src0, $src1"; + + let AsmDPP = !if (HasModifiers, + "$src0_modifiers, $src1_modifiers " + "$dpp_ctrl$row_mask$bank_mask$bound_ctrl", + "$src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"); + let AsmDPP8 = "$src0, $src1 $dpp8$fi"; + let AsmDPP16 = AsmDPP#"$fi"; + let InsDPP = getInsDPP<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP, + NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, + Src2ModDPP>.ret; + let InsDPP16 = getInsDPP16<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP, + NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, + Src2ModDPP>.ret; + let InsDPP8 = getInsDPP8<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP, + NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, + Src2ModDPP>.ret; + // The destination for 32-bit encoding is implicit. let HasDst32 = 0; // VOPC disallows dst_sel and dst_unused as they have no effect on destination let EmitDstSel = 0; let Outs64 = (outs VOPDstS64orS32:$sdst); + let OutsVOP3DPP = Outs64; + let OutsVOP3DPP8 = Outs64; + let InsVOP3DPP = getInsVOP3DPP<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret; + let InsVOP3DPP16 = getInsVOP3DPP16<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret; + let InsVOP3DPP8 = getInsVOP3DPP8<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret; list<SchedReadWrite> Schedule = sched; } @@ -62,12 +86,15 @@ class VOPC_NoSdst_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt0> : VOPC_Profile<sched, vt0, vt1> { let Outs64 = (outs ); + let OutsVOP3DPP = Outs64; + let OutsVOP3DPP8 = Outs64; let OutsSDWA = (outs ); let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm64 = !if(isFloatType<Src0VT>.ret, "$src0_modifiers, $src1_modifiers$clamp", "$src0, $src1"); + let AsmVOP3DPPBase = Asm64; let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; let EmitDst = 0; } @@ -100,8 +127,8 @@ class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[], VOPProfile Pfl = P; } -class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> : - InstSI <ps.OutOperandList, ps.InOperandList, ps.PseudoInstr # " " # ps.AsmOperands, []>, +class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily, string asm_name = ps.PseudoInstr> : + InstSI <ps.OutOperandList, ps.InOperandList, asm_name # " " # ps.AsmOperands, []>, SIMCInstr <ps.PseudoInstr, EncodingFamily> { let VALU = 1; @@ -133,8 +160,9 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : // This class is used only with VOPC instructions. 
Use $sdst for out operand class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, - string Asm32 = ps.Pfl.Asm32, VOPProfile p = ps.Pfl> : - InstAlias <ps.OpName#" "#Asm32, (inst)>, PredicateControl { + string Asm32 = ps.Pfl.Asm32, string real_name = ps.OpName, + VOPProfile p = ps.Pfl> : + InstAlias <real_name#" "#Asm32, (inst)>, PredicateControl { field bit isCompare; field bit isCommutable; @@ -167,27 +195,32 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, let SubtargetPredicate = AssemblerPredicate; } -multiclass VOPCInstAliases <string OpName, string Arch> { - def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"), - !cast<Instruction>(OpName#"_e32_"#Arch)>; +multiclass VOPCInstAliases <string old_name, string Arch, string real_name = old_name> { + def : VOPCInstAlias <!cast<VOP3_Pseudo>(old_name#"_e64"), + !cast<Instruction>(real_name#"_e32_"#Arch), + !cast<VOP3_Pseudo>(old_name#"_e64").Pfl.Asm32, + real_name>; let WaveSizePredicate = isWave32 in { - def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"), - !cast<Instruction>(OpName#"_e32_"#Arch), - "vcc_lo, "#!cast<VOP3_Pseudo>(OpName#"_e64").Pfl.Asm32>; + def : VOPCInstAlias <!cast<VOP3_Pseudo>(old_name#"_e64"), + !cast<Instruction>(real_name#"_e32_"#Arch), + "vcc_lo, "#!cast<VOP3_Pseudo>(old_name#"_e64").Pfl.Asm32, + real_name>; } let WaveSizePredicate = isWave64 in { - def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"), - !cast<Instruction>(OpName#"_e32_"#Arch), - "vcc, "#!cast<VOP3_Pseudo>(OpName#"_e64").Pfl.Asm32>; + def : VOPCInstAlias <!cast<VOP3_Pseudo>(old_name#"_e64"), + !cast<Instruction>(real_name#"_e32_"#Arch), + "vcc, "#!cast<VOP3_Pseudo>(old_name#"_e64").Pfl.Asm32, + real_name>; } } -multiclass VOPCXInstAliases <string OpName, string Arch> { - def : VOPCInstAlias <!cast<VOP3_Pseudo>(OpName#"_e64"), - !cast<Instruction>(OpName#"_e32_"#Arch)>; +multiclass VOPCXInstAliases <string old_name, string Arch, string real_name = old_name> { + def : VOPCInstAlias <!cast<VOP3_Pseudo>(old_name#"_e64"), + !cast<Instruction>(real_name#"_e32_"#Arch), + !cast<VOP3_Pseudo>(old_name#"_e64").Pfl.Asm32, + real_name>; } - class getVOPCPat64 <SDPatternOperator cond, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, [(set i1:$sdst, @@ -205,6 +238,11 @@ class VCMPXNoSDstTable <bit has_sdst, string Name> { string NoSDstOp = Name; } +class VCMPVCMPXTable <string Name> { + bit IsVCMPX = 0; + string VCMPOp = Name; +} + multiclass VOPC_Pseudos <string opName, VOPC_Profile P, SDPatternOperator cond = COND_NULL, @@ -213,7 +251,8 @@ multiclass VOPC_Pseudos <string opName, def _e32 : VOPC_Pseudo <opName, P>, Commutable_REV<revOp#"_e32", !eq(revOp, opName)>, - VCMPXNoSDstTable<1, opName#"_e32"> { + VCMPXNoSDstTable<1, opName#"_e32">, + VCMPVCMPXTable<opName#"_e32"> { let Defs = !if(DefExec, [VCC, EXEC], [VCC]); let SchedRW = P.Schedule; let isConvergent = DefExec; @@ -223,7 +262,8 @@ multiclass VOPC_Pseudos <string opName, def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>, - VCMPXNoSDstTable<1, opName#"_e64"> { + VCMPXNoSDstTable<1, opName#"_e64">, + VCMPVCMPXTable<opName#"_e64"> { let Defs = !if(DefExec, [EXEC], []); let SchedRW = P.Schedule; let isCompare = 1; @@ -237,6 +277,26 @@ multiclass VOPC_Pseudos <string opName, let isConvergent = DefExec; let isCompare = 1; } + + let SubtargetPredicate = isGFX11Plus in { + if P.HasExtDPP then + def _e32_dpp : VOP_DPP_Pseudo<opName, P> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = P.Schedule; + let 
isConvergent = DefExec; + let isCompare = 1; + let VOPC = 1; + let Constraints = ""; + } + if P.HasExtVOP3DPP then + def _e64_dpp : VOP3_DPP_Pseudo<opName, P> { + let Defs = !if(DefExec, [EXEC], []); + let SchedRW = P.Schedule; + let isCompare = 1; + let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $sdst", ""); + } + } // end SubtargetPredicate = isGFX11Plus + } let SubtargetPredicate = HasSdstCMPX in { @@ -248,23 +308,27 @@ multiclass VOPCX_Pseudos <string opName, def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, [], 0>, Commutable_REV<revOp#"_nosdst_e32", !eq(revOp, opName)>, - VCMPXNoSDstTable<0, opName#"_e32"> { + VCMPXNoSDstTable<0, opName#"_e32">, + VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e32")> { let Defs = [EXEC]; let SchedRW = P_NoSDst.Schedule; let isConvergent = 1; let isCompare = 1; let isCommutable = 1; let SubtargetPredicate = HasNoSdstCMPX; + let IsVCMPX = 1; } def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst>, Commutable_REV<revOp#"_nosdst_e64", !eq(revOp, opName)>, - VCMPXNoSDstTable<0, opName#"_e64"> { + VCMPXNoSDstTable<0, opName#"_e64">, + VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e64")> { let Defs = [EXEC]; let SchedRW = P_NoSDst.Schedule; let isCompare = 1; let isCommutable = 1; let SubtargetPredicate = HasNoSdstCMPX; + let IsVCMPX = 1; } foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in @@ -275,6 +339,25 @@ multiclass VOPCX_Pseudos <string opName, let isCompare = 1; let SubtargetPredicate = HasNoSdstCMPX; } + + let SubtargetPredicate = isGFX11Plus in { + if P.HasExtDPP then + def _nosdst_e32_dpp : VOP_DPP_Pseudo<opName#"_nosdst", P_NoSDst> { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let isConvergent = 1; + let isCompare = 1; + let VOPC = 1; + let Constraints = ""; + } + if P.HasExtVOP3DPP then + def _nosdst_e64_dpp : VOP3_DPP_Pseudo<opName#"_nosdst", P_NoSDst> { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let isCompare = 1; + let Constraints = ""; + } + } // end SubtargetPredicate = isGFX11Plus } } // End SubtargetPredicate = HasSdstCMPX @@ -626,8 +709,18 @@ defm V_CMPX_T_U64 : VOPCX_I64 <"v_cmpx_t_u64">; class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> : VOPC_Profile<sched, vt, i32> { + let AsmDPP = "$src0_modifiers, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let AsmDPP16 = AsmDPP#"$fi"; + let InsDPP = (ins VGPR_32:$old, FPVRegInputMods:$src0_modifiers, VGPR_32:$src0, VGPR_32:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsDPP16 = !con(InsDPP, (ins FI:$fi)); + // DPP8 forbids modifiers and can inherit from VOPC_Profile + let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + dag InsPartVOP3DPP = (ins Src0Mod:$src0_modifiers, VGPRSrc_32:$src0, VGPRSrc_32:$src1); + let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel), + (ins))); let Asm64 = "$sdst, $src0_modifiers, $src1"; + let AsmVOP3DPPBase = Asm64; let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, @@ -647,6 +740,7 @@ class VOPC_Class_NoSdst_Profile<list<SchedReadWrite> sched, ValueType vt> : Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm64 = "$src0_modifiers, $src1"; + let AsmVOP3DPPBase = Asm64; let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; let EmitDst = 0; } @@ -684,6 +778,24 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec, let SchedRW = p.Schedule; let 
isConvergent = DefExec; } + + let SubtargetPredicate = isGFX11Plus in { + if p.HasExtDPP then + def _e32_dpp : VOP_DPP_Pseudo<opName, p> { + let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]), + !if(DefVcc, [VCC], [])); + let SchedRW = p.Schedule; + let isConvergent = DefExec; + let VOPC = 1; + let Constraints = ""; + } + if p.HasExtVOP3DPP then + def _e64_dpp : VOP3_DPP_Pseudo<opName, p> { + let Defs = !if(DefExec, [EXEC], []); + let SchedRW = p.Schedule; + let Constraints = !if(p.NumSrcArgs, p.TieRegDPP # " = $sdst", ""); + } + } // end SubtargetPredicate = isGFX11Plus } let SubtargetPredicate = HasSdstCMPX in { @@ -714,6 +826,23 @@ multiclass VOPCX_Class_Pseudos <string opName, let isConvergent = 1; let SubtargetPredicate = HasNoSdstCMPX; } + + let SubtargetPredicate = isGFX11Plus in { + if P.HasExtDPP then + def _nosdst_e32_dpp : VOP_DPP_Pseudo<opName#"_nosdst", P_NoSDst> { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let isConvergent = 1; + let VOPC = 1; + let Constraints = ""; + } + if P.HasExtVOP3DPP then + def _nosdst_e64_dpp : VOP3_DPP_Pseudo<opName#"_nosdst", P_NoSDst> { + let Defs = [EXEC]; + let SchedRW = P_NoSDst.Schedule; + let Constraints = ""; + } + } // end SubtargetPredicate = isGFX11Plus } } // End SubtargetPredicate = HasSdstCMPX @@ -872,14 +1001,676 @@ defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_e64, f16>; defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_e64, f16>; //===----------------------------------------------------------------------===// +// DPP Encodings +//===----------------------------------------------------------------------===// + +// VOPC32 + +class VOPC_DPPe_Common<bits<8> op> : Enc64 { + bits<8> src1; + let Inst{16-9} = src1; + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; +} + +class VOPC_DPP_Base<bits<8> op, string OpName, VOPProfile P> + : VOP_DPP_Base<OpName, P, P.InsDPP16, " " #P.AsmDPP16>, + VOPC_DPPe_Common<op> { + bits<2> src0_modifiers; + bits<8> src0; + bits<2> src1_modifiers; + bits<9> dpp_ctrl; + bits<1> bound_ctrl; + bits<4> bank_mask; + bits<4> row_mask; + bit fi; + + let Inst{8-0} = 0xfa; + + let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, 0); + let Inst{48-40} = dpp_ctrl; + let Inst{50} = fi; + let Inst{51} = bound_ctrl; + let Inst{52} = !if (P.HasSrc0Mods, src0_modifiers{0}, 0); // src0_neg + let Inst{53} = !if (P.HasSrc0Mods, src0_modifiers{1}, 0); // src0_abs + let Inst{54} = !if (P.HasSrc1Mods, src1_modifiers{0}, 0); // src1_neg + let Inst{55} = !if (P.HasSrc1Mods, src1_modifiers{1}, 0); // src1_abs + let Inst{59-56} = bank_mask; + let Inst{63-60} = row_mask; + + let AsmMatchConverter = "cvtDPP"; + let VOPC = 1; +} + +class VOPC_DPP8_Base<bits<8> op, string OpName, VOPProfile P> + : VOP_DPP8_Base<OpName, P, P.InsDPP8, " " #P.AsmDPP8>, + VOPC_DPPe_Common<op> { + bits<8> src0; + bits<24> dpp8; + bits<9> fi; + + let Inst{8-0} = fi; + + let Inst{39-32} = !if (P.HasSrc0, src0{7-0}, 0); + let Inst{63-40} = dpp8{23-0}; + + let AsmMatchConverter = "cvtDPP8"; + let VOPC = 1; +} + +class VOPC_DPP16<bits<8> op, VOP_DPP_Pseudo ps, string opName = ps.OpName> + : VOPC_DPP_Base<op, opName, ps.Pfl> { + let AssemblerPredicate = HasDPP16; + let SubtargetPredicate = HasDPP16; + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let OtherPredicates = ps.OtherPredicates; + let Constraints = ps.Constraints; + let AsmMatchConverter = "cvtVOPCNoDstDPP"; +} + +class VOPC_DPP16_SIMC<bits<8> op, VOP_DPP_Pseudo ps, int subtarget, + string opName = ps.OpName> + : VOPC_DPP16<op, ps, 
opName>, SIMCInstr<ps.PseudoInstr, subtarget>; + +class VOPC_DPP8<bits<8> op, VOPC_Pseudo ps, string opName = ps.OpName> + : VOPC_DPP8_Base<op, opName, ps.Pfl> { + // Note ps is the non-dpp pseudo + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let OtherPredicates = ps.OtherPredicates; + let Constraints = ""; + let AsmMatchConverter = "cvtVOPCNoDstDPP8"; +} + +// VOPC64 + +class VOPC64_DPP_Base<bits<10> op, string OpName, VOPProfile P> + : VOP3_DPP_Base<OpName, P, 1>, VOP3_DPPe_Common<op, P> { + Instruction Opcode = !cast<Instruction>(NAME); + + bits<8> src0; + bits<9> dpp_ctrl; + bits<1> bound_ctrl; + bits<4> bank_mask; + bits<4> row_mask; + bit fi; + + let Inst{40-32} = 0xfa; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{80-72} = dpp_ctrl; + let Inst{82} = fi; + let Inst{83} = bound_ctrl; + // Inst{87-84} ignored by hw + let Inst{91-88} = bank_mask; + let Inst{95-92} = row_mask; + +} + +class VOPC64_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName> + : VOPC64_DPP_Base<op, opName, ps.Pfl> { + let AssemblerPredicate = HasDPP16; + let SubtargetPredicate = HasDPP16; + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let OtherPredicates = ps.OtherPredicates; + let Constraints = ps.Constraints; +} + +class VOPC64_DPP16_Dst<bits<10> op, VOP_DPP_Pseudo ps, + string opName = ps.OpName> + : VOPC64_DPP16<op, ps, opName> { + bits<8> sdst; + let Inst{7-0} = sdst; +} + +class VOPC64_DPP16_NoDst<bits<10> op, VOP_DPP_Pseudo ps, + string opName = ps.OpName> + : VOPC64_DPP16<op, ps, opName> { + let Inst{7-0} = ? ; + let AsmMatchConverter = "cvtVOPC64NoDstDPP"; +} + +class VOPC64_DPP8_Base<bits<10> op, string OpName, VOPProfile P> + : VOP3_DPP8_Base<OpName, P>, VOP3_DPPe_Common<op, P> { + Instruction Opcode = !cast<Instruction>(NAME); + + bits<8> src0; + bits<24> dpp8; + bits<9> fi; + + let Inst{40-32} = fi; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{95-72} = dpp8{23-0}; + +} + +class VOPC64_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> + : VOPC64_DPP8_Base<op, opName, ps.Pfl> { + // Note ps is the non-dpp pseudo + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let OtherPredicates = ps.OtherPredicates; +} + +class VOPC64_DPP8_Dst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> + : VOPC64_DPP8<op, ps, opName> { + bits<8> sdst; + let Inst{7-0} = sdst; + let Constraints = "$old = $sdst"; +} + +class VOPC64_DPP8_NoDst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> + : VOPC64_DPP8<op, ps, opName> { + let Inst{7-0} = ? ; + let AsmMatchConverter = "cvtVOPC64NoDstDPP8"; + let Constraints = ""; +} + +//===----------------------------------------------------------------------===// // Target-specific instruction encodings. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// +// GFX11. 
+//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Only in { + multiclass VOPC_Real_gfx11<bits<9> op> { + defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_e32"); + defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_e64"); + let DecoderNamespace = "GFX11" in { + def _e32_gfx11 : VOPC_Real<ps32, SIEncodingFamily.GFX11>, + VOPCe<op{7-0}>; + def _e64_gfx11 : VOP3_Real<ps64, SIEncodingFamily.GFX11>, + VOP3a_gfx11<{0, op}, ps64.Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. + bits<8> sdst; + let Inst{7-0} = sdst; + } + } // End DecoderNamespace = "GFX11" + + defm : VOPCInstAliases<NAME, "gfx11">; + + foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in { + defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e32" #"_dpp"); + defvar AsmDPP = ps32.Pfl.AsmDPP16; + let DecoderNamespace = "DPPGFX11" in { + def _e32_dpp_gfx11 : VOPC_DPP16_SIMC<op{7-0}, psDPP, + SIEncodingFamily.GFX11>; + def _e32_dpp_w32_gfx11 : VOPC_DPP16<op{7-0}, psDPP> { + let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp_w64_gfx11 : VOPC_DPP16<op{7-0}, psDPP> { + let AsmString = psDPP.OpName # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + defvar AsmDPP8 = ps32.Pfl.AsmDPP8; + let DecoderNamespace = "DPP8GFX11" in { + def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32>; + def _e32_dpp8_w32_gfx11 : VOPC_DPP8<op{7-0}, ps32> { + let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp8_w64_gfx11 : VOPC_DPP8<op{7-0}, ps32> { + let AsmString = ps32.OpName # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + } + foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in { + defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e64" #"_dpp"); + defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; + let DecoderNamespace = "DPPGFX11" in { + def _e64_dpp_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP>, + SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11>; + def _e64_dpp_w32_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP> { + let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp_w64_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP> { + let AsmString = psDPP.OpName # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; + let DecoderNamespace = "DPP8GFX11" in { + def _e64_dpp8_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64>; + def _e64_dpp8_w32_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64> { + let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp8_w64_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64> { + let AsmString = ps32.OpName # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + } + + } + + multiclass VOPC_Real_with_name_gfx11<bits<9> op, string OpName, + string asm_name> { + defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_e32"); + defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_e64"); + let DecoderNamespace = "GFX11" in { + def _e32_gfx11 : + // 32 and 64 bit forms of the instruction have _e32 and _e64 + // respectively appended to their assembly mnemonic. 
+ // _e64 is printed as part of the VOPDstS64orS32 operand, whereas + // the destination-less 32bit forms add it to the asmString here. + VOPC_Real<ps32, SIEncodingFamily.GFX11, asm_name#"_e32">, + VOPCe<op{7-0}>, + MnemonicAlias<ps32.Mnemonic, asm_name>, Requires<[isGFX11Plus]>; + def _e64_gfx11 : + VOP3_Real<ps64, SIEncodingFamily.GFX11, asm_name>, + VOP3a_gfx11<{0, op}, ps64.Pfl>, + MnemonicAlias<ps64.Mnemonic, asm_name>, Requires<[isGFX11Plus]> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. + bits<8> sdst; + let Inst{7-0} = sdst; + } + } // End DecoderNamespace = "GFX11" + + defm : VOPCInstAliases<OpName, "gfx11", NAME>; + + foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in { + defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e32" #"_dpp"); + defvar AsmDPP = ps32.Pfl.AsmDPP16; + let DecoderNamespace = "DPPGFX11" in { + def _e32_dpp_gfx11 : VOPC_DPP16_SIMC<op{7-0}, psDPP, + SIEncodingFamily.GFX11, asm_name>; + def _e32_dpp_w32_gfx11 + : VOPC_DPP16<op{7-0}, psDPP, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp_w64_gfx11 + : VOPC_DPP16<op{7-0}, psDPP, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + defvar AsmDPP8 = ps32.Pfl.AsmDPP8; + let DecoderNamespace = "DPP8GFX11" in { + def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32, asm_name>; + def _e32_dpp8_w32_gfx11 + : VOPC_DPP8<op{7-0}, ps32, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp8_w64_gfx11 + : VOPC_DPP8<op{7-0}, ps32, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + } + + foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in { + defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e64" #"_dpp"); + defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; + let DecoderNamespace = "DPPGFX11" in { + def _e64_dpp_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>, + SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11>; + def _e64_dpp_w32_gfx11 + : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp_w64_gfx11 + : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; + let DecoderNamespace = "DPP8GFX11" in { + def _e64_dpp8_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>; + def _e64_dpp8_w32_gfx11 + : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp8_w64_gfx11 + : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; + } + } + } + + } + + multiclass VOPCX_Real_gfx11<bits<9> op> { + defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_nosdst_e32"); + defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_nosdst_e64"); + let DecoderNamespace = "GFX11" in { + def _e32_gfx11 : + VOPC_Real<ps32, SIEncodingFamily.GFX11>, + VOPCe<op{7-0}> { + let AsmString = !subst("_nosdst", "", ps32.PseudoInstr) + # " " # ps32.AsmOperands; + } + def _e64_gfx11 : + 
VOP3_Real<ps64, SIEncodingFamily.GFX11>, + VOP3a_gfx11<{0, op}, ps64.Pfl> { + let Inst{7-0} = ?; // sdst + let AsmString = !subst("_nosdst", "", ps64.Mnemonic) + # "{_e64} " # ps64.AsmOperands; + } + } // End DecoderNamespace = "GFX11" + + defm : VOPCXInstAliases<NAME, "gfx11">; + + foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in { + defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e32" #"_dpp"); + defvar AsmDPP = ps32.Pfl.AsmDPP16; + let DecoderNamespace = "DPPGFX11" in { + def _e32_dpp_gfx11 + : VOPC_DPP16_SIMC<op{7-0}, psDPP, SIEncodingFamily.GFX11> { + let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP; + } + } + defvar AsmDPP8 = ps32.Pfl.AsmDPP8; + let DecoderNamespace = "DPP8GFX11" in { + def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32> { + let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8; + } + } + } + + foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in { + defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e64" #"_dpp"); + defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; + let DecoderNamespace = "DPPGFX11" in { + def _e64_dpp_gfx11 + : VOPC64_DPP16_NoDst<{0, op}, psDPP>, + SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11> { + let AsmString = !subst("_nosdst", "", psDPP.OpName) + # "{_e64_dpp} " # AsmDPP; + } + } + defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; + let DecoderNamespace = "DPP8GFX11" in { + def _e64_dpp8_gfx11 : VOPC64_DPP8_NoDst<{0, op}, ps64> { + let AsmString = !subst("_nosdst", "", ps64.OpName) + # "{_e64_dpp} " # AsmDPP8; + } + } + } + } + + multiclass VOPCX_Real_with_name_gfx11<bits<9> op, string OpName, + string asm_name> { + defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_nosdst_e32"); + defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_nosdst_e64"); + let DecoderNamespace = "GFX11" in { + def _e32_gfx11 + : VOPC_Real<ps32, SIEncodingFamily.GFX11, asm_name>, + MnemonicAlias<!subst("_nosdst", "", ps32.Mnemonic), asm_name>, + Requires<[isGFX11Plus]>, + VOPCe<op{7-0}> { + let AsmString = asm_name # "{_e32} " # ps32.AsmOperands; + } + def _e64_gfx11 + : VOP3_Real<ps64, SIEncodingFamily.GFX11, asm_name>, + MnemonicAlias<!subst("_nosdst", "", ps64.Mnemonic), asm_name>, + Requires<[isGFX11Plus]>, + VOP3a_gfx11<{0, op}, ps64.Pfl> { + let Inst{7-0} = ? 
; // sdst + let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; + } + } // End DecoderNamespace = "GFX11" + + defm : VOPCXInstAliases<OpName, "gfx11", NAME>; + + foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in { + defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e32"#"_dpp"); + let DecoderNamespace = "DPPGFX11" in { + def _e32_dpp_gfx11 : VOPC_DPP16_SIMC<op{7-0}, psDPP, + SIEncodingFamily.GFX11, asm_name>; + } + let DecoderNamespace = "DPP8GFX11" in { + def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32, asm_name>; + } + } + foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in { + defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e64"#"_dpp"); + defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; + let DecoderNamespace = "DPPGFX11" in { + def _e64_dpp_gfx11 + : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>, + SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11> { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP; + } + } + defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; + let DecoderNamespace = "DPP8GFX11" in { + def _e64_dpp8_gfx11 : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8; + } + } + } + + } +} // End AssemblerPredicate = isGFX11Only + +defm V_CMP_F_F16 : VOPC_Real_gfx11<0x000>; +defm V_CMP_LT_F16 : VOPC_Real_gfx11<0x001>; +defm V_CMP_EQ_F16 : VOPC_Real_gfx11<0x002>; +defm V_CMP_LE_F16 : VOPC_Real_gfx11<0x003>; +defm V_CMP_GT_F16 : VOPC_Real_gfx11<0x004>; +defm V_CMP_LG_F16 : VOPC_Real_gfx11<0x005>; +defm V_CMP_GE_F16 : VOPC_Real_gfx11<0x006>; +defm V_CMP_O_F16 : VOPC_Real_gfx11<0x007>; +defm V_CMP_U_F16 : VOPC_Real_gfx11<0x008>; +defm V_CMP_NGE_F16 : VOPC_Real_gfx11<0x009>; +defm V_CMP_NLG_F16 : VOPC_Real_gfx11<0x00a>; +defm V_CMP_NGT_F16 : VOPC_Real_gfx11<0x00b>; +defm V_CMP_NLE_F16 : VOPC_Real_gfx11<0x00c>; +defm V_CMP_NEQ_F16 : VOPC_Real_gfx11<0x00d>; +defm V_CMP_NLT_F16 : VOPC_Real_gfx11<0x00e>; +defm V_CMP_T_F16 : VOPC_Real_with_name_gfx11<0x00f, "V_CMP_TRU_F16", "v_cmp_t_f16">; +defm V_CMP_F_F32 : VOPC_Real_gfx11<0x010>; +defm V_CMP_LT_F32 : VOPC_Real_gfx11<0x011>; +defm V_CMP_EQ_F32 : VOPC_Real_gfx11<0x012>; +defm V_CMP_LE_F32 : VOPC_Real_gfx11<0x013>; +defm V_CMP_GT_F32 : VOPC_Real_gfx11<0x014>; +defm V_CMP_LG_F32 : VOPC_Real_gfx11<0x015>; +defm V_CMP_GE_F32 : VOPC_Real_gfx11<0x016>; +defm V_CMP_O_F32 : VOPC_Real_gfx11<0x017>; +defm V_CMP_U_F32 : VOPC_Real_gfx11<0x018>; +defm V_CMP_NGE_F32 : VOPC_Real_gfx11<0x019>; +defm V_CMP_NLG_F32 : VOPC_Real_gfx11<0x01a>; +defm V_CMP_NGT_F32 : VOPC_Real_gfx11<0x01b>; +defm V_CMP_NLE_F32 : VOPC_Real_gfx11<0x01c>; +defm V_CMP_NEQ_F32 : VOPC_Real_gfx11<0x01d>; +defm V_CMP_NLT_F32 : VOPC_Real_gfx11<0x01e>; +defm V_CMP_T_F32 : VOPC_Real_with_name_gfx11<0x01f, "V_CMP_TRU_F32", "v_cmp_t_f32">; +defm V_CMP_T_F64 : VOPC_Real_with_name_gfx11<0x02f, "V_CMP_TRU_F64", "v_cmp_t_f64">; +defm V_CMP_LT_I16 : VOPC_Real_gfx11<0x031>; +defm V_CMP_EQ_I16 : VOPC_Real_gfx11<0x032>; +defm V_CMP_LE_I16 : VOPC_Real_gfx11<0x033>; +defm V_CMP_GT_I16 : VOPC_Real_gfx11<0x034>; +defm V_CMP_NE_I16 : VOPC_Real_gfx11<0x035>; +defm V_CMP_GE_I16 : VOPC_Real_gfx11<0x036>; +defm V_CMP_LT_U16 : VOPC_Real_gfx11<0x039>; +defm V_CMP_EQ_U16 : VOPC_Real_gfx11<0x03a>; +defm V_CMP_LE_U16 : VOPC_Real_gfx11<0x03b>; +defm V_CMP_GT_U16 : VOPC_Real_gfx11<0x03c>; +defm V_CMP_NE_U16 : VOPC_Real_gfx11<0x03d>; +defm V_CMP_GE_U16 : VOPC_Real_gfx11<0x03e>; +defm V_CMP_F_I32 : VOPC_Real_gfx11<0x040>; +defm V_CMP_LT_I32 : VOPC_Real_gfx11<0x041>; +defm V_CMP_EQ_I32 : VOPC_Real_gfx11<0x042>; +defm V_CMP_LE_I32 : VOPC_Real_gfx11<0x043>; +defm V_CMP_GT_I32 
: VOPC_Real_gfx11<0x044>; +defm V_CMP_NE_I32 : VOPC_Real_gfx11<0x045>; +defm V_CMP_GE_I32 : VOPC_Real_gfx11<0x046>; +defm V_CMP_T_I32 : VOPC_Real_gfx11<0x047>; +defm V_CMP_F_U32 : VOPC_Real_gfx11<0x048>; +defm V_CMP_LT_U32 : VOPC_Real_gfx11<0x049>; +defm V_CMP_EQ_U32 : VOPC_Real_gfx11<0x04a>; +defm V_CMP_LE_U32 : VOPC_Real_gfx11<0x04b>; +defm V_CMP_GT_U32 : VOPC_Real_gfx11<0x04c>; +defm V_CMP_NE_U32 : VOPC_Real_gfx11<0x04d>; +defm V_CMP_GE_U32 : VOPC_Real_gfx11<0x04e>; +defm V_CMP_T_U32 : VOPC_Real_gfx11<0x04f>; + +defm V_CMP_F_I64 : VOPC_Real_gfx11<0x050>; +defm V_CMP_LT_I64 : VOPC_Real_gfx11<0x051>; +defm V_CMP_EQ_I64 : VOPC_Real_gfx11<0x052>; +defm V_CMP_LE_I64 : VOPC_Real_gfx11<0x053>; +defm V_CMP_GT_I64 : VOPC_Real_gfx11<0x054>; +defm V_CMP_NE_I64 : VOPC_Real_gfx11<0x055>; +defm V_CMP_GE_I64 : VOPC_Real_gfx11<0x056>; +defm V_CMP_T_I64 : VOPC_Real_gfx11<0x057>; +defm V_CMP_F_U64 : VOPC_Real_gfx11<0x058>; +defm V_CMP_LT_U64 : VOPC_Real_gfx11<0x059>; +defm V_CMP_EQ_U64 : VOPC_Real_gfx11<0x05a>; +defm V_CMP_LE_U64 : VOPC_Real_gfx11<0x05b>; +defm V_CMP_GT_U64 : VOPC_Real_gfx11<0x05c>; +defm V_CMP_NE_U64 : VOPC_Real_gfx11<0x05d>; +defm V_CMP_GE_U64 : VOPC_Real_gfx11<0x05e>; +defm V_CMP_T_U64 : VOPC_Real_gfx11<0x05f>; + +defm V_CMP_CLASS_F16 : VOPC_Real_gfx11<0x07d>; +defm V_CMP_CLASS_F32 : VOPC_Real_gfx11<0x07e>; +defm V_CMP_CLASS_F64 : VOPC_Real_gfx11<0x07f>; + +defm V_CMPX_F_F16 : VOPCX_Real_gfx11<0x080>; +defm V_CMPX_LT_F16 : VOPCX_Real_gfx11<0x081>; +defm V_CMPX_EQ_F16 : VOPCX_Real_gfx11<0x082>; +defm V_CMPX_LE_F16 : VOPCX_Real_gfx11<0x083>; +defm V_CMPX_GT_F16 : VOPCX_Real_gfx11<0x084>; +defm V_CMPX_LG_F16 : VOPCX_Real_gfx11<0x085>; +defm V_CMPX_GE_F16 : VOPCX_Real_gfx11<0x086>; +defm V_CMPX_O_F16 : VOPCX_Real_gfx11<0x087>; +defm V_CMPX_U_F16 : VOPCX_Real_gfx11<0x088>; +defm V_CMPX_NGE_F16 : VOPCX_Real_gfx11<0x089>; +defm V_CMPX_NLG_F16 : VOPCX_Real_gfx11<0x08a>; +defm V_CMPX_NGT_F16 : VOPCX_Real_gfx11<0x08b>; +defm V_CMPX_NLE_F16 : VOPCX_Real_gfx11<0x08c>; +defm V_CMPX_NEQ_F16 : VOPCX_Real_gfx11<0x08d>; +defm V_CMPX_NLT_F16 : VOPCX_Real_gfx11<0x08e>; +defm V_CMPX_T_F16 : VOPCX_Real_with_name_gfx11<0x08f, "V_CMPX_TRU_F16", "v_cmpx_t_f16">; +defm V_CMPX_F_F32 : VOPCX_Real_gfx11<0x090>; +defm V_CMPX_LT_F32 : VOPCX_Real_gfx11<0x091>; +defm V_CMPX_EQ_F32 : VOPCX_Real_gfx11<0x092>; +defm V_CMPX_LE_F32 : VOPCX_Real_gfx11<0x093>; +defm V_CMPX_GT_F32 : VOPCX_Real_gfx11<0x094>; +defm V_CMPX_LG_F32 : VOPCX_Real_gfx11<0x095>; +defm V_CMPX_GE_F32 : VOPCX_Real_gfx11<0x096>; +defm V_CMPX_O_F32 : VOPCX_Real_gfx11<0x097>; +defm V_CMPX_U_F32 : VOPCX_Real_gfx11<0x098>; +defm V_CMPX_NGE_F32 : VOPCX_Real_gfx11<0x099>; +defm V_CMPX_NLG_F32 : VOPCX_Real_gfx11<0x09a>; +defm V_CMPX_NGT_F32 : VOPCX_Real_gfx11<0x09b>; +defm V_CMPX_NLE_F32 : VOPCX_Real_gfx11<0x09c>; +defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx11<0x09d>; +defm V_CMPX_NLT_F32 : VOPCX_Real_gfx11<0x09e>; +defm V_CMPX_T_F32 : VOPCX_Real_with_name_gfx11<0x09f, "V_CMPX_TRU_F32", "v_cmpx_t_f32">; + +defm V_CMPX_F_F64 : VOPCX_Real_gfx11<0x0a0>; +defm V_CMPX_LT_F64 : VOPCX_Real_gfx11<0x0a1>; +defm V_CMPX_EQ_F64 : VOPCX_Real_gfx11<0x0a2>; +defm V_CMPX_LE_F64 : VOPCX_Real_gfx11<0x0a3>; +defm V_CMPX_GT_F64 : VOPCX_Real_gfx11<0x0a4>; +defm V_CMPX_LG_F64 : VOPCX_Real_gfx11<0x0a5>; +defm V_CMPX_GE_F64 : VOPCX_Real_gfx11<0x0a6>; +defm V_CMPX_O_F64 : VOPCX_Real_gfx11<0x0a7>; +defm V_CMPX_U_F64 : VOPCX_Real_gfx11<0x0a8>; +defm V_CMPX_NGE_F64 : VOPCX_Real_gfx11<0x0a9>; +defm V_CMPX_NLG_F64 : VOPCX_Real_gfx11<0x0aa>; +defm V_CMPX_NGT_F64 : VOPCX_Real_gfx11<0x0ab>; +defm 
V_CMPX_NLE_F64 : VOPCX_Real_gfx11<0x0ac>; +defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx11<0x0ad>; +defm V_CMPX_NLT_F64 : VOPCX_Real_gfx11<0x0ae>; +defm V_CMPX_T_F64 : VOPCX_Real_with_name_gfx11<0x0af, "V_CMPX_TRU_F64", "v_cmpx_t_f64">; + +defm V_CMPX_LT_I16 : VOPCX_Real_gfx11<0x0b1>; +defm V_CMPX_EQ_I16 : VOPCX_Real_gfx11<0x0b2>; +defm V_CMPX_LE_I16 : VOPCX_Real_gfx11<0x0b3>; +defm V_CMPX_GT_I16 : VOPCX_Real_gfx11<0x0b4>; +defm V_CMPX_NE_I16 : VOPCX_Real_gfx11<0x0b5>; +defm V_CMPX_GE_I16 : VOPCX_Real_gfx11<0x0b6>; +defm V_CMPX_LT_U16 : VOPCX_Real_gfx11<0x0b9>; +defm V_CMPX_EQ_U16 : VOPCX_Real_gfx11<0x0ba>; +defm V_CMPX_LE_U16 : VOPCX_Real_gfx11<0x0bb>; +defm V_CMPX_GT_U16 : VOPCX_Real_gfx11<0x0bc>; +defm V_CMPX_NE_U16 : VOPCX_Real_gfx11<0x0bd>; +defm V_CMPX_GE_U16 : VOPCX_Real_gfx11<0x0be>; +defm V_CMPX_F_I32 : VOPCX_Real_gfx11<0x0c0>; +defm V_CMPX_LT_I32 : VOPCX_Real_gfx11<0x0c1>; +defm V_CMPX_EQ_I32 : VOPCX_Real_gfx11<0x0c2>; +defm V_CMPX_LE_I32 : VOPCX_Real_gfx11<0x0c3>; +defm V_CMPX_GT_I32 : VOPCX_Real_gfx11<0x0c4>; +defm V_CMPX_NE_I32 : VOPCX_Real_gfx11<0x0c5>; +defm V_CMPX_GE_I32 : VOPCX_Real_gfx11<0x0c6>; +defm V_CMPX_T_I32 : VOPCX_Real_gfx11<0x0c7>; +defm V_CMPX_F_U32 : VOPCX_Real_gfx11<0x0c8>; +defm V_CMPX_LT_U32 : VOPCX_Real_gfx11<0x0c9>; +defm V_CMPX_EQ_U32 : VOPCX_Real_gfx11<0x0ca>; +defm V_CMPX_LE_U32 : VOPCX_Real_gfx11<0x0cb>; +defm V_CMPX_GT_U32 : VOPCX_Real_gfx11<0x0cc>; +defm V_CMPX_NE_U32 : VOPCX_Real_gfx11<0x0cd>; +defm V_CMPX_GE_U32 : VOPCX_Real_gfx11<0x0ce>; +defm V_CMPX_T_U32 : VOPCX_Real_gfx11<0x0cf>; + +defm V_CMPX_F_I64 : VOPCX_Real_gfx11<0x0d0>; +defm V_CMPX_LT_I64 : VOPCX_Real_gfx11<0x0d1>; +defm V_CMPX_EQ_I64 : VOPCX_Real_gfx11<0x0d2>; +defm V_CMPX_LE_I64 : VOPCX_Real_gfx11<0x0d3>; +defm V_CMPX_GT_I64 : VOPCX_Real_gfx11<0x0d4>; +defm V_CMPX_NE_I64 : VOPCX_Real_gfx11<0x0d5>; +defm V_CMPX_GE_I64 : VOPCX_Real_gfx11<0x0d6>; +defm V_CMPX_T_I64 : VOPCX_Real_gfx11<0x0d7>; +defm V_CMPX_F_U64 : VOPCX_Real_gfx11<0x0d8>; +defm V_CMPX_LT_U64 : VOPCX_Real_gfx11<0x0d9>; +defm V_CMPX_EQ_U64 : VOPCX_Real_gfx11<0x0da>; +defm V_CMPX_LE_U64 : VOPCX_Real_gfx11<0x0db>; +defm V_CMPX_GT_U64 : VOPCX_Real_gfx11<0x0dc>; +defm V_CMPX_NE_U64 : VOPCX_Real_gfx11<0x0dd>; +defm V_CMPX_GE_U64 : VOPCX_Real_gfx11<0x0de>; +defm V_CMPX_T_U64 : VOPCX_Real_gfx11<0x0df>; +defm V_CMPX_CLASS_F16 : VOPCX_Real_gfx11<0x0fd>; +defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx11<0x0fe>; +defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11<0x0ff>; + +//===----------------------------------------------------------------------===// // GFX10. 
//===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Plus in { +let AssemblerPredicate = isGFX10Only in { multiclass VOPC_Real_gfx10<bits<9> op> { let DecoderNamespace = "GFX10" in { def _e32_gfx10 : @@ -931,7 +1722,7 @@ let AssemblerPredicate = isGFX10Plus in { defm : VOPCXInstAliases<NAME, "gfx10">; } -} // End AssemblerPredicate = isGFX10Plus +} // End AssemblerPredicate = isGFX10Only defm V_CMP_LT_I16 : VOPC_Real_gfx10<0x089>; defm V_CMP_EQ_I16 : VOPC_Real_gfx10<0x08a>; @@ -1025,6 +1816,12 @@ multiclass VOPCX_Real_gfx6_gfx7<bits<9> op> : multiclass VOPCX_Real_gfx6_gfx7_gfx10 <bits<9> op> : VOPC_Real_gfx6_gfx7<op>, VOPCX_Real_gfx10<op>; +multiclass VOPC_Real_gfx6_gfx7_gfx10_gfx11<bits<9> op> : + VOPC_Real_gfx6_gfx7_gfx10<op>, VOPC_Real_gfx11<op>; + +multiclass VOPCX_Real_gfx6_gfx7_gfx10_gfx11<bits<9> op> : + VOPCX_Real_gfx6_gfx7_gfx10<op>, VOPCX_Real_gfx11<op>; + defm V_CMP_F_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x000>; defm V_CMP_LT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x001>; defm V_CMP_EQ_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x002>; @@ -1057,21 +1854,21 @@ defm V_CMPX_NLE_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01c>; defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01d>; defm V_CMPX_NLT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01e>; defm V_CMPX_TRU_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01f>; -defm V_CMP_F_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x020>; -defm V_CMP_LT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x021>; -defm V_CMP_EQ_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x022>; -defm V_CMP_LE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x023>; -defm V_CMP_GT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x024>; -defm V_CMP_LG_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x025>; -defm V_CMP_GE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x026>; -defm V_CMP_O_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x027>; -defm V_CMP_U_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x028>; -defm V_CMP_NGE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x029>; -defm V_CMP_NLG_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02a>; -defm V_CMP_NGT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02b>; -defm V_CMP_NLE_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02c>; -defm V_CMP_NEQ_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02d>; -defm V_CMP_NLT_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02e>; +defm V_CMP_F_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x020>; +defm V_CMP_LT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x021>; +defm V_CMP_EQ_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x022>; +defm V_CMP_LE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x023>; +defm V_CMP_GT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x024>; +defm V_CMP_LG_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x025>; +defm V_CMP_GE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x026>; +defm V_CMP_O_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x027>; +defm V_CMP_U_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x028>; +defm V_CMP_NGE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x029>; +defm V_CMP_NLG_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02a>; +defm V_CMP_NGT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02b>; +defm V_CMP_NLE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02c>; +defm V_CMP_NEQ_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02d>; +defm V_CMP_NLT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02e>; defm V_CMP_TRU_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02f>; defm V_CMPX_F_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x030>; defm V_CMPX_LT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x031>; diff --git a/llvm/lib/Target/AMDGPU/VOPDInstructions.td b/llvm/lib/Target/AMDGPU/VOPDInstructions.td new file mode 100644 index 000000000000..420f18436095 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/VOPDInstructions.td @@ -0,0 +1,159 @@ +//===-- VOPDInstructions.td - Vector Instruction 
Definitions --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Encodings +//===----------------------------------------------------------------------===// + +class VOPDe<bits<4> opX, bits<5> opY> : Enc64 { + bits<9> src0X; + bits<8> vsrc1X; + bits<8> vdstX; + bits<9> src0Y; + bits<8> vsrc1Y; + bits<8> vdstY; + + let Inst{8-0} = src0X; + let Inst{16-9} = vsrc1X; + let Inst{21-17} = opY; + let Inst{25-22} = opX; + let Inst{31-26} = 0x32; // encoding + let Inst{40-32} = src0Y; + let Inst{48-41} = vsrc1Y; + let Inst{55-49} = vdstY{7-1}; + let Inst{63-56} = vdstX; +} + +class VOPD_MADKe<bits<4> opX, bits<5> opY> : Enc96 { + bits<9> src0X; + bits<8> vsrc1X; + bits<8> vdstX; + bits<9> src0Y; + bits<8> vsrc1Y; + bits<8> vdstY; + bits<32> imm; + + let Inst{8-0} = src0X; + let Inst{16-9} = vsrc1X; + let Inst{21-17} = opY; + let Inst{25-22} = opX; + let Inst{31-26} = 0x32; // encoding + let Inst{40-32} = src0Y; + let Inst{48-41} = vsrc1Y; + let Inst{55-49} = vdstY{7-1}; + let Inst{63-56} = vdstX; + let Inst{95-64} = imm; +} + +//===----------------------------------------------------------------------===// +// VOPD classes +//===----------------------------------------------------------------------===// + +class VOPD_Base<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY, + VOPD_Component XasVC, VOPD_Component YasVC> + : VOPAnyCommon<outs, ins, asm, []>, + VOP<NAME>, + SIMCInstr<NAME, SIEncodingFamily.GFX11> { + // Fields for table indexing + Instruction Opcode = !cast<Instruction>(NAME); + bits<5> OpX = XasVC.VOPDOp; + bits<5> OpY = YasVC.VOPDOp; + + let VALU = 1; + + let DecoderNamespace = "GFX11"; + let AssemblerPredicate = isGFX11Plus; + let WaveSizePredicate = isWave32; + let isCodeGenOnly = 0; + let SubtargetPredicate = isGFX11Plus; + let AsmMatchConverter = "cvtVOPD"; + let Size = 8; + let ReadsModeReg = !or(VDX.ReadsModeReg, VDY.ReadsModeReg); + let mayRaiseFPException = ReadsModeReg; + + let Uses = RegListUnion<VDX.Uses, VDY.Uses>.ret; + let Defs = RegListUnion<VDX.Defs, VDY.Defs>.ret; + let SchedRW = !listconcat(VDX.SchedRW, VDY.SchedRW); +} + +class VOPD<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY, + VOPD_Component XasVC, VOPD_Component YasVC> + : VOPD_Base<outs, ins, asm, VDX, VDY, XasVC, YasVC>, + VOPDe<XasVC.VOPDOp{3-0}, YasVC.VOPDOp> { + let Inst{16-9} = !if (!eq(VDX.Mnemonic, "v_mov_b32"), 0x0, vsrc1X); + let Inst{48-41} = !if (!eq(VDY.Mnemonic, "v_mov_b32"), 0x0, vsrc1Y); +} + +class VOPD_MADK<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY, + VOPD_Component XasVC, VOPD_Component YasVC> + : VOPD_Base<outs, ins, asm, VDX, VDY, XasVC, YasVC>, + VOPD_MADKe<XasVC.VOPDOp{3-0}, YasVC.VOPDOp> { + let Inst{16-9} = !if (!eq(VDX.Mnemonic, "v_mov_b32"), 0x0, vsrc1X); + let Inst{48-41} = !if (!eq(VDY.Mnemonic, "v_mov_b32"), 0x0, vsrc1Y); + let Size = 12; +} + +// V_DUAL_DOT2ACC_F32_BF16 is a legal instruction, but V_DOT2ACC_F32_BF16 is +// not. Since we generate the DUAL form by converting from the normal form we +// will never generate it. 
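+
+// Illustration (a sketch, not taken from the ISA docs): with the asm strings
+// constructed below, a VOPD pair prints as its X and Y components joined by
+// "::", e.g. a plausible pairing is
+//
+//   v_dual_mul_f32 v0, v1, v2 :: v_dual_add_f32 v3, v4, v5
+//
+// (the operand choices here are illustrative only). Note that VOPDe and
+// VOPD_MADKe above store only vdstY{7-1}: the low bit of the Y destination is
+// not encoded, presumably implied by the encoding's destination rules, which
+// is why the dedicated VOPDDstYOperand defined below carries its own
+// DecoderMethod.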
+defvar VOPDYPseudos = [
+  "V_FMAC_F32_e32", "V_FMAAK_F32", "V_FMAMK_F32", "V_MUL_F32_e32",
+  "V_ADD_F32_e32", "V_SUB_F32_e32", "V_SUBREV_F32_e32", "V_MUL_LEGACY_F32_e32",
+  "V_MOV_B32_e32", "V_CNDMASK_B32_e32", "V_MAX_F32_e32", "V_MIN_F32_e32",
+  "V_DOT2C_F32_F16_e32", "V_ADD_U32_e32", "V_LSHLREV_B32_e32", "V_AND_B32_e32"
+];
+defvar VOPDXPseudos = VOPDYPseudos[0...VOPDX_Max_Index];
+
+def VOPDDstYOperand : RegisterOperand<VGPR_32, "printRegularOperand"> {
+  let DecoderMethod = "decodeOperandVOPDDstY";
+}
+
+foreach x = VOPDXPseudos in {
+  foreach y = VOPDYPseudos in {
+    defvar xInst = !cast<VOP_Pseudo>(x);
+    defvar yInst = !cast<VOP_Pseudo>(y);
+    defvar XasVC = !cast<VOPD_Component>(x);
+    defvar YasVC = !cast<VOPD_Component>(y);
+    defvar isMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32"),
+                        !eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32"));
+    // If X or Y is MADK (i.e. it has a mandatory literal), all src operands
+    // that may contain an optional literal must use the VSrc_*_Deferred
+    // operand type. Optional literal operands in MADK VOPD components always
+    // use this operand form. If both X and Y are MADK, the mandatory literal
+    // of X must additionally use an alternate operand format which defers to
+    // the 'real' Y literal.
+    defvar isOpXMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32"));
+    defvar isOpYMADK = !or(!eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32"));
+    defvar OpName = "V_DUAL_" # !substr(x,2) # "_X_" # !substr(y,2);
+    defvar outs = (outs VGPRSrc_32:$vdstX, VOPDDstYOperand:$vdstY);
+    if !or(isOpXMADK, isOpYMADK) then {
+      if !and(isOpXMADK, isOpYMADK) then {
+        defvar X_MADK_Pfl = !cast<VOP_MADK_Base>(xInst.Pfl);
+        defvar ins = !con(xInst.Pfl.InsVOPDXDeferred, yInst.Pfl.InsVOPDY);
+        defvar asm = XasVC.VOPDName #" "# X_MADK_Pfl.AsmVOPDXDeferred #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY;
+        def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC>;
+      } else {
+        defvar asm = XasVC.VOPDName #" "# xInst.Pfl.AsmVOPDX #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY;
+        if isOpXMADK then {
+          assert !not(isOpYMADK), "Expected only OpX as MADK";
+          defvar ins = !con(xInst.Pfl.InsVOPDX, yInst.Pfl.InsVOPDYDeferred);
+          def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC>;
+        } else {
+          assert !not(isOpXMADK), "Expected only OpY as MADK";
+          defvar ins = !con(xInst.Pfl.InsVOPDXDeferred, yInst.Pfl.InsVOPDY);
+          def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC>;
+        }
+      }
+    } else {
+      defvar ins = !con(xInst.Pfl.InsVOPDX, yInst.Pfl.InsVOPDY);
+      defvar asm = XasVC.VOPDName #" "# xInst.Pfl.AsmVOPDX #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY;
+      def OpName : VOPD<outs, ins, asm, xInst, yInst, XasVC, YasVC>;
+    }
+  }
+}
+
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index a8368892c565..8cd3d2fe2c47 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -30,6 +30,16 @@ class VOP <string opName> {
  string OpName = opName;
}

+// The first 13 instructions from VOPDY are also VOPDX.
DOT2ACC_F32_BF16 is omitted +defvar VOPDX_Max_Index = 12; + +class VOPD_Component<bits<5> OpIn, string vOPDName> { + Instruction BaseVOP = !cast<Instruction>(NAME); + string VOPDName = "v_dual_" # !substr(vOPDName, 2); + bits<5> VOPDOp = OpIn; + bit CanBeVOPDX = !le(VOPDOp, VOPDX_Max_Index); +} + class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : InstSI <outs, ins, asm, pattern> { @@ -92,6 +102,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [], let VOP3_OPSEL = isVop3OpSel; let IsPacked = P.IsPacked; let IsMAI = P.IsMAI; + let IsWMMA = P.IsWMMA; let AsmOperands = !if(isVop3OpSel, P.AsmVOP3OpSel, @@ -144,9 +155,9 @@ class VOP_Real<VOP_Pseudo ps> { bit IsSingle = ps.Pfl.IsSingle; } -class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> : +class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> : VOP_Real <ps>, - InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, + InstSI <ps.OutOperandList, ps.InOperandList, asm_name # ps.AsmOperands, []>, SIMCInstr <ps.PseudoInstr, EncodingFamily> { let VALU = 1; @@ -155,9 +166,6 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> : let isCodeGenOnly = 0; let UseNamedOperandTable = 1; - let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; - // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; let OtherPredicates = ps.OtherPredicates; @@ -179,8 +187,12 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> : // XXX - Is there any reason to distinguish this from regular VOP3 // here? -class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily> : - VOP3_Real<ps, EncodingFamily>; +class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> : + VOP3_Real<ps, EncodingFamily, asm_name> { + + // The v_wmma pseudos have extra constraints that we do not want to impose on the real instruction. 
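+  // A closer look at the check below: !substr(ps.Mnemonic, 0, 6) yields the
+  // first six characters of the pseudo's mnemonic, so exactly the "v_wmma*"
+  // pseudos get an empty Constraints string while every other pseudo keeps
+  // ps.Constraints unchanged.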
+ let Constraints = !if(!eq(!substr(ps.Mnemonic,0,6), "v_wmma"), "", ps.Constraints); +} class VOP3a<VOPProfile P> : Enc64 { bits<4> src0_modifiers; @@ -217,6 +229,8 @@ class VOP3a_gfx10<bits<10> op, VOPProfile p> : VOP3a<p> { let Inst{31-26} = 0x35; } +class VOP3a_gfx11<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p>; + class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> { let Inst{25-16} = op; let Inst{15} = !if(P.HasClamp, clamp{0}, 0); @@ -232,6 +246,8 @@ class VOP3e_gfx10<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p> { let Inst{7-0} = !if(p.EmitDst, vdst{7-0}, 0); } +class VOP3e_gfx11<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p>; + class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> { bits<8> vdst; let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); @@ -251,6 +267,9 @@ class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0); } +class VOP3OpSel_gfx11<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>; + + // NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> { bits<2> attrchan; @@ -285,6 +304,8 @@ class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { let Inst{62} = !if(p.HasSrc0Mods, src0_modifiers{0}, 0); } +class VOP3Interp_gfx11<bits<10> op, VOPProfile p> : VOP3Interp_gfx10<op, p>; + class VOP3be <VOPProfile P> : Enc64 { bits<8> vdst; bits<2> src0_modifiers; @@ -310,7 +331,6 @@ class VOP3be <VOPProfile P> : Enc64 { class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 { bits<8> vdst; - // neg, neg_hi, op_sel put in srcN_modifiers bits<4> src0_modifiers; bits<9> src0; bits<4> src1_modifiers; @@ -372,11 +392,42 @@ class VOP3Pe_MAI <bits<7> op, VOPProfile P, bit acc_cd = 0> : Enc64 { let Inst{63-61} = !if(P.HasSrc1, blgp, 0); } +class VOP3Pe_SMFMAC <bits<7> op> : Enc64 { + bits<10> vdst; // VGPR or AGPR, but not SGPR. vdst{8} is not encoded in the instruction. 
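+  // To spell out the field layout used below: vdst{7-0} is the register
+  // index (Inst{7-0}) and vdst{9} is the "acc" bit (Inst{15}) selecting an
+  // AGPR rather than a VGPR; src0{9} and src1{9} are likewise encoded as acc
+  // bits in Inst{59} and Inst{60}.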
+ bits<10> src0; + bits<10> src1; + bits<9> idx; + bits<3> blgp; + bits<3> cbsz; + bits<4> abid; + + let blgp = 0; + + let Inst{7-0} = vdst{7-0}; + + let Inst{10-8} = cbsz; + let Inst{14-11} = abid; + + let Inst{15} = vdst{9}; // acc(vdst) + + let Inst{22-16} = op; + let Inst{31-23} = 0x1a7; // encoding + let Inst{40-32} = src0{8-0}; + let Inst{49-41} = src1{8-0}; + let Inst{58-50} = idx; + + let Inst{59} = src0{9}; // acc(0) + let Inst{60} = src1{9}; // acc(1) + + let Inst{63-61} = blgp; +} class VOP3Pe_gfx10 <bits<7> op, VOPProfile P> : VOP3Pe<op, P> { let Inst{31-23} = 0x198; //encoding } +class VOP3Pe_gfx11<bits<7> op, VOPProfile P> : VOP3Pe_gfx10<op, P>; + class VOP3be_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3be<p> { let Inst{25-17} = op; } @@ -388,6 +439,8 @@ class VOP3be_gfx10<bits<10> op, VOPProfile p> : VOP3be<p> { let Inst{31-26} = 0x35; } +class VOP3be_gfx11<bits<10> op, VOPProfile p> : VOP3be_gfx10<op, p>; + class VOP3be_vi <bits<10> op, VOPProfile P> : VOP3be<P> { bits<1> clamp; let Inst{25-16} = op; @@ -621,8 +674,89 @@ class VOP_DPPe<VOPProfile P, bit IsDPP16=0> : Enc64 { let Inst{63-60} = row_mask; } -class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : - InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, pattern>, +class VOP3_DPPe_Fields_Base { + bits<9> dpp_ctrl; + bits<1> bound_ctrl; + bits<4> bank_mask; + bits<4> row_mask; + bit fi; +} +class VOP3_DPPe_Fields : VOP3_DPPe_Fields_Base { + bits<8> src0; +} + +// Common refers to common between DPP and DPP8 +class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 { + bits<4> src0_modifiers; + bits<3> src1_modifiers; + bits<3> src2_modifiers; + bits<1> clamp; + bits<2> omod; + + let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); + let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); + let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); + // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs. 
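+  // Concretely, op_sel travels in the modifier operands: Inst{11-13} take
+  // bit 2 of src0/src1/src2_modifiers (the per-source high-half selects) and
+  // Inst{14} takes bit 3 of src0_modifiers, which holds the destination's
+  // high-half select (mirroring VOP3OpSel_gfx10 above).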
+ let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?); + let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, 0),?); + let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?); + let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?); + let Inst{15} = !if(P.HasClamp, clamp, 0); + let Inst{25-16} = op; + let Inst{31-26} = 0x35; + + let Inst{60-59} = !if(P.HasOMod, omod, 0); + let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); + let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); + let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); +} + +class VOP3_DPPe_Common<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op, P> { + bits<8> vdst; + bits<9> src1; + bits<9> src2; + + let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{49-41} = !if(P.HasSrc1, src1, 0); + let Inst{58-50} = !if(P.HasSrc2, src2, 0); +} + +class VOP3P_DPPe_Common_Base<bits<7> op, VOPProfile P> : Enc96 { + bits<4> src0_modifiers; + bits<4> src1_modifiers; + bits<4> src2_modifiers; + bits<1> clamp; + + let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 + let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 + let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2 + let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) + let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) + let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) + let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2) + let Inst{15} = !if(P.HasClamp, clamp{0}, 0); + let Inst{22-16} = op; + let Inst{31-23} = 0x198; // encoding + let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0) + let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1) + let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) + let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo) + let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) +} + +class VOP3P_DPPe_Common<bits<7> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P> { + bits<8> vdst; + bits<9> src1; + bits<9> src2; + + let Inst{7-0} = vdst; + let Inst{49-41} = !if(P.HasSrc1, src1, 0); + let Inst{58-50} = !if(P.HasSrc2, src2, 0); +} + +class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[], + dag Ins = P.InsDPP, string asmOps = P.AsmDPP> : + InstSI <P.OutsDPP, Ins, OpName#asmOps, pattern>, VOP <OpName>, SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE> { @@ -645,7 +779,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let isConvergent = 1; string Mnemonic = OpName; - string AsmOperands = P.AsmDPP; + string AsmOperands = asmOps; let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", ""); let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP); @@ -659,6 +793,17 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : VOPProfile Pfl = P; } +class VOP3_DPP_Pseudo <string OpName, VOPProfile P> : + VOP_DPP_Pseudo <OpName, P, [], P.InsVOP3DPP, P.AsmVOP3DPP> { + let PseudoInstr = OpName#"_e64"#"_dpp"; + let OutOperandList = P.OutsVOP3DPP; + let Size = 12; + let VOP3 = 1; + let AsmMatchConverter = "cvtVOP3DPP"; + let AsmVariantName = !if(P.HasExtVOP3DPP, AMDGPUAsmVariants.VOP3_DPP, + AMDGPUAsmVariants.Disable); +} + class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> : InstSI 
<ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, SIMCInstr <ps.PseudoInstr, EncodingFamily> { @@ -679,6 +824,7 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> : let isConvergent = ps.isConvergent; let SubtargetPredicate = ps.SubtargetPredicate; let AssemblerPredicate = ps.AssemblerPredicate; + let OtherPredicates = ps.OtherPredicates; let AsmMatchConverter = ps.AsmMatchConverter; let AsmVariantName = ps.AsmVariantName; let UseNamedOperandTable = ps.UseNamedOperandTable; @@ -692,11 +838,10 @@ class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> : let TRANS = ps.TRANS; } -class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16, - dag InsDPP = !if(IsDPP16, P.InsDPP16, P.InsDPP), - string AsmDPP = !if(IsDPP16, P.AsmDPP16, P.AsmDPP)> : - InstSI <P.OutsDPP, InsDPP, OpName#AsmDPP, []>, - VOP_DPPe<P, IsDPP16> { +class VOP_DPP_Base <string OpName, VOPProfile P, + dag InsDPP, + string AsmDPP > : + InstSI <P.OutsDPP, InsDPP, OpName#AsmDPP, []> { let mayLoad = 0; let mayStore = 0; @@ -717,6 +862,59 @@ class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16, let DecoderNamespace = "DPP"; } +class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16, + dag InsDPP = !if(IsDPP16, P.InsDPP16, P.InsDPP), + string AsmDPP = !if(IsDPP16, P.AsmDPP16, P.AsmDPP)> : + VOP_DPP_Base<OpName, P, InsDPP, AsmDPP>, VOP_DPPe<P, IsDPP16>; + +class VOP3_DPP_Base <string OpName, VOPProfile P, bit IsDPP16, + dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP), + string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> : + VOP_DPP_Base<OpName, P, InsDPP, AsmDPP> { + let OutOperandList = P.OutsVOP3DPP; + let AsmMatchConverter = "cvtVOP3DPP"; + let VOP3 = 1; + let AsmVariantName = !if(P.HasExtVOP3DPP, AMDGPUAsmVariants.VOP3_DPP, + AMDGPUAsmVariants.Disable); + let Size = 12; +} + +class VOP3_DPP <bits<10> op, string OpName, VOPProfile P, bit IsDPP16, + dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP), + string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> : + VOP3_DPP_Base<OpName, P, IsDPP16, InsDPP, AsmDPP>, VOP3_DPPe_Common<op, P>, + VOP3_DPPe_Fields { + + let Inst{40-32} = 0xfa; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{80-72} = dpp_ctrl; + let Inst{82} = !if(IsDPP16, fi, ?); + let Inst{83} = bound_ctrl; + + // Inst{87-84} ignored by hw + let Inst{91-88} = bank_mask; + let Inst{95-92} = row_mask; +} + +class VOP3P_DPP <bits<7> op, string OpName, VOPProfile P, bit IsDPP16, + dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP), + string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> : + VOP3_DPP_Base<OpName, P, IsDPP16, InsDPP, AsmDPP>, VOP3P_DPPe_Common<op, P>, + VOP3_DPPe_Fields { + + let VOP3P = 1; + + let Inst{40-32} = 0xfa; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{80-72} = dpp_ctrl; + let Inst{82} = !if(IsDPP16, fi, ?); + let Inst{83} = bound_ctrl; + + // Inst{87-84} ignored by hw + let Inst{91-88} = bank_mask; + let Inst{95-92} = row_mask; +} + class VOP_DPP8e<VOPProfile P> : Enc64 { bits<8> src0; bits<24> dpp8; @@ -726,9 +924,14 @@ class VOP_DPP8e<VOPProfile P> : Enc64 { let Inst{63-40} = dpp8{23-0}; } -class VOP_DPP8<string OpName, VOPProfile P> : - InstSI<P.OutsDPP8, P.InsDPP8, OpName#P.AsmDPP8, []>, - VOP_DPP8e<P> { +class VOP3_DPP8e_Fields { + bits<8> src0; + bits<24> dpp8; + bits<9> fi; +} + +class VOP_DPP8_Base<string OpName, VOPProfile P, dag InsDPP8 = P.InsDPP8, string AsmDPP8 = P.AsmDPP8> : + InstSI<P.OutsDPP8, InsDPP8, OpName#AsmDPP8, []> { let mayLoad = 0; let mayStore = 0; @@ -742,12 +945,44 @@ class 
VOP_DPP8<string OpName, VOPProfile P> : let AsmMatchConverter = "cvtDPP8"; let SubtargetPredicate = HasDPP8; let AssemblerPredicate = HasDPP8; - let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP, - AMDGPUAsmVariants.Disable); + let AsmVariantName = AMDGPUAsmVariants.DPP; let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); } +class VOP_DPP8<string OpName, VOPProfile P> : + VOP_DPP8_Base<OpName, P>, VOP_DPP8e<P>; + +class VOP3_DPP8_Base<string OpName, VOPProfile P> : + VOP_DPP8_Base<OpName, P, P.InsVOP3DPP8, P.AsmVOP3DPP8> { + let OutOperandList = P.OutsVOP3DPP8; + let AsmMatchConverter = "cvtVOP3DPP8"; + let AsmVariantName = !if(P.HasExtVOP3DPP, AMDGPUAsmVariants.VOP3_DPP, + AMDGPUAsmVariants.Disable); + let VOP3 = 1; + let Size = 12; +} + + +class VOP3_DPP8<bits<10> op, string OpName, VOPProfile P> : + VOP3_DPP8_Base<OpName, P>, VOP3_DPPe_Common<op, P>, + VOP3_DPP8e_Fields { + + let Inst{40-32} = fi; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{95-72} = dpp8{23-0}; +} + +class VOP3P_DPP8<bits<7> op, string OpName, VOPProfile P> : + VOP3_DPP8_Base<OpName, P>, VOP3P_DPPe_Common<op, P>, + VOP3_DPP8e_Fields { + + let VOP3P = 1; + let Inst{40-32} = fi; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{95-72} = dpp8{23-0}; +} + def DPP8Mode { int FI_0 = 0xE9; int FI_1 = 0xEA; @@ -780,14 +1015,12 @@ class getDivergentFrag<SDPatternOperator Op> { } class VOPPatGen<SDPatternOperator Op, VOPProfile P> { - PatFrag Operator = getDivergentFrag < Op >.ret; dag Ins = !foreach(tmp, P.Ins32, !subst(ins, Operator, !subst(P.Src0RC32, P.Src0VT, !subst(P.Src1RC32, P.Src1VT, tmp)))); - dag Outs = !foreach(tmp, P.Outs32, !subst(outs, set, !subst(P.DstRC, P.DstVT, tmp))); @@ -827,12 +1060,379 @@ class VOPBinOpClampPat<SDPatternOperator node, Instruction inst, ValueType vt> : DSTCLAMP.ENABLE) >; +//===----------------------------------------------------------------------===// +// VOP3 Classes +//===----------------------------------------------------------------------===// + +class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { + dag src0 = !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + + list<dag> ret3 = [(set P.DstVT:$vdst, + (DivergentFragOrOp<node, P>.ret (P.Src0VT src0), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))]; + + list<dag> ret2 = [(set P.DstVT:$vdst, + (DivergentFragOrOp<node, P>.ret (P.Src0VT src0), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))]; + + list<dag> ret1 = [(set P.DstVT:$vdst, + (DivergentFragOrOp<node, P>.ret (P.Src0VT src0)))]; + + list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class getVOP3PModPat<VOPProfile P, SDPatternOperator node, bit HasExplicitClamp, + bit IsDOT = 0, + ComplexPattern SrcPat = !if(IsDOT, VOP3PModsDOT, VOP3PMods)> { + dag src0_dag = (P.Src0VT (SrcPat P.Src0VT:$src0, i32:$src0_modifiers)); + dag src1_dag = (P.Src1VT (SrcPat P.Src1VT:$src1, i32:$src1_modifiers)); + dag src2_dag = (P.Src2VT (SrcPat P.Src2VT:$src2, i32:$src2_modifiers)); + dag clamp_dag = (i1 timm:$clamp); + + list<dag> ret3 = [(set P.DstVT:$vdst, + !if(HasExplicitClamp, + (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag, clamp_dag), + (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag)))]; + + list<dag> ret2 = [(set 
P.DstVT:$vdst, + !if(HasExplicitClamp, + (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, clamp_dag), + (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag)))]; + + list<dag> ret1 = [(set P.DstVT:$vdst, + !if(HasExplicitClamp, + (DivergentFragOrOp<node, P>.ret src0_dag, clamp_dag), + (DivergentFragOrOp<node, P>.ret src0_dag)))]; + + list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> { + list<dag> ret3 = [(set P.DstVT:$vdst, + (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)), + (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))]; + + list<dag> ret2 = [(set P.DstVT:$vdst, + (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)), + (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))]; + + list<dag> ret1 = [(set P.DstVT:$vdst, + (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))))]; + + list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> { + list<dag> ret3 = [(set P.DstVT:$vdst, + (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers), + (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))), + (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))]; + + list<dag> ret2 = [(set P.DstVT:$vdst, + (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers)), + (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))), + (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))]; + + list<dag> ret1 = [(set P.DstVT:$vdst, + (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))))]; + + list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class getVOP3FromVOP2Pat<VOPProfile P, SDPatternOperator node> { + list<dag> ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]; +} +// In VOP1, we can have clamp and omod even if !HasModifiers +class getVOP3Pat<VOPProfile P, SDPatternOperator node> { + dag src0 = + !if(P.HasOMod, + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$omod)), // impossible? 
+ !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i1:$clamp), + (VOP3Mods0 P.Src0VT:$src0)) + ); + list<dag> ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret (P.Src0VT src0), P.Src1VT:$src1, P.Src2VT:$src2))]; + + list<dag> ret2 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret (P.Src0VT src0), P.Src1VT:$src1))]; + + list<dag> ret1 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret (P.Src0VT src0)))]; + list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class getVOP3ClampPat<VOPProfile P, SDPatternOperator node> { + list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i1:$clamp))]; + list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, i1:$clamp))]; + list<dag> ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, i1:$clamp))]; + list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, + !if(!eq(P.NumSrcArgs, 2), ret2, + ret1)); +} + +class getVOP3MAIPat<VOPProfile P, SDPatternOperator node> { + list<dag> ret = !if(!eq(P.Src0VT, P.Src1VT), + // mfma + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, + timm:$cbsz, timm:$abid, timm:$blgp))], + // smfmac + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx, + timm:$cbsz, timm:$abid))]); +} + +class VOP3Features<bit Clamp, bit OpSel, bit Packed, bit MAI> { + bit HasClamp = Clamp; + bit HasOpSel = OpSel; + bit IsPacked = Packed; + bit IsMAI = MAI; +} + +def VOP3_REGULAR : VOP3Features<0, 0, 0, 0>; +def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>; +def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>; +def VOP3_PACKED : VOP3Features<1, 1, 1, 0>; +def VOP3_MAI : VOP3Features<0, 0, 0, 1>; + +class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> { + + let HasClamp = !if(Features.HasClamp, 1, P.HasClamp); + let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel); + let IsMAI = !if(Features.IsMAI, 1, P.IsMAI); + let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); + + let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers)); +} + +class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_Base<P, Features> { + let IsSingle = 1; + +} + +// consistently gives instructions a _e64 suffix +multiclass VOP3Inst_Pseudo_Wrapper<string opName, VOPProfile P, list<dag> pattern = [], bit VOP3Only = 0> { + def _e64 : VOP3_Pseudo<opName, P, pattern, VOP3Only>; +} + +class VOP3InstBase<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit IsVOP2 = 0> : + VOP3_Pseudo<OpName, P, + !if(P.HasOpSel, + !if(P.HasModifiers, + getVOP3OpSelModPat<P, node>.ret, + getVOP3OpSelPat<P, node>.ret), + !if(P.HasModifiers, + getVOP3ModPat<P, node>.ret, + !if(IsVOP2, + getVOP3FromVOP2Pat<P, node>.ret, + !if(P.HasIntClamp, + getVOP3ClampPat<P, node>.ret, + !if (P.IsMAI, + getVOP3MAIPat<P, node>.ret, + getVOP3Pat<P, node>.ret))))), + 0, P.HasOpSel> { + + let IntClamp = P.HasIntClamp; + let AsmMatchConverter = + !if(P.HasOpSel, + "cvtVOP3OpSel", + !if(!or(P.HasModifiers, P.HasOMod, P.HasIntClamp), + "cvtVOP3", + "")); +} + +multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> { + def _e64 : VOP3InstBase<OpName, P, node>; + let SubtargetPredicate = isGFX11Plus in { + foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in + def _e64_dpp : VOP3_DPP_Pseudo <OpName, P>; + } // end SubtargetPredicate = isGFX11Plus +} + +//===----------------------------------------------------------------------===// +// VOP3 DPP 
+//===----------------------------------------------------------------------===// + +class Base_VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName> + : VOP3_DPP<op, opName, ps.Pfl, 1> { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let AssemblerPredicate = HasDPP16; + let SubtargetPredicate = HasDPP16; + let OtherPredicates = ps.OtherPredicates; +} + +class VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, int subtarget, + string opName = ps.OpName> + : Base_VOP3_DPP16<op, ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>; + +class Base_VOP3_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> + : VOP3_DPP8<op, opName, ps.Pfl> { + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + + let OtherPredicates = ps.OtherPredicates; +} + +class Base_VOP3b_DPP16<bits<10> op, VOP_DPP_Pseudo ps, + string opName = ps.OpName> + : Base_VOP3_DPP16<op, ps, opName> { + bits<7> sdst; + let Inst{14 - 8} = sdst; +} + +class VOP3b_DPP8_Base<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> + : Base_VOP3_DPP8<op, ps, opName> { + bits<7> sdst; + let Inst{14 - 8} = sdst; +} + +//===----------------------------------------------------------------------===// +// VOP3 GFX11 +//===----------------------------------------------------------------------===// + +let AssemblerPredicate = isGFX11Only, + DecoderNamespace = "GFX11" in { + multiclass VOP3_Real_Base_gfx11<bits<10> op, string opName = NAME, + bit isSingle = 0> { + defvar ps = !cast<VOP_Pseudo>(opName#"_e64"); + let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { + foreach _ = BoolToList<ps.Pfl.HasOpSel>.ret in + def _e64_gfx11 : + VOP3_Real<ps, SIEncodingFamily.GFX11>, + VOP3OpSel_gfx11<op, ps.Pfl>; + foreach _ = BoolToList<!not(ps.Pfl.HasOpSel)>.ret in + def _e64_gfx11 : + VOP3_Real<ps, SIEncodingFamily.GFX11>, + VOP3e_gfx11<op, ps.Pfl>; + } + } + multiclass VOP3_Real_with_name_gfx11<bits<10> op, string opName, + string asmName, bit isSingle = 0> { + defvar ps = !cast<VOP_Pseudo>(opName#"_e64"); + let AsmString = asmName # ps.AsmOperands, + IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { + foreach _ = BoolToList<ps.Pfl.HasOpSel>.ret in + def _e64_gfx11 : + VOP3_Real<ps, SIEncodingFamily.GFX11>, + VOP3OpSel_gfx11<op, ps.Pfl>, + MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>; + foreach _ = BoolToList<!not(ps.Pfl.HasOpSel)>.ret in + def _e64_gfx11 : + VOP3_Real<ps, SIEncodingFamily.GFX11>, + VOP3e_gfx11<op, ps.Pfl>, + MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>; + } + } + // for READLANE/WRITELANE + multiclass VOP3_Real_No_Suffix_gfx11<bits<10> op, string opName = NAME> { + defvar ps = !cast<VOP_Pseudo>(opName); + def _e64_gfx11 : + VOP3_Real<ps, SIEncodingFamily.GFX11>, + VOP3e_gfx11<op, ps.Pfl>; + } + multiclass VOP3_Real_dpp_Base_gfx11<bits<10> op, string opName = NAME> { + def _e64_dpp_gfx11 : VOP3_DPP16<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), SIEncodingFamily.GFX11> { + let DecoderNamespace = "DPPGFX11"; + } + } + multiclass VOP3_Real_dpp_with_name_gfx11<bits<10> op, string opName, + string asmName> { + defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); + let AsmString = asmName # ps.Pfl.AsmVOP3DPP16, DecoderNamespace = "DPPGFX11" in { + defm NAME : VOP3_Real_dpp_Base_gfx11<op, opName>; + } + } + multiclass VOP3_Real_dpp8_Base_gfx11<bits<10> op, string opName = NAME> { + defvar ps = !cast<VOP3_Pseudo>(opName#"_e64"); + def _e64_dpp8_gfx11 : Base_VOP3_DPP8<op, ps> { + let 
DecoderNamespace = "DPP8GFX11";
+    }
+  }
+  multiclass VOP3_Real_dpp8_with_name_gfx11<bits<10> op, string opName,
+                                            string asmName> {
+    defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+    let AsmString = asmName # ps.Pfl.AsmVOP3DPP8, DecoderNamespace = "DPP8GFX11" in {
+      defm NAME : VOP3_Real_dpp8_Base_gfx11<op, opName>;
+    }
+  }
+  multiclass VOP3be_Real_gfx11<bits<10> op, string opName, string asmName,
+                               bit isSingle = 0> {
+    defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+    let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in
+    def _e64_gfx11 :
+      VOP3_Real<ps, SIEncodingFamily.GFX11, asmName>,
+      VOP3be_gfx11<op, ps.Pfl>;
+  }
+  multiclass VOP3be_Real_dpp_gfx11<bits<10> op, string opName, string asmName> {
+    defvar ps = !cast<VOP3_Pseudo>(opName #"_e64");
+    defvar dpp_ps = !cast<VOP_DPP_Pseudo>(opName #"_e64" #"_dpp");
+    def _e64_dpp_gfx11 : Base_VOP3b_DPP16<op, dpp_ps, asmName>,
+                         SIMCInstr<dpp_ps.PseudoInstr, SIEncodingFamily.GFX11> {
+      let DecoderNamespace = "DPPGFX11";
+    }
+  }
+  multiclass VOP3be_Real_dpp8_gfx11<bits<10> op, string opName, string asmName> {
+    defvar ps = !cast<VOP3_Pseudo>(opName #"_e64");
+    def _e64_dpp8_gfx11 : VOP3b_DPP8_Base<op, ps, asmName> {
+      let DecoderNamespace = "DPP8GFX11";
+    }
+  }
+} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11"
+
+// VOP1 and VOP2 depend on these triple defs
+multiclass VOP3_Realtriple_gfx11<bits<10> op,
+                                 bit isSingle = 0, string opName = NAME> :
+  VOP3_Real_Base_gfx11<op, opName, isSingle>,
+  VOP3_Real_dpp_Base_gfx11<op, opName>,
+  VOP3_Real_dpp8_Base_gfx11<op, opName>;
+
+multiclass VOP3Only_Realtriple_gfx11<bits<10> op> :
+  VOP3_Realtriple_gfx11<op, 1>;
+
+multiclass VOP3_Realtriple_with_name_gfx11<bits<10> op, string opName,
+                                           string asmName, bit isSingle = 0> :
+  VOP3_Real_with_name_gfx11<op, opName, asmName, isSingle>,
+  VOP3_Real_dpp_with_name_gfx11<op, opName, asmName>,
+  VOP3_Real_dpp8_with_name_gfx11<op, opName, asmName>;
+
+multiclass VOP3Only_Realtriple_with_name_gfx11<bits<10> op, string opName,
+                                               string asmName> :
+  VOP3_Realtriple_with_name_gfx11<op, opName, asmName, 1>;
+
+multiclass VOP3be_Realtriple_gfx11<
+    bits<10> op, bit isSingle = 0, string opName = NAME,
+    string asmName = !cast<VOP_Pseudo>(opName#"_e64").Mnemonic> :
+  VOP3be_Real_gfx11<op, opName, asmName, isSingle>,
+  VOP3be_Real_dpp_gfx11<op, opName, asmName>,
+  VOP3be_Real_dpp8_gfx11<op, opName, asmName>;
+
+multiclass VOP3beOnly_Realtriple_gfx11<bits<10> op> :
+  VOP3be_Realtriple_gfx11<op, 1>;
include "VOPCInstructions.td"
include "VOP1Instructions.td"
include "VOP2Instructions.td"
include "VOP3Instructions.td"
include "VOP3PInstructions.td"
+include "VOPDInstructions.td"
class VOPInfoTable <string Format> : GenericTable {
@@ -847,3 +1447,15 @@
def VOP1InfoTable : VOPInfoTable<"VOP1">;
def VOP2InfoTable : VOPInfoTable<"VOP2">;
def VOP3InfoTable : VOPInfoTable<"VOP3">;
+
+class VOPC64Table <string Format> : GenericTable {
+  let FilterClass = "VOPC64_" # Format # "_Base";
+  let CppTypeName = "VOPC64DPPInfo";
+  let Fields = ["Opcode"];
+
+  let PrimaryKey = ["Opcode"];
+  let PrimaryKeyName = "isVOPC64" # Format # "OpcodeHelper";
+}
+
+def VOPC64DPPTable : VOPC64Table<"DPP">;
+def VOPC64DPP8Table : VOPC64Table<"DPP8">;
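+
+// The SearchableTables backend emits one C++ lookup helper per table, named
+// by PrimaryKeyName: isVOPC64DPPOpcodeHelper and isVOPC64DPP8OpcodeHelper,
+// each returning a pointer to the matching VOPC64DPPInfo row (null if the
+// opcode is absent). A rough usage sketch (the AMDGPU namespace placement is
+// an assumption, not taken from this patch):
+//
+//   if (AMDGPU::isVOPC64DPPOpcodeHelper(Opc)) {
+//     // Opc is a 64-bit VOPC encoding with DPP.
+//   }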
