author    | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000
commit    | cfca06d7963fa0909f90483b42a6d7d194d01e08 (patch)
tree      | 209fb2a2d68f8f277793fc8df46c753d31bc853b /llvm/lib/Target/AMDGPU
parent    | 706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
172 files changed, 19350 insertions, 8444 deletions
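One interface change is easy to miss in the flattened hunks below: AMDGPUFunctionArgInfo::getPreloadedValue now returns a std::tuple that also carries the LLT of the preloaded value, not just the descriptor/register-class pair. A minimal caller sketch, assuming only the tuple-returning signature shown in the AMDGPUArgumentUsageInfo hunks; the helper name and everything around the call are illustrative, not part of this commit:

// Sketch only: consuming the widened getPreloadedValue() return value, which
// now also reports the LLT of the preloaded input. Assumes just the signature
// from the AMDGPUArgumentUsageInfo hunks below; the helper is illustrative.
#include <tuple>
#include "AMDGPUArgumentUsageInfo.h"

using namespace llvm;

static bool hasKernargSegmentPtr(const AMDGPUFunctionArgInfo &ArgInfo) {
  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  LLT Ty;
  std::tie(Arg, RC, Ty) =
      ArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  // A null descriptor means the input was not reserved; otherwise RC and Ty
  // describe where and how the value is preloaded.
  return Arg != nullptr;
}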
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index fbed51de0ea49..88c79665be60d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -10,15 +10,16 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H -#include "llvm/Target/TargetMachine.h" #include "llvm/IR/IntrinsicsR600.h" // TODO: Sink this. #include "llvm/IR/IntrinsicsAMDGPU.h" // TODO: Sink this. +#include "llvm/Support/CodeGen.h" namespace llvm { class AMDGPUTargetMachine; class FunctionPass; class GCNTargetMachine; +class ImmutablePass; class ModulePass; class Pass; class Target; @@ -27,6 +28,14 @@ class TargetOptions; class PassRegistry; class Module; +// GlobalISel passes +void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &); +FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone); +void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &); +FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone); +FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone); +void initializeAMDGPURegBankCombinerPass(PassRegistry &); + // R600 Passes FunctionPass *createR600VectorRegMerger(); FunctionPass *createR600ExpandSpecialInstrsPass(); @@ -55,8 +64,9 @@ FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createSIPreAllocateWWMRegsPass(); FunctionPass *createSIFormMemoryClausesPass(); -FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &, - const TargetMachine *); + +FunctionPass *createSIPostRABundlerPass(); +FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *); FunctionPass *createAMDGPUUseNativeCallsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(); FunctionPass *createAMDGPUMachineCFGStructurizerPass(); @@ -159,6 +169,9 @@ extern char &SILowerControlFlowID; void initializeSIRemoveShortExecBranchesPass(PassRegistry &); extern char &SIRemoveShortExecBranchesID; +void initializeSIPreEmitPeepholePass(PassRegistry &); +extern char &SIPreEmitPeepholeID; + void initializeSIInsertSkipsPass(PassRegistry &); extern char &SIInsertSkipsPassID; @@ -185,6 +198,10 @@ FunctionPass *createAMDGPUPromoteAlloca(); void initializeAMDGPUPromoteAllocaPass(PassRegistry&); extern char &AMDGPUPromoteAllocaID; +FunctionPass *createAMDGPUPromoteAllocaToVector(); +void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry&); +extern char &AMDGPUPromoteAllocaToVectorID; + Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag( TargetMachine *TM = nullptr, @@ -219,12 +236,18 @@ extern char &SIMemoryLegalizerID; void initializeSIModeRegisterPass(PassRegistry&); extern char &SIModeRegisterID; +void initializeSIInsertHardClausesPass(PassRegistry &); +extern char &SIInsertHardClausesID; + void initializeSIInsertWaitcntsPass(PassRegistry&); extern char &SIInsertWaitcntsID; void initializeSIFormMemoryClausesPass(PassRegistry&); extern char &SIFormMemoryClausesID; +void initializeSIPostRABundlerPass(PassRegistry&); +extern char &SIPostRABundlerID; + void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&); extern char &AMDGPUUnifyDivergentExitNodesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 42b477e07b3b7..e32f0fcc47713 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -33,6 +33,12 @@ def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", "Assuming f32 fma is at least as fast as mul + add" >; +def FeatureFastDenormalF32 : SubtargetFeature<"fast-denormal-f32", 
+ "FastDenormalF32", + "true", + "Enabling denormals does not cause f32 instructions to run at f64 rates" +>; + def FeatureMIMG_R128 : SubtargetFeature<"mimg-r128", "MIMG_R128", "true", @@ -254,6 +260,12 @@ def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", "Additional instructions for GFX10+" >; +def FeatureGFX10_3Insts : SubtargetFeature<"gfx10-3-insts", + "GFX10_3Insts", + "true", + "Additional instructions for GFX10.3" +>; + def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts", "GFX7GFX8GFX9Insts", "true", @@ -360,7 +372,19 @@ def FeatureDPP8 : SubtargetFeature<"dpp8", def FeatureR128A16 : SubtargetFeature<"r128-a16", "HasR128A16", "true", - "Support 16 bit coordindates/gradients/lod/clamp/mip types on gfx9" + "Support gfx9-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands, where a16 is aliased with r128" +>; + +def FeatureGFX10A16 : SubtargetFeature<"a16", + "HasGFX10A16", + "true", + "Support gfx10-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands" +>; + +def FeatureG16 : SubtargetFeature<"g16", + "HasG16", + "true", + "Support G16 for 16-bit gradient image operands" >; def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding", @@ -369,6 +393,12 @@ def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding", "Support NSA encoding for image instructions" >; +def FeatureGFX10_BEncoding : SubtargetFeature<"gfx10_b-encoding", + "GFX10_BEncoding", + "true", + "Encoding format GFX10_B" +>; + def FeatureIntClamp : SubtargetFeature<"int-clamp-insts", "HasIntClamp", "true", @@ -439,7 +469,8 @@ def FeatureAtomicFaddInsts : SubtargetFeature<"atomic-fadd-insts", "HasAtomicFaddInsts", "true", "Has buffer_atomic_add_f32, buffer_atomic_pk_add_f16, global_atomic_add_f32, " - "global_atomic_pk_add_f16 instructions" + "global_atomic_pk_add_f16 instructions", + [FeatureFlatGlobalInsts] >; def FeatureDoesNotSupportSRAMECC : SubtargetFeature<"no-sram-ecc-support", @@ -466,6 +497,30 @@ def FeatureVscnt : SubtargetFeature<"vscnt", "Has separate store vscnt counter" >; +def FeatureGetWaveIdInst : SubtargetFeature<"get-wave-id-inst", + "HasGetWaveIdInst", + "true", + "Has s_get_waveid_in_workgroup instruction" +>; + +def FeatureSMemTimeInst : SubtargetFeature<"s-memtime-inst", + "HasSMemTimeInst", + "true", + "Has s_memtime instruction" +>; + +def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts", + "HasMadMacF32Insts", + "true", + "Has v_mad_f32/v_mac_f32/v_madak_f32/v_madmk_f32 instructions" +>; + +def FeatureDsSrc2Insts : SubtargetFeature<"ds-src2-insts", + "HasDsSrc2Insts", + "true", + "Has ds_*_src2 instructions" +>; + def FeatureRegisterBanking : SubtargetFeature<"register-banking", "HasRegisterBanking", "true", @@ -488,36 +543,6 @@ def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard", // Subtarget Features (options and debugging) //===------------------------------------------------------------===// -// Denormal handling for fp64 and fp16 is controlled by the same -// config register when fp16 supported. -// TODO: Do we need a separate f16 setting when not legal? 
-def FeatureFP64FP16Denormals : SubtargetFeature<"fp64-fp16-denormals", - "FP64FP16Denormals", - "true", - "Enable double and half precision denormal handling", - [FeatureFP64] ->; - -def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", - "FP64FP16Denormals", - "true", - "Enable double and half precision denormal handling", - [FeatureFP64, FeatureFP64FP16Denormals] ->; - -def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals", - "FP64FP16Denormals", - "true", - "Enable half precision denormal handling", - [FeatureFP64FP16Denormals] ->; - -def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", - "FPExceptions", - "true", - "Enable floating point exceptions" ->; - class FeatureMaxPrivateElementSize<int size> : SubtargetFeature< "max-private-element-size-"#size, "MaxPrivateElementSize", @@ -628,9 +653,10 @@ class GCNSubtargetFeatureGeneration <string Value, def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", "southern-islands", [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, - FeatureWavefrontSize64, - FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange, - FeatureDoesNotSupportSRAMECC, FeatureDoesNotSupportXNACK] + FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts, + FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, + FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC, + FeatureDoesNotSupportXNACK] >; def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", @@ -638,7 +664,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, - FeatureGFX7GFX8GFX9Insts, FeatureDoesNotSupportSRAMECC] + FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, + FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC] >; def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", @@ -649,8 +676,9 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP, - FeatureIntClamp, FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC, - FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts + FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts, + FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, + FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC, FeatureFastDenormalF32 ] >; @@ -665,7 +693,9 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, - FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16 + FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, + FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, + FeatureFastDenormalF32 ] >; @@ -682,7 +712,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts, FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, FeatureVOP3Literal, FeatureDPP8, - FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC + FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC, + FeatureGFX10A16, FeatureFastDenormalF32, 
FeatureG16 ] >; @@ -853,6 +884,10 @@ def FeatureISAVersion10_1_0 : FeatureSet< FeatureScalarStores, FeatureScalarAtomics, FeatureScalarFlatScratchInsts, + FeatureGetWaveIdInst, + FeatureSMemTimeInst, + FeatureMadMacF32Insts, + FeatureDsSrc2Insts, FeatureLdsMisalignedBug, FeatureDoesNotSupportXNACK, FeatureCodeObjectV3])>; @@ -871,6 +906,10 @@ def FeatureISAVersion10_1_1 : FeatureSet< FeatureScalarStores, FeatureScalarAtomics, FeatureScalarFlatScratchInsts, + FeatureGetWaveIdInst, + FeatureSMemTimeInst, + FeatureMadMacF32Insts, + FeatureDsSrc2Insts, FeatureDoesNotSupportXNACK, FeatureCodeObjectV3])>; @@ -888,10 +927,29 @@ def FeatureISAVersion10_1_2 : FeatureSet< FeatureScalarStores, FeatureScalarAtomics, FeatureScalarFlatScratchInsts, + FeatureGetWaveIdInst, + FeatureSMemTimeInst, + FeatureMadMacF32Insts, + FeatureDsSrc2Insts, FeatureLdsMisalignedBug, FeatureDoesNotSupportXNACK, FeatureCodeObjectV3])>; +def FeatureISAVersion10_3_0 : FeatureSet< + [FeatureGFX10, + FeatureGFX10_BEncoding, + FeatureGFX10_3Insts, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot5Insts, + FeatureDot6Insts, + FeatureNSAEncoding, + FeatureWavefrontSize32, + FeatureDoesNotSupportXNACK, + FeatureCodeObjectV3]>; + //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { @@ -973,190 +1031,222 @@ def NullALU : InstrItinClass; def isGFX6 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS">, - AssemblerPredicate<"FeatureSouthernIslands">; + AssemblerPredicate<(all_of FeatureSouthernIslands)>; def isGFX6GFX7 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">, - AssemblerPredicate<"!FeatureGCN3Encoding,!FeatureGFX10Insts">; + AssemblerPredicate<(all_of (not FeatureGCN3Encoding), (not FeatureGFX10Insts))>; def isGFX6GFX7GFX10 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, - AssemblerPredicate<"!FeatureGCN3Encoding">; + AssemblerPredicate<(all_of (not FeatureGCN3Encoding))>; def isGFX7Only : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">, - AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts,!FeatureGFX10Insts">; + AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts, (not FeatureGFX10Insts))>; def isGFX7GFX10 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, - AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts">; + AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts)>; def isGFX7GFX8GFX9 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, - AssemblerPredicate<"FeatureGFX7GFX8GFX9Insts">; + AssemblerPredicate<(all_of FeatureGFX7GFX8GFX9Insts)>; def isGFX6GFX7GFX8GFX9 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, - AssemblerPredicate<"!FeatureGFX10Insts">; + AssemblerPredicate<(all_of (not FeatureGFX10Insts))>; def isGFX7Plus : 
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, - AssemblerPredicate<"FeatureCIInsts">; + AssemblerPredicate<(all_of FeatureCIInsts)>; def isGFX8Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, - AssemblerPredicate<"FeatureGFX8Insts">; + AssemblerPredicate<(all_of FeatureGFX8Insts)>; def isGFX8Only : Predicate<"Subtarget->getGeneration() ==" "AMDGPUSubtarget::VOLCANIC_ISLANDS">, - AssemblerPredicate <"FeatureVolcanicIslands">; + AssemblerPredicate <(all_of FeatureVolcanicIslands)>; def isGFX9Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, - AssemblerPredicate<"FeatureGFX9Insts">; + AssemblerPredicate<(all_of FeatureGFX9Insts)>; def isGFX9Only : Predicate < "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, - AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts">; + AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureGFX9Insts)>; def isGFX8GFX9 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, - AssemblerPredicate<"FeatureGFX8Insts,FeatureGCN3Encoding">; + AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding)>; def isGFX10Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">, - AssemblerPredicate<"FeatureGFX10Insts">; + AssemblerPredicate<(all_of FeatureGFX10Insts)>; def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, - AssemblerPredicate<"FeatureFlatAddressSpace">; + AssemblerPredicate<(all_of FeatureFlatAddressSpace)>; def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, - AssemblerPredicate<"FeatureFlatGlobalInsts">; + AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>; def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">, - AssemblerPredicate<"FeatureFlatScratchInsts">; + AssemblerPredicate<(all_of FeatureFlatScratchInsts)>; def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts()">, - AssemblerPredicate<"FeatureScalarFlatScratchInsts">; + AssemblerPredicate<(all_of FeatureScalarFlatScratchInsts)>; def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, - AssemblerPredicate<"FeatureGFX9Insts">; + AssemblerPredicate<(all_of FeatureGFX9Insts)>; + +def HasGFX10_BEncoding : Predicate<"Subtarget->hasGFX10_BEncoding()">, + AssemblerPredicate<(all_of FeatureGFX10_BEncoding)>; def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">, - AssemblerPredicate<"FeatureUnpackedD16VMem">; + AssemblerPredicate<(all_of FeatureUnpackedD16VMem)>; def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">, - AssemblerPredicate<"!FeatureUnpackedD16VMem">; + AssemblerPredicate<(all_of (not FeatureUnpackedD16VMem))>; def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">, - AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">; + AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureSRAMECC))>; def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">; def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">; def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, - AssemblerPredicate<"FeatureGFX9Insts">; + AssemblerPredicate<(all_of FeatureGFX9Insts)>; + +def HasLDSFPAtomics : Predicate<"Subtarget->hasLDSFPAtomics()">, + AssemblerPredicate<(all_of FeatureGFX8Insts)>; def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">, - AssemblerPredicate<"FeatureAddNoCarryInsts">; + 
AssemblerPredicate<(all_of FeatureAddNoCarryInsts)>; def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">; def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, - AssemblerPredicate<"Feature16BitInsts">; + AssemblerPredicate<(all_of Feature16BitInsts)>; def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, - AssemblerPredicate<"FeatureVOP3P">; + AssemblerPredicate<(all_of FeatureVOP3P)>; + +def HasMinMaxDenormModes : Predicate<"Subtarget->supportsMinMaxDenormModes()">; +def NotHasMinMaxDenormModes : Predicate<"!Subtarget->supportsMinMaxDenormModes()">; def HasSDWA : Predicate<"Subtarget->hasSDWA()">, - AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">; + AssemblerPredicate<(all_of FeatureSDWA, FeatureVolcanicIslands)>; def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">, - AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts,FeatureSDWA">; + AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureGFX9Insts,FeatureSDWA)>; def HasSDWA10 : Predicate<"Subtarget->hasSDWA()">, - AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureSDWA">; + AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureSDWA)>; def HasDPP : Predicate<"Subtarget->hasDPP()">, - AssemblerPredicate<"FeatureGCN3Encoding,FeatureDPP">; + AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureDPP)>; def HasDPP8 : Predicate<"Subtarget->hasDPP8()">, - AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP8">; + AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP8)>; def HasR128A16 : Predicate<"Subtarget->hasR128A16()">, - AssemblerPredicate<"FeatureR128A16">; + AssemblerPredicate<(all_of FeatureR128A16)>; + +def HasGFX10A16 : Predicate<"Subtarget->hasGFX10A16()">, + AssemblerPredicate<(all_of FeatureGFX10A16)>; + +def HasG16 : Predicate<"Subtarget->hasG16()">, + AssemblerPredicate<(all_of FeatureG16)>; def HasDPP16 : Predicate<"Subtarget->hasDPP()">, - AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP">; + AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP)>; def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">, - AssemblerPredicate<"FeatureIntClamp">; + AssemblerPredicate<(all_of FeatureIntClamp)>; def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">, - AssemblerPredicate<"FeatureMadMixInsts">; + AssemblerPredicate<(all_of FeatureMadMixInsts)>; def HasScalarStores : Predicate<"Subtarget->hasScalarStores()">, - AssemblerPredicate<"FeatureScalarStores">; + AssemblerPredicate<(all_of FeatureScalarStores)>; def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">, - AssemblerPredicate<"FeatureScalarAtomics">; + AssemblerPredicate<(all_of FeatureScalarAtomics)>; def HasNoSdstCMPX : Predicate<"Subtarget->hasNoSdstCMPX()">, - AssemblerPredicate<"FeatureNoSdstCMPX">; + AssemblerPredicate<(all_of FeatureNoSdstCMPX)>; def HasSdstCMPX : Predicate<"!Subtarget->hasNoSdstCMPX()">, - AssemblerPredicate<"!FeatureNoSdstCMPX">; + AssemblerPredicate<(all_of (not FeatureNoSdstCMPX))>; def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, - AssemblerPredicate<"FeatureVGPRIndexMode">; + AssemblerPredicate<(all_of FeatureVGPRIndexMode)>; def HasMovrel : Predicate<"Subtarget->hasMovrel()">, - AssemblerPredicate<"FeatureMovrel">; + AssemblerPredicate<(all_of FeatureMovrel)>; def HasFmaMixInsts : 
Predicate<"Subtarget->hasFmaMixInsts()">, - AssemblerPredicate<"FeatureFmaMixInsts">; + AssemblerPredicate<(all_of FeatureFmaMixInsts)>; def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, - AssemblerPredicate<"FeatureDLInsts">; + AssemblerPredicate<(all_of FeatureDLInsts)>; def HasDot1Insts : Predicate<"Subtarget->hasDot1Insts()">, - AssemblerPredicate<"FeatureDot1Insts">; + AssemblerPredicate<(all_of FeatureDot1Insts)>; def HasDot2Insts : Predicate<"Subtarget->hasDot2Insts()">, - AssemblerPredicate<"FeatureDot2Insts">; + AssemblerPredicate<(all_of FeatureDot2Insts)>; def HasDot3Insts : Predicate<"Subtarget->hasDot3Insts()">, - AssemblerPredicate<"FeatureDot3Insts">; + AssemblerPredicate<(all_of FeatureDot3Insts)>; def HasDot4Insts : Predicate<"Subtarget->hasDot4Insts()">, - AssemblerPredicate<"FeatureDot4Insts">; + AssemblerPredicate<(all_of FeatureDot4Insts)>; def HasDot5Insts : Predicate<"Subtarget->hasDot5Insts()">, - AssemblerPredicate<"FeatureDot5Insts">; + AssemblerPredicate<(all_of FeatureDot5Insts)>; def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">, - AssemblerPredicate<"FeatureDot6Insts">; + AssemblerPredicate<(all_of FeatureDot6Insts)>; + +def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">, + AssemblerPredicate<(all_of FeatureGetWaveIdInst)>; def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">, - AssemblerPredicate<"FeatureMAIInsts">; + AssemblerPredicate<(all_of FeatureMAIInsts)>; + +def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">, + AssemblerPredicate<(all_of FeatureSMemTimeInst)>; + +def HasNoSMemTimeInst : Predicate<"!Subtarget->hasSMemTimeInst()">; def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">, - AssemblerPredicate<"FeaturePkFmacF16Inst">; + AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>; + +def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">, + AssemblerPredicate<(all_of FeatureMadMacF32Insts)>; def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">, - AssemblerPredicate<"FeatureAtomicFaddInsts">; + AssemblerPredicate<(all_of FeatureAtomicFaddInsts)>; + +def HasNoMadMacF32Insts : Predicate<"!Subtarget->hasMadMacF32Insts()">, + AssemblerPredicate<(all_of (not FeatureMadMacF32Insts))>; + +def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">, + AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; def HasOffset3fBug : Predicate<"!Subtarget->hasOffset3fBug()">, - AssemblerPredicate<"FeatureOffset3fBug">; + AssemblerPredicate<(all_of FeatureOffset3fBug)>; def EnableLateCFGStructurize : Predicate< "EnableLateStructurizeCFG">; @@ -1165,7 +1255,7 @@ def EnableLateCFGStructurize : Predicate< include "SISchedule.td" include "GCNProcessors.td" include "AMDGPUInstrInfo.td" -include "AMDGPURegisterInfo.td" +include "SIRegisterInfo.td" include "AMDGPURegisterBanks.td" include "AMDGPUInstructions.td" include "SIInstrInfo.td" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index bba132c3bc46f..bb2aba0449748 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -91,12 +91,16 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI, bool OrLocal) { + unsigned AS = Loc.Ptr->getType()->getPointerAddressSpace(); + if (AS == AMDGPUAS::CONSTANT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) + return true; + const Value *Base = GetUnderlyingObject(Loc.Ptr, DL); - 
unsigned AS = Base->getType()->getPointerAddressSpace(); + AS = Base->getType()->getPointerAddressSpace(); if (AS == AMDGPUAS::CONSTANT_ADDRESS || - AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) return true; - } if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) { if (GV->isConstant()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h index fb722920900f0..fd8889ea5c0dd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h @@ -48,10 +48,6 @@ public: AAQueryInfo &AAQI); bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI, bool OrLocal); - -private: - bool Aliases(const MDNode *A, const MDNode *B) const; - bool PathAliases(const MDNode *A, const MDNode *B) const; }; /// Analysis pass providing a never-invalidated alias analysis result. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index ff2bda6bed533..22947544ac07f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -71,6 +71,13 @@ void AMDGPUAlwaysInline::recursivelyVisitUsers( if (Instruction *I = dyn_cast<Instruction>(U)) { Function *F = I->getParent()->getParent(); if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + // FIXME: This is a horrible hack. We should always respect noinline, + // and just let us hit the error when we can't handle this. + // + // Unfortunately, clang adds noinline to all functions at -O0. We have + // to override this here. until that's fixed. + F->removeFnAttr(Attribute::NoInline); + FuncsToAlwaysInline.insert(F); Stack.push_back(F); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index e72b3f4fde633..625074569cfa4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -21,7 +21,6 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -71,7 +70,8 @@ public: static bool visitConstantExpr(const ConstantExpr *CE); static bool visitConstantExprsRecursively( const Constant *EntryC, - SmallPtrSet<const Constant *, 8> &ConstantExprVisited); + SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc, + bool HasApertureRegs); }; } // end anonymous namespace @@ -93,6 +93,14 @@ static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) { return castRequiresQueuePtr(ASC->getSrcAddressSpace()); } +static bool isDSAddress(const Constant *C) { + const GlobalValue *GV = dyn_cast<GlobalValue>(C); + if (!GV) + return false; + unsigned AS = GV->getAddressSpace(); + return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS; +} + bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) { if (CE->getOpcode() == Instruction::AddrSpaceCast) { unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace(); @@ -104,7 +112,8 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) { bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( const Constant *EntryC, - SmallPtrSet<const Constant *, 8> &ConstantExprVisited) { + SmallPtrSet<const Constant *, 8> &ConstantExprVisited, + bool 
IsFunc, bool HasApertureRegs) { if (!ConstantExprVisited.insert(EntryC).second) return false; @@ -115,9 +124,13 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( while (!Stack.empty()) { const Constant *C = Stack.pop_back_val(); + // We need to trap on DS globals in non-entry functions. + if (IsFunc && isDSAddress(C)) + return true; + // Check this constant expression. if (const auto *CE = dyn_cast<ConstantExpr>(C)) { - if (visitConstantExpr(CE)) + if (!HasApertureRegs && visitConstantExpr(CE)) return true; } @@ -202,7 +215,7 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee, "amdgpu-work-item-id-z", "amdgpu-work-group-id-x", "amdgpu-work-group-id-y", "amdgpu-work-group-id-z", "amdgpu-dispatch-ptr", "amdgpu-dispatch-id", - "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"}; + "amdgpu-implicitarg-ptr"}; if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) NeedQueuePtr = true; @@ -263,10 +276,10 @@ bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute( bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); - bool HasFlat = ST.hasFlatAddressSpace(); bool HasApertureRegs = ST.hasApertureRegs(); SmallPtrSet<const Constant *, 8> ConstantExprVisited; + bool HaveStackObjects = false; bool Changed = false; bool NeedQueuePtr = false; bool HaveCall = false; @@ -274,13 +287,18 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { for (BasicBlock &BB : F) { for (Instruction &I : BB) { - CallSite CS(&I); - if (CS) { - Function *Callee = CS.getCalledFunction(); + if (isa<AllocaInst>(I)) { + HaveStackObjects = true; + continue; + } + + if (auto *CB = dyn_cast<CallBase>(&I)) { + const Function *Callee = + dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts()); // TODO: Do something with indirect calls. if (!Callee) { - if (!CS.isInlineAsm()) + if (!CB->isInlineAsm()) HaveCall = true; continue; } @@ -292,20 +310,25 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { Changed = true; } else { bool NonKernelOnly = false; - StringRef AttrName = intrinsicToAttrName(IID, - NonKernelOnly, NeedQueuePtr); - if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) { - F.addFnAttr(AttrName); - Changed = true; + + if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) { + F.addFnAttr("amdgpu-kernarg-segment-ptr"); + } else { + StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly, + NeedQueuePtr); + if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) { + F.addFnAttr(AttrName); + Changed = true; + } } } } - if (NeedQueuePtr || HasApertureRegs) + if (NeedQueuePtr || (!IsFunc && HasApertureRegs)) continue; if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) { - if (castRequiresQueuePtr(ASC)) { + if (!HasApertureRegs && castRequiresQueuePtr(ASC)) { NeedQueuePtr = true; continue; } @@ -316,7 +339,8 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { if (!OpC) continue; - if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) { + if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc, + HasApertureRegs)) { NeedQueuePtr = true; break; } @@ -332,8 +356,13 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { // TODO: We could refine this to captured pointers that could possibly be // accessed by flat instructions. For now this is mostly a poor way of // estimating whether there are calls before argument lowering. 
- if (HasFlat && !IsFunc && HaveCall) { - F.addFnAttr("amdgpu-flat-scratch"); + if (!IsFunc && HaveCall) { + F.addFnAttr("amdgpu-calls"); + Changed = true; + } + + if (HaveStackObjects) { + F.addFnAttr("amdgpu-stack-objects"); Changed = true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index 6fb507083cef1..b09e92c07f9ba 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" @@ -35,7 +36,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass, MemoryDependenceResults *MDR; LoopInfo *LI; DenseMap<Value*, GetElementPtrInst*> noClobberClones; - bool isKernelFunc; + bool isEntryFunc; public: static char ID; @@ -127,11 +128,10 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { auto isGlobalLoad = [&](LoadInst &Load)->bool { return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }; - // We're tracking up to the Function boundaries - // We cannot go beyond because of FunctionPass restrictions - // Thus we can ensure that memory not clobbered for memory - // operations that live in kernel only. - bool NotClobbered = isKernelFunc && !isClobberedInFunction(&I); + // We're tracking up to the Function boundaries, and cannot go beyond because + // of FunctionPass restrictions. We can ensure that is memory not clobbered + // for memory operations that are live in to entry points only. + bool NotClobbered = isEntryFunc && !isClobberedInFunction(&I); Instruction *PtrI = dyn_cast<Instruction>(Ptr); if (!PtrI && NotClobbered && isGlobalLoad(I)) { if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) { @@ -170,7 +170,7 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) { DA = &getAnalysis<LegacyDivergenceAnalysis>(); MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL; + isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv()); visit(F); noClobberClones.clear(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index 99a01ca3a2fda..d078fc147a36a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -8,6 +8,8 @@ #include "AMDGPU.h" #include "AMDGPUArgumentUsageInfo.h" +#include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIRegisterInfo.h" #include "llvm/Support/NativeFormatting.h" #include "llvm/Support/raw_ostream.h" @@ -43,6 +45,10 @@ char AMDGPUArgumentUsageInfo::ID = 0; const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{}; +// Hardcoded registers from fixed function ABI +const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::FixedABIFunctionInfo + = AMDGPUFunctionArgInfo::fixedABILayout(); + bool AMDGPUArgumentUsageInfo::doInitialization(Module &M) { return false; } @@ -77,59 +83,102 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { } } -std::pair<const ArgDescriptor *, const TargetRegisterClass *> +std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT> 
AMDGPUFunctionArgInfo::getPreloadedValue( - AMDGPUFunctionArgInfo::PreloadedValue Value) const { + AMDGPUFunctionArgInfo::PreloadedValue Value) const { switch (Value) { case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER: { - return std::make_pair( - PrivateSegmentBuffer ? &PrivateSegmentBuffer : nullptr, - &AMDGPU::SGPR_128RegClass); + return std::make_tuple(PrivateSegmentBuffer ? &PrivateSegmentBuffer + : nullptr, + &AMDGPU::SGPR_128RegClass, LLT::vector(4, 32)); } case AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR: - return std::make_pair(ImplicitBufferPtr ? &ImplicitBufferPtr : nullptr, - &AMDGPU::SGPR_64RegClass); + return std::make_tuple(ImplicitBufferPtr ? &ImplicitBufferPtr : nullptr, + &AMDGPU::SGPR_64RegClass, + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: - return std::make_pair(WorkGroupIDX ? &WorkGroupIDX : nullptr, - &AMDGPU::SGPR_32RegClass); - + return std::make_tuple(WorkGroupIDX ? &WorkGroupIDX : nullptr, + &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: - return std::make_pair(WorkGroupIDY ? &WorkGroupIDY : nullptr, - &AMDGPU::SGPR_32RegClass); + return std::make_tuple(WorkGroupIDY ? &WorkGroupIDY : nullptr, + &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: - return std::make_pair(WorkGroupIDZ ? &WorkGroupIDZ : nullptr, - &AMDGPU::SGPR_32RegClass); + return std::make_tuple(WorkGroupIDZ ? &WorkGroupIDZ : nullptr, + &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: - return std::make_pair( - PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr, - &AMDGPU::SGPR_32RegClass); + return std::make_tuple( + PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr, + &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); case AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR: - return std::make_pair(KernargSegmentPtr ? &KernargSegmentPtr : nullptr, - &AMDGPU::SGPR_64RegClass); + return std::make_tuple(KernargSegmentPtr ? &KernargSegmentPtr : nullptr, + &AMDGPU::SGPR_64RegClass, + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); case AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR: - return std::make_pair(ImplicitArgPtr ? &ImplicitArgPtr : nullptr, - &AMDGPU::SGPR_64RegClass); + return std::make_tuple(ImplicitArgPtr ? &ImplicitArgPtr : nullptr, + &AMDGPU::SGPR_64RegClass, + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); case AMDGPUFunctionArgInfo::DISPATCH_ID: - return std::make_pair(DispatchID ? &DispatchID : nullptr, - &AMDGPU::SGPR_64RegClass); + return std::make_tuple(DispatchID ? &DispatchID : nullptr, + &AMDGPU::SGPR_64RegClass, LLT::scalar(64)); case AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT: - return std::make_pair(FlatScratchInit ? &FlatScratchInit : nullptr, - &AMDGPU::SGPR_64RegClass); + return std::make_tuple(FlatScratchInit ? &FlatScratchInit : nullptr, + &AMDGPU::SGPR_64RegClass, LLT::scalar(64)); case AMDGPUFunctionArgInfo::DISPATCH_PTR: - return std::make_pair(DispatchPtr ? &DispatchPtr : nullptr, - &AMDGPU::SGPR_64RegClass); + return std::make_tuple(DispatchPtr ? &DispatchPtr : nullptr, + &AMDGPU::SGPR_64RegClass, + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); case AMDGPUFunctionArgInfo::QUEUE_PTR: - return std::make_pair(QueuePtr ? &QueuePtr : nullptr, - &AMDGPU::SGPR_64RegClass); + return std::make_tuple(QueuePtr ? 
&QueuePtr : nullptr, + &AMDGPU::SGPR_64RegClass, + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); case AMDGPUFunctionArgInfo::WORKITEM_ID_X: - return std::make_pair(WorkItemIDX ? &WorkItemIDX : nullptr, - &AMDGPU::VGPR_32RegClass); + return std::make_tuple(WorkItemIDX ? &WorkItemIDX : nullptr, + &AMDGPU::VGPR_32RegClass, LLT::scalar(32)); case AMDGPUFunctionArgInfo::WORKITEM_ID_Y: - return std::make_pair(WorkItemIDY ? &WorkItemIDY : nullptr, - &AMDGPU::VGPR_32RegClass); + return std::make_tuple(WorkItemIDY ? &WorkItemIDY : nullptr, + &AMDGPU::VGPR_32RegClass, LLT::scalar(32)); case AMDGPUFunctionArgInfo::WORKITEM_ID_Z: - return std::make_pair(WorkItemIDZ ? &WorkItemIDZ : nullptr, - &AMDGPU::VGPR_32RegClass); + return std::make_tuple(WorkItemIDZ ? &WorkItemIDZ : nullptr, + &AMDGPU::VGPR_32RegClass, LLT::scalar(32)); } llvm_unreachable("unexpected preloaded value type"); } + +constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() { + AMDGPUFunctionArgInfo AI; + AI.PrivateSegmentBuffer + = ArgDescriptor::createRegister(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3); + AI.DispatchPtr = ArgDescriptor::createRegister(AMDGPU::SGPR4_SGPR5); + AI.QueuePtr = ArgDescriptor::createRegister(AMDGPU::SGPR6_SGPR7); + + // Do not pass kernarg segment pointer, only pass increment version in its + // place. + AI.ImplicitArgPtr = ArgDescriptor::createRegister(AMDGPU::SGPR8_SGPR9); + AI.DispatchID = ArgDescriptor::createRegister(AMDGPU::SGPR10_SGPR11); + + // Skip FlatScratchInit/PrivateSegmentSize + AI.WorkGroupIDX = ArgDescriptor::createRegister(AMDGPU::SGPR12); + AI.WorkGroupIDY = ArgDescriptor::createRegister(AMDGPU::SGPR13); + AI.WorkGroupIDZ = ArgDescriptor::createRegister(AMDGPU::SGPR14); + + const unsigned Mask = 0x3ff; + AI.WorkItemIDX = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask); + AI.WorkItemIDY = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask << 10); + AI.WorkItemIDZ = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask << 20); + return AI; +} + +const AMDGPUFunctionArgInfo & +AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const { + auto I = ArgInfoMap.find(&F); + if (I == ArgInfoMap.end()) { + if (AMDGPUTargetMachine::EnableFixedFunctionABI) + return FixedABIFunctionInfo; + + // Without the fixed ABI, we assume no function has special inputs. 
+ assert(F.isDeclaration()); + return ExternFunctionInfo; + } + + return I->second; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index f0e7ee910f957..576e6cfe929e2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -11,15 +11,13 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/Register.h" -#include "llvm/IR/Function.h" #include "llvm/Pass.h" +#include "llvm/Support/LowLevelTypeImpl.h" namespace llvm { class Function; class raw_ostream; -class GCNSubtarget; -class TargetMachine; class TargetRegisterClass; class TargetRegisterInfo; @@ -40,19 +38,22 @@ private: bool IsSet : 1; public: - ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, + constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, bool IsStack = false, bool IsSet = false) : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} - static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) { + static constexpr ArgDescriptor createRegister(Register Reg, + unsigned Mask = ~0u) { return ArgDescriptor(Reg, Mask, false, true); } - static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) { + static constexpr ArgDescriptor createStack(unsigned Offset, + unsigned Mask = ~0u) { return ArgDescriptor(Offset, Mask, true, true); } - static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { + static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg, + unsigned Mask) { return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); } @@ -141,25 +142,29 @@ struct AMDGPUFunctionArgInfo { ArgDescriptor ImplicitArgPtr; // Input registers for non-HSA ABI - ArgDescriptor ImplicitBufferPtr = 0; + ArgDescriptor ImplicitBufferPtr; // VGPRs inputs. These are always v0, v1 and v2 for entry functions. ArgDescriptor WorkItemIDX; ArgDescriptor WorkItemIDY; ArgDescriptor WorkItemIDZ; - std::pair<const ArgDescriptor *, const TargetRegisterClass *> + std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT> getPreloadedValue(PreloadedValue Value) const; + + static constexpr AMDGPUFunctionArgInfo fixedABILayout(); }; class AMDGPUArgumentUsageInfo : public ImmutablePass { private: - static const AMDGPUFunctionArgInfo ExternFunctionInfo; DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap; public: static char ID; + static const AMDGPUFunctionArgInfo ExternFunctionInfo; + static const AMDGPUFunctionArgInfo FixedABIFunctionInfo; + AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -175,15 +180,7 @@ public: ArgInfoMap[&F] = ArgInfo; } - const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const { - auto I = ArgInfoMap.find(&F); - if (I == ArgInfoMap.end()) { - assert(F.isDeclaration()); - return ExternFunctionInfo; - } - - return I->second; - } + const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 9e07b4d252b78..eef8fe2fc3b70 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -49,9 +49,25 @@ using namespace llvm; using namespace llvm::AMDGPU; using namespace llvm::AMDGPU::HSAMD; -// TODO: This should get the default rounding mode from the kernel. We just set -// the default here, but this could change if the OpenCL rounding mode pragmas -// are used. 
+// We need to tell the runtime some amount ahead of time if we don't know the +// true stack size. Assume a smaller number if this is only due to dynamic / +// non-entry block allocas. +static cl::opt<uint32_t> AssumedStackSizeForExternalCall( + "amdgpu-assume-external-call-stack-size", + cl::desc("Assumed stack use of any external call (in bytes)"), + cl::Hidden, + cl::init(16384)); + +static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects( + "amdgpu-assume-dynamic-stack-object-size", + cl::desc("Assumed extra stack use if there are any " + "variable sized objects (in bytes)"), + cl::Hidden, + cl::init(4096)); + +// This should get the default rounding mode from the kernel. We just set the +// default here, but this could change if the OpenCL rounding mode pragmas are +// used. // // The denormal mode here should match what is reported by the OpenCL runtime // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but @@ -70,18 +86,10 @@ using namespace llvm::AMDGPU::HSAMD; // instructions to run at the double precision rate for the device so it's // probably best to just report no single precision denormals. static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) { - - // TODO: Is there any real use for the flush in only / flush out only modes? - uint32_t FP32Denormals = - Mode.FP32Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; - - uint32_t FP64Denormals = - Mode.FP64FP16Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; - return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | - FP_DENORM_MODE_SP(FP32Denormals) | - FP_DENORM_MODE_DP(FP64Denormals); + FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) | + FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue()); } static AsmPrinter * @@ -120,7 +128,7 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer()); } -void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { +void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) { std::string ExpectedTarget; raw_string_ostream ExpectedTargetOS(ExpectedTarget); @@ -152,7 +160,7 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU"); } -void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { +void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { // Following code requires TargetStreamer to be present. if (!getTargetStreamer()) return; @@ -188,7 +196,7 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64); } -void AMDGPUAsmPrinter::EmitFunctionBodyStart() { +void AMDGPUAsmPrinter::emitFunctionBodyStart() { const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); if (!MFI.isEntryFunction()) return; @@ -207,7 +215,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo); } -void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { +void AMDGPUAsmPrinter::emitFunctionBodyEnd() { const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); if (!MFI.isEntryFunction()) return; @@ -226,7 +234,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { // CP microcode requires the kernel descriptor to be allocated on 64 byte // alignment. 
- Streamer.EmitValueToAlignment(64, 0, 1, 0); + Streamer.emitValueToAlignment(64, 0, 1, 0); if (ReadOnlySection.getAlignment() < 64) ReadOnlySection.setAlignment(Align(64)); @@ -247,10 +255,10 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { Streamer.PopSection(); } -void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { +void AMDGPUAsmPrinter::emitFunctionEntryLabel() { if (IsaInfo::hasCodeObjectV3(getGlobalSTI()) && TM.getTargetTriple().getOS() == Triple::AMDHSA) { - AsmPrinter::EmitFunctionEntryLabel(); + AsmPrinter::emitFunctionEntryLabel(); return; } @@ -269,10 +277,10 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { HexLines.push_back(""); } - AsmPrinter::EmitFunctionEntryLabel(); + AsmPrinter::emitFunctionEntryLabel(); } -void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) { +void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) { // Write a line for the basic block label if it is not only fallthrough. DisasmLines.push_back( @@ -281,10 +289,10 @@ void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) { DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); HexLines.push_back(""); } - AsmPrinter::EmitBasicBlockStart(MBB); + AsmPrinter::emitBasicBlockStart(MBB); } -void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { +void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) { OutContext.reportError({}, @@ -307,18 +315,16 @@ void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { const DataLayout &DL = GV->getParent()->getDataLayout(); uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); - unsigned Align = GV->getAlignment(); - if (!Align) - Align = 4; + Align Alignment = GV->getAlign().getValueOr(Align(4)); - EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); - EmitLinkage(GV, GVSym); + emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); + emitLinkage(GV, GVSym); if (auto TS = getTargetStreamer()) - TS->emitAMDGPULDS(GVSym, Size, Align); + TS->emitAMDGPULDS(GVSym, Size, Alignment); return; } - AsmPrinter::EmitGlobalVariable(GV); + AsmPrinter::emitGlobalVariable(GV); } bool AMDGPUAsmPrinter::doFinalization(Module &M) { @@ -468,7 +474,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { HexLines.clear(); DisasmLineMaxLen = 0; - EmitFunctionBody(); + emitFunctionBody(); if (isVerbose()) { MCSectionELF *CommentSection = @@ -549,7 +555,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (DumpCodeInstEmitter) { OutStreamer->SwitchSection( - Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); + Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0)); for (size_t i = 0; i < DisasmLines.size(); ++i) { std::string Comment = "\n"; @@ -558,8 +564,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Comment += " ; " + HexLines[i] + "\n"; } - OutStreamer->EmitBytes(StringRef(DisasmLines[i])); - OutStreamer->EmitBytes(StringRef(Comment)); + OutStreamer->emitBytes(StringRef(DisasmLines[i])); + OutStreamer->emitBytes(StringRef(Comment)); } } @@ -609,6 +615,15 @@ int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs( return std::max(NumVGPR, NumAGPR); } +static const Function *getCalleeFunction(const MachineOperand &Op) { + if (Op.isImm()) { + assert(Op.getImm() == 0); + 
return nullptr; + } + + return cast<Function>(Op.getGlobal()); +} + AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( const MachineFunction &MF) const { SIFunctionResourceInfo Info; @@ -636,11 +651,15 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Info.UsesFlatScratch = false; } - Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); Info.PrivateSegmentSize = FrameInfo.getStackSize(); - if (MFI->isStackRealigned()) - Info.PrivateSegmentSize += FrameInfo.getMaxAlignment(); + // Assume a big number if there are any unknown sized objects. + Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); + if (Info.HasDynamicallySizedStack) + Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects; + + if (MFI->isStackRealigned()) + Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value(); Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI); @@ -715,6 +734,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::SRC_PRIVATE_BASE: case AMDGPU::SRC_PRIVATE_LIMIT: case AMDGPU::SGPR_NULL: + case AMDGPU::MODE: continue; case AMDGPU::SRC_POPS_EXITING_WAVE_ID: @@ -727,6 +747,10 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::VCC: case AMDGPU::VCC_LO: case AMDGPU::VCC_HI: + case AMDGPU::VCC_LO_LO16: + case AMDGPU::VCC_LO_HI16: + case AMDGPU::VCC_HI_LO16: + case AMDGPU::VCC_HI_HI16: Info.UsesVCC = true; continue; @@ -764,15 +788,20 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( break; } - if (AMDGPU::SReg_32RegClass.contains(Reg)) { + if (AMDGPU::SReg_32RegClass.contains(Reg) || + AMDGPU::SReg_LO16RegClass.contains(Reg) || + AMDGPU::SGPR_HI16RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_32RegClass.contains(Reg) && "trap handler registers should not be used"); IsSGPR = true; Width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) { + } else if (AMDGPU::VGPR_32RegClass.contains(Reg) || + AMDGPU::VGPR_LO16RegClass.contains(Reg) || + AMDGPU::VGPR_HI16RegClass.contains(Reg)) { IsSGPR = false; Width = 1; - } else if (AMDGPU::AGPR_32RegClass.contains(Reg)) { + } else if (AMDGPU::AGPR_32RegClass.contains(Reg) || + AMDGPU::AGPR_LO16RegClass.contains(Reg)) { IsSGPR = false; IsAGPR = true; Width = 1; @@ -794,6 +823,10 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { IsSGPR = true; Width = 3; + } else if (AMDGPU::AReg_96RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 3; } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_128RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -812,6 +845,20 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::SReg_160RegClass.contains(Reg)) { IsSGPR = true; Width = 5; + } else if (AMDGPU::AReg_160RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 5; + } else if (AMDGPU::VReg_192RegClass.contains(Reg)) { + IsSGPR = false; + Width = 6; + } else if (AMDGPU::SReg_192RegClass.contains(Reg)) { + IsSGPR = true; + Width = 6; + } else if (AMDGPU::AReg_192RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 6; } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -820,6 +867,10 @@ 
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { IsSGPR = false; Width = 8; + } else if (AMDGPU::AReg_256RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 8; } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_512RegClass.contains(Reg) && "trap handler registers should not be used"); @@ -862,8 +913,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( const MachineOperand *CalleeOp = TII->getNamedOperand(MI, AMDGPU::OpName::callee); - const Function *Callee = cast<Function>(CalleeOp->getGlobal()); - if (Callee->isDeclaration()) { + + const Function *Callee = getCalleeFunction(*CalleeOp); + if (!Callee || Callee->isDeclaration()) { // If this is a call to an external function, we can't do much. Make // conservative guesses. @@ -874,7 +926,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( MaxVGPR = std::max(MaxVGPR, 23); MaxAGPR = std::max(MaxAGPR, 23); - CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384)); + CalleeFrameSize = std::max(CalleeFrameSize, + static_cast<uint64_t>(AssumedStackSizeForExternalCall)); + Info.UsesVCC = true; Info.UsesFlatScratch = ST.hasFlatAddressSpace(); Info.HasDynamicallySizedStack = true; @@ -906,7 +960,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Info.HasRecursion |= I->second.HasRecursion; } - if (!Callee->doesNotRecurse()) + // FIXME: Call site could have norecurse on it + if (!Callee || !Callee->doesNotRecurse()) Info.HasRecursion = true; } } @@ -1108,7 +1163,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) | S_00B84C_EXCP_EN(0); - ProgInfo.Occupancy = STM.computeOccupancy(MF, ProgInfo.LDSSize, + ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize, ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU); } @@ -1132,40 +1187,41 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv()); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { - OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); + OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1); - OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4); + OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc1); - OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); - OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4); + OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2); + OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2); - OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); + OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); + OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks)); // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = // 0" comment but I don't see a corresponding field in the register spec. 
} else { - OutStreamer->EmitIntValue(RsrcReg, 4); - OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | + OutStreamer->emitInt32(RsrcReg); + OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); - OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue( + OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE); + OutStreamer->emitIntValue( S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); } if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { - OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); - OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4); - OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4); - OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); - OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); + OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS); + OutStreamer->emitInt32( + S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks)); + OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA); + OutStreamer->emitInt32(MFI->getPSInputEnable()); + OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR); + OutStreamer->emitInt32(MFI->getPSInputAddr()); } - OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4); - OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4); - OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4); - OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4); + OutStreamer->emitInt32(R_SPILLED_SGPRS); + OutStreamer->emitInt32(MFI->getNumSpilledSGPRs()); + OutStreamer->emitInt32(R_SPILLED_VGPRS); + OutStreamer->emitInt32(MFI->getNumSpilledVGPRs()); } // This is the equivalent of EmitProgramInfoSI above, but for when the OS type @@ -1304,7 +1360,18 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, AMDGPUInstPrinter::printRegOperand(MO.getReg(), O, *MF->getSubtarget().getRegisterInfo()); return false; + } else if (MO.isImm()) { + int64_t Val = MO.getImm(); + if (AMDGPU::isInlinableIntLiteral(Val)) { + O << Val; + } else if (isUInt<16>(Val)) { + O << format("0x%" PRIx16, static_cast<uint16_t>(Val)); + } else if (isUInt<32>(Val)) { + O << format("0x%" PRIx32, static_cast<uint32_t>(Val)); + } else { + O << format("0x%" PRIx64, static_cast<uint64_t>(Val)); + } + return false; } - return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index c50c19a4609c6..54e8338ab4b04 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -121,21 +121,21 @@ public: const MachineInstr *MI); /// Implemented in AMDGPUMCInstLower.cpp - void EmitInstruction(const MachineInstr *MI) override; + void emitInstruction(const MachineInstr *MI) override; - void EmitFunctionBodyStart() override; + void emitFunctionBodyStart() override; - void EmitFunctionBodyEnd() override; + void emitFunctionBodyEnd() override; - void EmitFunctionEntryLabel() override; + void emitFunctionEntryLabel() override; - void EmitBasicBlockStart(const MachineBasicBlock &MBB) override; + void emitBasicBlockStart(const MachineBasicBlock &MBB) override; - void EmitGlobalVariable(const GlobalVariable *GV) override; + void emitGlobalVariable(const GlobalVariable *GV) override; - void EmitStartOfAsmFile(Module &M) override; + void emitStartOfAsmFile(Module &M) override; - void EmitEndOfAsmFile(Module &M) override; + void emitEndOfAsmFile(Module &M) override; bool 
isBlockOnlyReachableByFallthrough( const MachineBasicBlock *MBB) const override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 59aa0ea98aa79..c9d25d4250d55 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -438,7 +438,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, Type *const Ty = I.getType(); const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty); - Type *const VecTy = VectorType::get(B.getInt32Ty(), 2); + auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2); // This is the value in the atomic operation we need to combine in order to // reduce the number of atomic operations. @@ -447,9 +447,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // We need to know how many lanes are active within the wavefront, and we do // this by doing a ballot of active lanes. Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize()); - CallInst *const Ballot = B.CreateIntrinsic( - Intrinsic::amdgcn_icmp, {WaveTy, B.getInt32Ty()}, - {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)}); + CallInst *const Ballot = + B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue()); // We need to know how many lanes are active within the wavefront that are // below us. If we counted each lane linearly starting from 0, a lane is diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index c657ca71bfdf4..05a4e3462a263 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -16,6 +16,7 @@ #include "AMDGPU.h" #include "AMDGPUISelLowering.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" #include "SIISelLowering.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" @@ -59,6 +60,18 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { } else ExtReg = extendRegister(ValVReg, VA); + // If this is a scalar return, insert a readfirstlane just in case the value + // ends up in a VGPR. + // FIXME: Assert this is a shader return. 
+ const SIRegisterInfo *TRI + = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); + if (TRI->isSGPRReg(MRI, PhysReg)) { + auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, + {MRI.getType(ExtReg)}, false) + .addReg(ExtReg); + ExtReg = ToSGPR.getReg(0); + } + MIRBuilder.buildCopy(PhysReg, ExtReg); MIB.addUse(PhysReg, RegState::Implicit); } @@ -84,11 +97,10 @@ struct IncomingArgHandler : public CallLowering::ValueHandler { auto &MFI = MIRBuilder.getMF().getFrameInfo(); int FI = MFI.CreateFixedObject(Size, Offset, true); MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); - Register AddrReg = MRI.createGenericVirtualRegister( - LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32)); - MIRBuilder.buildFrameIndex(AddrReg, FI); + auto AddrReg = MIRBuilder.buildFrameIndex( + LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI); StackUsed = std::max(StackUsed, Size + Offset); - return AddrReg; + return AddrReg.getReg(0); } void assignValueToReg(Register ValVReg, Register PhysReg, @@ -119,9 +131,12 @@ struct IncomingArgHandler : public CallLowering::ValueHandler { void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { + MachineFunction &MF = MIRBuilder.getMF(); + // FIXME: Get alignment - auto MMO = MIRBuilder.getMF().getMachineMemOperand( - MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, 1); + auto MMO = MF.getMachineMemOperand( + MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, + inferAlignFromPtrInfo(MF, MPO)); MIRBuilder.buildLoad(ValVReg, Addr, *MMO); } @@ -150,10 +165,26 @@ AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) : CallLowering(&TLI) { } +// FIXME: Compatability shim +static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) { + switch (MIOpc) { + case TargetOpcode::G_SEXT: + return ISD::SIGN_EXTEND; + case TargetOpcode::G_ZEXT: + return ISD::ZERO_EXTEND; + case TargetOpcode::G_ANYEXT: + return ISD::ANY_EXTEND; + default: + llvm_unreachable("not an extend opcode"); + } +} + void AMDGPUCallLowering::splitToValueTypes( - const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs, - const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv, - SplitArgTy PerformArgSplit) const { + MachineIRBuilder &B, + const ArgInfo &OrigArg, unsigned OrigArgIdx, + SmallVectorImpl<ArgInfo> &SplitArgs, + const DataLayout &DL, CallingConv::ID CallConv, + SplitArgTy PerformArgSplit) const { const SITargetLowering &TLI = *getTLI<SITargetLowering>(); LLVMContext &Ctx = OrigArg.Ty->getContext(); @@ -167,28 +198,46 @@ void AMDGPUCallLowering::splitToValueTypes( int SplitIdx = 0; for (EVT VT : SplitVTs) { - unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT); + Register Reg = OrigArg.Regs[SplitIdx]; Type *Ty = VT.getTypeForEVT(Ctx); + LLT LLTy = getLLTForType(*Ty, DL); + if (OrigArgIdx == AttributeList::ReturnIndex && VT.isScalarInteger()) { + unsigned ExtendOp = TargetOpcode::G_ANYEXT; + if (OrigArg.Flags[0].isSExt()) { + assert(OrigArg.Regs.size() == 1 && "expect only simple return values"); + ExtendOp = TargetOpcode::G_SEXT; + } else if (OrigArg.Flags[0].isZExt()) { + assert(OrigArg.Regs.size() == 1 && "expect only simple return values"); + ExtendOp = TargetOpcode::G_ZEXT; + } + EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT, + extOpcodeToISDExtOpcode(ExtendOp)); + if (ExtVT != VT) { + VT = ExtVT; + Ty = ExtVT.getTypeForEVT(Ctx); + LLTy = getLLTForType(*Ty, DL); + Reg = B.buildInstr(ExtendOp, 
{LLTy}, {Reg}).getReg(0); + } + } + + unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT); + MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT); if (NumParts == 1) { // No splitting to do, but we want to replace the original type (e.g. [1 x // double] -> double). - SplitArgs.emplace_back(OrigArg.Regs[SplitIdx], Ty, - OrigArg.Flags, OrigArg.IsFixed); + SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed); ++SplitIdx; continue; } - LLT LLTy = getLLTForType(*Ty, DL); - SmallVector<Register, 8> SplitRegs; - - EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT); - Type *PartTy = PartVT.getTypeForEVT(Ctx); + Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx); LLT PartLLT = getLLTForType(*PartTy, DL); + MachineRegisterInfo &MRI = *B.getMRI(); // FIXME: Should we be reporting all of the part registers for a single // argument, and let handleAssignments take care of the repacking? @@ -198,7 +247,7 @@ void AMDGPUCallLowering::splitToValueTypes( SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags); } - PerformArgSplit(SplitRegs, LLTy, PartLLT, SplitIdx); + PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx); ++SplitIdx; } @@ -218,13 +267,11 @@ static LLT getMultipleType(LLT OrigTy, int Factor) { static void unpackRegsToOrigType(MachineIRBuilder &B, ArrayRef<Register> DstRegs, Register SrcReg, + const CallLowering::ArgInfo &Info, LLT SrcTy, LLT PartTy) { assert(DstRegs.size() > 1 && "Nothing to unpack"); - MachineFunction &MF = B.getMF(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const unsigned SrcSize = SrcTy.getSizeInBits(); const unsigned PartSize = PartTy.getSizeInBits(); @@ -248,12 +295,11 @@ static void unpackRegsToOrigType(MachineIRBuilder &B, LLT BigTy = getMultipleType(PartTy, NumRoundedParts); auto ImpDef = B.buildUndef(BigTy); - Register BigReg = MRI.createGenericVirtualRegister(BigTy); - B.buildInsert(BigReg, ImpDef.getReg(0), SrcReg, 0).getReg(0); + auto Big = B.buildInsert(BigTy, ImpDef.getReg(0), SrcReg, 0).getReg(0); int64_t Offset = 0; for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize) - B.buildExtract(DstRegs[i], BigReg, Offset); + B.buildExtract(DstRegs[i], Big, Offset); } /// Lower the return value for the already existing \p Ret. 
This assumes that @@ -267,24 +313,26 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B, auto &MF = B.getMF(); const auto &F = MF.getFunction(); const DataLayout &DL = MF.getDataLayout(); + MachineRegisterInfo *MRI = B.getMRI(); CallingConv::ID CC = F.getCallingConv(); const SITargetLowering &TLI = *getTLI<SITargetLowering>(); - MachineRegisterInfo &MRI = MF.getRegInfo(); ArgInfo OrigRetInfo(VRegs, Val->getType()); setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F); SmallVector<ArgInfo, 4> SplitRetInfos; splitToValueTypes( - OrigRetInfo, SplitRetInfos, DL, MRI, CC, - [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) { - unpackRegsToOrigType(B, Regs, VRegs[VTSplitIdx], LLTy, PartLLT); + B, OrigRetInfo, AttributeList::ReturnIndex, SplitRetInfos, DL, CC, + [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT, + int VTSplitIdx) { + unpackRegsToOrigType(B, Regs, SrcReg, + SplitRetInfos[VTSplitIdx], + LLTy, PartLLT); }); CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg()); - - OutgoingValueHandler RetHandler(B, MF.getRegInfo(), Ret, AssignFn); + OutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn); return handleAssignments(B, SplitRetInfos, RetHandler); } @@ -309,7 +357,7 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, return true; } - auto const &ST = B.getMF().getSubtarget<GCNSubtarget>(); + auto const &ST = MF.getSubtarget<GCNSubtarget>(); unsigned ReturnOpc = IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return; @@ -348,22 +396,17 @@ Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B, const DataLayout &DL = F.getParent()->getDataLayout(); PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS); LLT PtrType = getLLTForType(*PtrTy, DL); - Register DstReg = MRI.createGenericVirtualRegister(PtrType); Register KernArgSegmentPtr = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr); - Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); - B.buildConstant(OffsetReg, Offset); + auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset); - B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg); - - return DstReg; + return B.buildPtrAdd(PtrType, KernArgSegmentVReg, OffsetReg).getReg(0); } -void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, - Type *ParamTy, uint64_t Offset, - unsigned Align, +void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy, + uint64_t Offset, Align Alignment, Register DstReg) const { MachineFunction &MF = B.getMF(); const Function &F = MF.getFunction(); @@ -372,11 +415,11 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, unsigned TypeSize = DL.getTypeStoreSize(ParamTy); Register PtrReg = lowerParameterPtr(B, ParamTy, Offset); - MachineMemOperand *MMO = - MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant, - TypeSize, Align); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + TypeSize, Alignment); B.buildLoad(DstReg, PtrReg, *MMO); } @@ -389,19 +432,19 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, SIMachineFunctionInfo &Info) { // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
if (Info.hasPrivateSegmentBuffer()) { - unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); + Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass); CCInfo.AllocateReg(PrivateSegmentBufferReg); } if (Info.hasDispatchPtr()) { - unsigned DispatchPtrReg = Info.addDispatchPtr(TRI); + Register DispatchPtrReg = Info.addDispatchPtr(TRI); MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchPtrReg); } if (Info.hasQueuePtr()) { - unsigned QueuePtrReg = Info.addQueuePtr(TRI); + Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); } @@ -418,13 +461,13 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, } if (Info.hasDispatchID()) { - unsigned DispatchIDReg = Info.addDispatchID(TRI); + Register DispatchIDReg = Info.addDispatchID(TRI); MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchIDReg); } if (Info.hasFlatScratchInit()) { - unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI); + Register FlatScratchInitReg = Info.addFlatScratchInit(TRI); MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); } @@ -451,7 +494,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info); unsigned i = 0; - const unsigned KernArgBaseAlign = 16; + const Align KernArgBaseAlign(16); const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F); uint64_t ExplicitArgOffset = 0; @@ -462,19 +505,24 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( if (AllocSize == 0) continue; - unsigned ABIAlign = DL.getABITypeAlignment(ArgTy); + Align ABIAlign = DL.getABITypeAlign(ArgTy); uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset; ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize; + if (Arg.use_empty()) { + ++i; + continue; + } + ArrayRef<Register> OrigArgRegs = VRegs[i]; Register ArgReg = OrigArgRegs.size() == 1 ? OrigArgRegs[0] : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL)); - unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset); - ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy)); - lowerParameter(B, ArgTy, ArgOffset, Align, ArgReg); + + Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset); + lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg); if (OrigArgRegs.size() > 1) unpackRegs(OrigArgRegs, ArgReg, ArgTy, B); ++i; @@ -485,38 +533,72 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( return true; } +/// Pack values \p SrcRegs to cover the vector type result \p DstRegs. +static MachineInstrBuilder mergeVectorRegsToResultRegs( + MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) { + MachineRegisterInfo &MRI = *B.getMRI(); + LLT LLTy = MRI.getType(DstRegs[0]); + LLT PartLLT = MRI.getType(SrcRegs[0]); + + // Deal with v3s16 split into v2s16 + LLT LCMTy = getLCMType(LLTy, PartLLT); + if (LCMTy == LLTy) { + // Common case where no padding is needed. + assert(DstRegs.size() == 1); + return B.buildConcatVectors(DstRegs[0], SrcRegs); + } + + const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits(); + Register Undef = B.buildUndef(PartLLT).getReg(0); + + // Build vector of undefs. + SmallVector<Register, 8> WidenedSrcs(NumWide, Undef); + + // Replace the first sources with the real registers. 
+ std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin()); + + auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs); + int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits(); + + SmallVector<Register, 8> PadDstRegs(NumDst); + std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin()); + + // Create the excess dead defs for the unmerge. + for (int I = DstRegs.size(); I != NumDst; ++I) + PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy); + + return B.buildUnmerge(PadDstRegs, Widened); +} + // TODO: Move this to generic code static void packSplitRegsToOrigType(MachineIRBuilder &B, ArrayRef<Register> OrigRegs, ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT) { - if (!LLTy.isVector() && !PartLLT.isVector()) { - B.buildMerge(OrigRegs[0], Regs); - return; - } + MachineRegisterInfo &MRI = *B.getMRI(); - if (LLTy.isVector() && PartLLT.isVector()) { - assert(LLTy.getElementType() == PartLLT.getElementType()); + if (!LLTy.isVector() && !PartLLT.isVector()) { + assert(OrigRegs.size() == 1); + LLT OrigTy = MRI.getType(OrigRegs[0]); - int DstElts = LLTy.getNumElements(); - int PartElts = PartLLT.getNumElements(); - if (DstElts % PartElts == 0) - B.buildConcatVectors(OrigRegs[0], Regs); + unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size(); + if (SrcSize == OrigTy.getSizeInBits()) + B.buildMerge(OrigRegs[0], Regs); else { - // Deal with v3s16 split into v2s16 - assert(PartElts == 2 && DstElts % 2 != 0); - int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts); - - LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType()); - auto RoundedConcat = B.buildConcatVectors(RoundedDestTy, Regs); - B.buildExtract(OrigRegs[0], RoundedConcat, 0); + auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs); + B.buildTrunc(OrigRegs[0], Widened); } return; } - MachineRegisterInfo &MRI = *B.getMRI(); + if (LLTy.isVector() && PartLLT.isVector()) { + assert(OrigRegs.size() == 1); + assert(LLTy.getElementType() == PartLLT.getElementType()); + mergeVectorRegsToResultRegs(B, OrigRegs, Regs); + return; + } assert(LLTy.isVector() && !PartLLT.isVector()); @@ -644,13 +726,16 @@ bool AMDGPUCallLowering::lowerFormalArguments( } ArgInfo OrigArg(VRegs[Idx], Arg.getType()); - setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F); + const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex; + setArgFlags(OrigArg, OrigArgIdx, DL, F); splitToValueTypes( - OrigArg, SplitArgs, DL, MRI, CC, + B, OrigArg, OrigArgIdx, SplitArgs, DL, CC, // FIXME: We should probably be passing multiple registers to // handleAssignments to do this - [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) { + [&](ArrayRef<Register> Regs, Register DstReg, + LLT LLTy, LLT PartLLT, int VTSplitIdx) { + assert(DstReg == VRegs[Idx][VTSplitIdx]); packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs, LLTy, PartLLT); }); @@ -705,11 +790,17 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (!MBB.empty()) B.setInstr(*MBB.begin()); + if (!IsEntryFunc) { + // For the fixed ABI, pass workitem IDs in the last argument register. + if (AMDGPUTargetMachine::EnableFixedFunctionABI) + TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); + } + FormalArgHandler Handler(B, MRI, AssignFn); if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler)) return false; - if (!IsEntryFunc) { + if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) { // Special inputs come after user arguments. 
TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); } @@ -719,8 +810,6 @@ bool AMDGPUCallLowering::lowerFormalArguments( TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); - CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); - CCInfo.AllocateReg(Info->getFrameOffsetReg()); TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h index 53a562586bc06..446619d1502ee 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -27,14 +27,16 @@ class AMDGPUCallLowering: public CallLowering { uint64_t Offset) const; void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset, - unsigned Align, Register DstReg) const; + Align Alignment, Register DstReg) const; /// A function of this type is used to perform value split action. - using SplitArgTy = std::function<void(ArrayRef<Register>, LLT, LLT, int)>; + using SplitArgTy = std::function<void(ArrayRef<Register>, Register, LLT, LLT, int)>; - void splitToValueTypes(const ArgInfo &OrigArgInfo, + void splitToValueTypes(MachineIRBuilder &B, + const ArgInfo &OrigArgInfo, + unsigned OrigArgIdx, SmallVectorImpl<ArgInfo> &SplitArgs, - const DataLayout &DL, MachineRegisterInfo &MRI, + const DataLayout &DL, CallingConv::ID CallConv, SplitArgTy SplitArg) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index f8a54a61aac22..7c83b6dcb44b9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -18,7 +18,7 @@ class CCIfExtend<CCAction A> // Calling convention for SI def CC_SI : CallingConv<[ - CCIfInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[ + CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, @@ -28,7 +28,7 @@ def CC_SI : CallingConv<[ ]>>>, // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. - CCIfNotInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[ + CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -50,7 +50,7 @@ def CC_SI : CallingConv<[ ]>; def RetCC_SI_Shader : CallingConv<[ - CCIfType<[i32] , CCAssignToReg<[ + CCIfType<[i32, i16] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, @@ -89,6 +89,24 @@ def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs< (sequence "VGPR%u", 32, 255) >; +def CSR_AMDGPU_VGPRs : CalleeSavedRegs< + // The CSRs & scratch-registers are interleaved at a split boundary of 8. 
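The CSR_AMDGPU_VGPRs list above encodes blocks of eight callee-saved VGPRs alternating with blocks of eight scratch VGPRs, starting at VGPR40. A minimal host-side sketch of that pattern follows; the predicate name is illustrative only, not an LLVM API:

#include <cassert>

// Callee-saved under the interleaved scheme above: the CSR blocks are
// VGPR40-47, 56-63, ..., 248-255, i.e. every other block of eight from 40 up.
static bool isInterleavedCalleeSavedVGPR(unsigned N) {
  return N >= 40 && N <= 255 && (N / 8) % 2 == 1;
}

int main() {
  assert(isInterleavedCalleeSavedVGPR(40));   // first CSR block
  assert(!isInterleavedCalleeSavedVGPR(48));  // scratch block in between
  assert(isInterleavedCalleeSavedVGPR(63));   // end of the second CSR block
  assert(!isInterleavedCalleeSavedVGPR(32));  // below the CSR range
  assert(isInterleavedCalleeSavedVGPR(255));  // last CSR
  return 0;
}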
+ (add (sequence "VGPR%u", 40, 47), + (sequence "VGPR%u", 56, 63), + (sequence "VGPR%u", 72, 79), + (sequence "VGPR%u", 88, 95), + (sequence "VGPR%u", 104, 111), + (sequence "VGPR%u", 120, 127), + (sequence "VGPR%u", 136, 143), + (sequence "VGPR%u", 152, 159), + (sequence "VGPR%u", 168, 175), + (sequence "VGPR%u", 184, 191), + (sequence "VGPR%u", 200, 207), + (sequence "VGPR%u", 216, 223), + (sequence "VGPR%u", 232, 239), + (sequence "VGPR%u", 248, 255)) +>; + def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs< (sequence "SGPR%u", 32, 105) >; @@ -104,7 +122,7 @@ def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs< >; def CSR_AMDGPU_HighRegs : CalleeSavedRegs< - (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_105) + (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105) >; // Calling convention for leaf functions diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index cf908766caa0d..a795493017402 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -15,8 +15,10 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" @@ -26,6 +28,7 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" @@ -41,6 +44,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Transforms/Utils/IntegerDivision.h" #include <cassert> #include <iterator> @@ -54,7 +58,7 @@ static cl::opt<bool> WidenLoads( "amdgpu-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"), cl::ReallyHidden, - cl::init(true)); + cl::init(false)); static cl::opt<bool> UseMul24Intrin( "amdgpu-codegenprepare-mul24", @@ -62,10 +66,26 @@ static cl::opt<bool> UseMul24Intrin( cl::ReallyHidden, cl::init(true)); +// Legalize 64-bit division by using the generic IR expansion. +static cl::opt<bool> ExpandDiv64InIR( + "amdgpu-codegenprepare-expand-div64", + cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, + cl::init(false)); + +// Leave all division operations as they are. This supersedes ExpandDiv64InIR +// and is used for testing the legalizer. +static cl::opt<bool> DisableIDivExpand( + "amdgpu-codegenprepare-disable-idiv-expansion", + cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, + cl::init(false)); + class AMDGPUCodeGenPrepare : public FunctionPass, public InstVisitor<AMDGPUCodeGenPrepare, bool> { const GCNSubtarget *ST = nullptr; AssumptionCache *AC = nullptr; + DominatorTree *DT = nullptr; LegacyDivergenceAnalysis *DA = nullptr; Module *Mod = nullptr; const DataLayout *DL = nullptr; @@ -152,15 +172,33 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// SelectionDAG has an issue where an and asserting the bits are known bool replaceMulWithMul24(BinaryOperator &I) const; + /// Perform same function as equivalently named function in DAGCombiner. Since + /// we expand some divisions here, we need to perform this before obscuring. 
+ bool foldBinOpIntoSelect(BinaryOperator &I) const; + + bool divHasSpecialOptimization(BinaryOperator &I, + Value *Num, Value *Den) const; + int getDivNumBits(BinaryOperator &I, + Value *Num, Value *Den, + unsigned AtLeast, bool Signed) const; + /// Expands 24 bit div or rem. Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den, bool IsDiv, bool IsSigned) const; + Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I, + Value *Num, Value *Den, unsigned NumBits, + bool IsDiv, bool IsSigned) const; + /// Expands 32 bit div or rem. Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den) const; + Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I, + Value *Num, Value *Den) const; + void expandDivRem64(BinaryOperator &I) const; + /// Widen a scalar load. /// /// \details \p Widen scalar load for uniform, small type loads from constant @@ -195,7 +233,10 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<LegacyDivergenceAnalysis>(); - AU.setPreservesAll(); + + // FIXME: Division expansion needs to preserve the dominator tree. + if (!ExpandDiv64InIR) + AU.setPreservesAll(); } }; @@ -214,7 +255,7 @@ Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const { if (T->isIntegerTy()) return B.getInt32Ty(); - return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements()); + return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T)); } bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const { @@ -276,10 +317,9 @@ bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const { Type *Ty = I.getType(); const DataLayout &DL = Mod->getDataLayout(); int TySize = DL.getTypeSizeInBits(Ty); - unsigned Align = I.getAlignment() ? - I.getAlignment() : DL.getABITypeAlignment(Ty); + Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty); - return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I); + return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I); } bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { @@ -436,7 +476,7 @@ bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const { static void extractValues(IRBuilder<> &Builder, SmallVectorImpl<Value *> &Values, Value *V) { - VectorType *VT = dyn_cast<VectorType>(V->getType()); + auto *VT = dyn_cast<FixedVectorType>(V->getType()); if (!VT) { Values.push_back(V); return; @@ -525,58 +565,218 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const { return true; } -static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) { - const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); - if (!CNum) - return HasDenormals; +// Find a select instruction, which may have been casted. This is mostly to deal +// with cases where i16 selects were promoted here to i32. +static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) { + Cast = nullptr; + if (SelectInst *Sel = dyn_cast<SelectInst>(V)) + return Sel; - if (UnsafeDiv) - return true; + if ((Cast = dyn_cast<CastInst>(V))) { + if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0))) + return Sel; + } + + return nullptr; +} + +bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const { + // Don't do this unless the old select is going away. We want to eliminate the + // binary operator, not replace a binop with a select. 
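foldBinOpIntoSelect above rewrites a binary operator whose only non-constant operand is a single-use select of two constants (possibly behind a cast) into a select of the two constant-folded results. A small standalone sketch of the value-level rewrite, with plain integers standing in for IR values; the function names are illustrative only:

#include <cassert>
#include <cstdint>

// Original shape: a constant divided by a select of two constants.
static uint32_t beforeFold(bool Cond) {
  uint32_t Den = Cond ? 4u : 8u;  // select i1 %c, i32 4, i32 8
  return 1024u / Den;             // udiv i32 1024, %sel
}

// Folded shape: the division is constant-folded into each arm, so only the
// select remains and the divide disappears.
static uint32_t afterFold(bool Cond) {
  return Cond ? 256u : 128u;      // select i1 %c, i32 256, i32 128
}

int main() {
  assert(beforeFold(true) == afterFold(true));
  assert(beforeFold(false) == afterFold(false));
  return 0;
}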
+ int SelOpNo = 0; + + CastInst *CastOp; + + // TODO: Should probably try to handle some cases with multiple + // users. Duplicating the select may be profitable for division. + SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp); + if (!Sel || !Sel->hasOneUse()) { + SelOpNo = 1; + Sel = findSelectThroughCast(BO.getOperand(1), CastOp); + } + + if (!Sel || !Sel->hasOneUse()) + return false; + + Constant *CT = dyn_cast<Constant>(Sel->getTrueValue()); + Constant *CF = dyn_cast<Constant>(Sel->getFalseValue()); + Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1)); + if (!CBO || !CT || !CF) + return false; + + if (CastOp) { + if (!CastOp->hasOneUse()) + return false; + CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL); + CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL); + } + + // TODO: Handle special 0/-1 cases DAG combine does, although we only really + // need to handle divisions here. + Constant *FoldedT = SelOpNo ? + ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) : + ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL); + if (isa<ConstantExpr>(FoldedT)) + return false; + + Constant *FoldedF = SelOpNo ? + ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) : + ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL); + if (isa<ConstantExpr>(FoldedF)) + return false; + + IRBuilder<> Builder(&BO); + Builder.SetCurrentDebugLocation(BO.getDebugLoc()); + if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO)) + Builder.setFastMathFlags(FPOp->getFastMathFlags()); + + Value *NewSelect = Builder.CreateSelect(Sel->getCondition(), + FoldedT, FoldedF); + NewSelect->takeName(&BO); + BO.replaceAllUsesWith(NewSelect); + BO.eraseFromParent(); + if (CastOp) + CastOp->eraseFromParent(); + Sel->eraseFromParent(); + return true; +} + +// Optimize fdiv with rcp: +// +// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is +// allowed with unsafe-fp-math or afn. +// +// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn. +static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp, + bool RcpIsAccurate, IRBuilder<> &Builder, + Module *Mod) { + + if (!AllowInaccurateRcp && !RcpIsAccurate) + return nullptr; + + Type *Ty = Den->getType(); + if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) { + if (AllowInaccurateRcp || RcpIsAccurate) { + if (CLHS->isExactlyValue(1.0)) { + Function *Decl = Intrinsic::getDeclaration( + Mod, Intrinsic::amdgcn_rcp, Ty); + + // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to + // the CI documentation has a worst case error of 1 ulp. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to + // use it as long as we aren't trying to use denormals. + // + // v_rcp_f16 and v_rsq_f16 DO support denormals. + + // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't + // insert rsq intrinsic here. + + // 1.0 / x -> rcp(x) + return Builder.CreateCall(Decl, { Den }); + } + + // Same as for 1.0, but expand the sign out of the constant. 
+ if (CLHS->isExactlyValue(-1.0)) { + Function *Decl = Intrinsic::getDeclaration( + Mod, Intrinsic::amdgcn_rcp, Ty); + + // -1.0 / x -> rcp (fneg x) + Value *FNeg = Builder.CreateFNeg(Den); + return Builder.CreateCall(Decl, { FNeg }); + } + } + } - bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0); + if (AllowInaccurateRcp) { + Function *Decl = Intrinsic::getDeclaration( + Mod, Intrinsic::amdgcn_rcp, Ty); - // Reciprocal f32 is handled separately without denormals. - return HasDenormals ^ IsOne; + // Turn into multiply by the reciprocal. + // x / y -> x * (1.0 / y) + Value *Recip = Builder.CreateCall(Decl, { Den }); + return Builder.CreateFMul(Num, Recip); + } + return nullptr; +} + +// optimize with fdiv.fast: +// +// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. +// +// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. +// +// NOTE: optimizeWithRcp should be tried first because rcp is the preference. +static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy, + bool HasDenormals, IRBuilder<> &Builder, + Module *Mod) { + // fdiv.fast can achieve 2.5 ULP accuracy. + if (ReqdAccuracy < 2.5f) + return nullptr; + + // Only have fdiv.fast for f32. + Type *Ty = Den->getType(); + if (!Ty->isFloatTy()) + return nullptr; + + bool NumIsOne = false; + if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) { + if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0)) + NumIsOne = true; + } + + // fdiv does not support denormals. But 1.0/x is always fine to use it. + if (HasDenormals && !NumIsOne) + return nullptr; + + Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); + return Builder.CreateCall(Decl, { Num, Den }); } -// Insert an intrinsic for fast fdiv for safe math situations where we can -// reduce precision. Leave fdiv for situations where the generic node is -// expected to be optimized. +// Optimizations is performed based on fpmath, fast math flags as well as +// denormals to optimize fdiv with either rcp or fdiv.fast. +// +// With rcp: +// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is +// allowed with unsafe-fp-math or afn. +// +// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn. +// +// With fdiv.fast: +// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. +// +// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. +// +// NOTE: rcp is the preference in cases that both are legal. bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { - Type *Ty = FDiv.getType(); - if (!Ty->getScalarType()->isFloatTy()) - return false; + Type *Ty = FDiv.getType()->getScalarType(); - MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); - if (!FPMath) + // No intrinsic for fdiv16 if target does not support f16. + if (Ty->isHalfTy() && !ST->has16BitInsts()) return false; const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv); - float ULP = FPOp->getFPAccuracy(); - if (ULP < 2.5f) - return false; + const float ReqdAccuracy = FPOp->getFPAccuracy(); + // Inaccurate rcp is allowed with unsafe-fp-math or afn. FastMathFlags FMF = FPOp->getFastMathFlags(); - bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() || - FMF.allowReciprocal(); + const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc(); - // With UnsafeDiv node will be optimized to just rcp and mul. - if (UnsafeDiv) - return false; + // rcp_f16 is accurate for !fpmath >= 1.0ulp. + // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. 
+ // rcp_f64 is never accurate. + const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) || + (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f); - IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); + IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); Builder.setFastMathFlags(FMF); Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); - Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); - Value *Num = FDiv.getOperand(0); Value *Den = FDiv.getOperand(1); Value *NewFDiv = nullptr; - - if (VectorType *VT = dyn_cast<VectorType>(Ty)) { + if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) { NewFDiv = UndefValue::get(VT); // FIXME: Doesn't do the right thing for cases where the vector is partially @@ -584,19 +784,25 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { Value *NumEltI = Builder.CreateExtractElement(Num, I); Value *DenEltI = Builder.CreateExtractElement(Den, I); - Value *NewElt; - - if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) { + // Try rcp first. + Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp, + RcpIsAccurate, Builder, Mod); + if (!NewElt) // Try fdiv.fast. + NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy, + HasFP32Denormals, Builder, Mod); + if (!NewElt) // Keep the original. NewElt = Builder.CreateFDiv(NumEltI, DenEltI); - } else { - NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); - } NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); } - } else { - if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals)) - NewFDiv = Builder.CreateCall(Decl, { Num, Den }); + } else { // Scalar FDiv. + // Try rcp first. + NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate, + Builder, Mod); + if (!NewFDiv) { // Try fdiv.fast. + NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals, + Builder, Mod); + } } if (NewFDiv) { @@ -631,31 +837,49 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) { return getMul64(Builder, LHS, RHS).second; } -// The fractional part of a float is enough to accurately represent up to -// a 24-bit signed integer. -Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, - BinaryOperator &I, - Value *Num, Value *Den, - bool IsDiv, bool IsSigned) const { - assert(Num->getType()->isIntegerTy(32)); - +/// Figure out how many bits are really needed for this ddivision. \p AtLeast is +/// an optimization hint to bypass the second ComputeNumSignBits call if we the +/// first one is insufficient. Returns -1 on failure. +int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I, + Value *Num, Value *Den, + unsigned AtLeast, bool IsSigned) const { const DataLayout &DL = Mod->getDataLayout(); unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I); - if (LHSSignBits < 9) - return nullptr; + if (LHSSignBits < AtLeast) + return -1; unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I); - if (RHSSignBits < 9) - return nullptr; - + if (RHSSignBits < AtLeast) + return -1; unsigned SignBits = std::min(LHSSignBits, RHSSignBits); - unsigned DivBits = 32 - SignBits; + unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits; if (IsSigned) ++DivBits; + return DivBits; +} - Type *Ty = Num->getType(); +// The fractional part of a float is enough to accurately represent up to +// a 24-bit signed integer. 
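A short standalone check of the property that comment relies on: a float carries 24 significand bits, so every integer of magnitude up to 2^24 survives a round trip through float, and 2^24 + 1 is the first value that does not. Illustrative sketch only:

#include <cassert>
#include <cstdint>

int main() {
  // Integers with |v| <= 2^24 fit in a float's 24-bit significand exactly.
  for (int64_t V = (1 << 24) - 16; V <= (1 << 24); ++V)
    assert(static_cast<int64_t>(static_cast<float>(V)) == V);

  // 2^24 + 1 needs 25 significand bits and rounds back to 2^24.
  const int64_t TooBig = (1 << 24) + 1;
  assert(static_cast<int64_t>(static_cast<float>(TooBig)) != TooBig);
  return 0;
}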
+Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, + BinaryOperator &I, + Value *Num, Value *Den, + bool IsDiv, bool IsSigned) const { + int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned); + if (DivBits == -1) + return nullptr; + return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned); +} + +Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder, + BinaryOperator &I, + Value *Num, Value *Den, + unsigned DivBits, + bool IsDiv, bool IsSigned) const { Type *I32Ty = Builder.getInt32Ty(); + Num = Builder.CreateTrunc(Num, I32Ty); + Den = Builder.CreateTrunc(Den, I32Ty); + Type *F32Ty = Builder.getFloatTy(); ConstantInt *One = Builder.getInt32(1); Value *JQ = One; @@ -685,7 +909,9 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty) : Builder.CreateUIToFP(IB,F32Ty); - Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB); + Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, + Builder.getFloatTy()); + Value *RCP = Builder.CreateCall(RcpDecl, { FB }); Value *FQM = Builder.CreateFMul(FA, RCP); // fq = trunc(fqm); @@ -696,7 +922,10 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, Value *FQNeg = Builder.CreateFNeg(FQ); // float fr = mad(fqneg, fb, fa); - Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz, + auto FMAD = !ST->hasMadMacF32Insts() + ? Intrinsic::fma + : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz; + Value *FR = Builder.CreateIntrinsic(FMAD, {FQNeg->getType()}, {FQNeg, FB, FA}, FQ); // int iq = (int)fq; @@ -725,21 +954,72 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, Res = Builder.CreateSub(Num, Rem); } - // Truncate to number of bits this divide really is. - if (IsSigned) { - Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits)); - Res = Builder.CreateSExt(Res, Ty); - } else { - ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1); - Res = Builder.CreateAnd(Res, TruncMask); + if (DivBits != 0 && DivBits < 32) { + // Extend in register from the number of bits this divide really is. + if (IsSigned) { + int InRegBits = 32 - DivBits; + + Res = Builder.CreateShl(Res, InRegBits); + Res = Builder.CreateAShr(Res, InRegBits); + } else { + ConstantInt *TruncMask + = Builder.getInt32((UINT64_C(1) << DivBits) - 1); + Res = Builder.CreateAnd(Res, TruncMask); + } } return Res; } -Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder, - BinaryOperator &I, - Value *Num, Value *Den) const { +// Try to recognize special cases the DAG will emit special, better expansions +// than the general expansion we do here. + +// TODO: It would be better to just directly handle those optimizations here. +bool AMDGPUCodeGenPrepare::divHasSpecialOptimization( + BinaryOperator &I, Value *Num, Value *Den) const { + if (Constant *C = dyn_cast<Constant>(Den)) { + // Arbitrary constants get a better expansion as long as a wider mulhi is + // legal. + if (C->getType()->getScalarSizeInBits() <= 32) + return true; + + // TODO: Sdiv check for not exact for some reason. + + // If there's no wider mulhi, there's only a better expansion for powers of + // two. + // TODO: Should really know for each vector element. 
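The better expansion mentioned above for power-of-two divisors is a plain shift, which is why such divisors are recognized here and kept for the later lowering instead of being expanded. A host-side sketch of that equivalence; not LLVM code:

#include <cassert>
#include <cstdint>

int main() {
  // Unsigned division by 2^k is a logical shift right by k.
  for (uint32_t X : {0u, 1u, 5u, 1000u, 0xFFFFFFFFu})
    for (unsigned K = 0; K < 32; ++K)
      assert(X / (1ull << K) == (X >> K));
  return 0;
}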
+ if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT)) + return true; + + return false; + } + + if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) { + // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 + if (BinOpDen->getOpcode() == Instruction::Shl && + isa<Constant>(BinOpDen->getOperand(0)) && + isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true, + 0, AC, &I, DT)) { + return true; + } + } + + return false; +} + +static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) { + // Check whether the sign can be determined statically. + KnownBits Known = computeKnownBits(V, *DL); + if (Known.isNegative()) + return Constant::getAllOnesValue(V->getType()); + if (Known.isNonNegative()) + return Constant::getNullValue(V->getType()); + return Builder.CreateAShr(V, Builder.getInt32(31)); +} + +Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder, + BinaryOperator &I, Value *X, + Value *Y) const { Instruction::BinaryOps Opc = I.getOpcode(); assert(Opc == Instruction::URem || Opc == Instruction::UDiv || Opc == Instruction::SRem || Opc == Instruction::SDiv); @@ -748,142 +1028,171 @@ Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder, FMF.setFast(); Builder.setFastMathFlags(FMF); - if (isa<Constant>(Den)) - return nullptr; // Keep it for optimization + if (divHasSpecialOptimization(I, X, Y)) + return nullptr; // Keep it for later optimization. bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv; bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv; - Type *Ty = Num->getType(); + Type *Ty = X->getType(); Type *I32Ty = Builder.getInt32Ty(); Type *F32Ty = Builder.getFloatTy(); if (Ty->getScalarSizeInBits() < 32) { if (IsSigned) { - Num = Builder.CreateSExt(Num, I32Ty); - Den = Builder.CreateSExt(Den, I32Ty); + X = Builder.CreateSExt(X, I32Ty); + Y = Builder.CreateSExt(Y, I32Ty); } else { - Num = Builder.CreateZExt(Num, I32Ty); - Den = Builder.CreateZExt(Den, I32Ty); + X = Builder.CreateZExt(X, I32Ty); + Y = Builder.CreateZExt(Y, I32Ty); } } - if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) { - Res = Builder.CreateTrunc(Res, Ty); - return Res; + if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) { + return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) : + Builder.CreateZExtOrTrunc(Res, Ty); } ConstantInt *Zero = Builder.getInt32(0); ConstantInt *One = Builder.getInt32(1); - ConstantInt *MinusOne = Builder.getInt32(~0); Value *Sign = nullptr; if (IsSigned) { - ConstantInt *K31 = Builder.getInt32(31); - Value *LHSign = Builder.CreateAShr(Num, K31); - Value *RHSign = Builder.CreateAShr(Den, K31); + Value *SignX = getSign32(X, Builder, DL); + Value *SignY = getSign32(Y, Builder, DL); // Remainder sign is the same as LHS - Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign; + Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX; - Num = Builder.CreateAdd(Num, LHSign); - Den = Builder.CreateAdd(Den, RHSign); + X = Builder.CreateAdd(X, SignX); + Y = Builder.CreateAdd(Y, SignY); - Num = Builder.CreateXor(Num, LHSign); - Den = Builder.CreateXor(Den, RHSign); + X = Builder.CreateXor(X, SignX); + Y = Builder.CreateXor(Y, SignY); } - // RCP = URECIP(Den) = 2^32 / Den + e - // e is rounding error. 
- Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty); - Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32); - Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000)); - Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1); - Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty); - - // RCP_LO, RCP_HI = mul(RCP, Den) */ - Value *RCP_LO, *RCP_HI; - std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den); - - // NEG_RCP_LO = -RCP_LO - Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO); - - // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) - Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero); - Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO); - - // Calculate the rounding error from the URECIP instruction - // E = mulhu(ABS_RCP_LO, RCP) - Value *E = getMulHu(Builder, ABS_RCP_LO, RCP); - - // RCP_A_E = RCP + E - Value *RCP_A_E = Builder.CreateAdd(RCP, E); - - // RCP_S_E = RCP - E - Value *RCP_S_E = Builder.CreateSub(RCP, E); - - // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) - Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E); - - // Quotient = mulhu(Tmp0, Num) - Value *Quotient = getMulHu(Builder, Tmp0, Num); - - // Num_S_Remainder = Quotient * Den - Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den); + // The algorithm here is based on ideas from "Software Integer Division", Tom + // Rodeheffer, August 2008. + // + // unsigned udiv(unsigned x, unsigned y) { + // // Initial estimate of inv(y). The constant is less than 2^32 to ensure + // // that this is a lower bound on inv(y), even if some of the calculations + // // round up. + // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y)); + // + // // One round of UNR (Unsigned integer Newton-Raphson) to improve z. + // // Empirically this is guaranteed to give a "two-y" lower bound on + // // inv(y). + // z += umulh(z, -y * z); + // + // // Quotient/remainder estimate. + // unsigned q = umulh(x, z); + // unsigned r = x - q * y; + // + // // Two rounds of quotient/remainder refinement. + // if (r >= y) { + // ++q; + // r -= y; + // } + // if (r >= y) { + // ++q; + // r -= y; + // } + // + // return q; + // } + + // Initial estimate of inv(y). + Value *FloatY = Builder.CreateUIToFP(Y, F32Ty); + Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty); + Value *RcpY = Builder.CreateCall(Rcp, {FloatY}); + Constant *Scale = ConstantFP::get(F32Ty, BitsToFloat(0x4F7FFFFE)); + Value *ScaledY = Builder.CreateFMul(RcpY, Scale); + Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty); + + // One round of UNR. + Value *NegY = Builder.CreateSub(Zero, Y); + Value *NegYZ = Builder.CreateMul(NegY, Z); + Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ)); + + // Quotient/remainder estimate. + Value *Q = getMulHu(Builder, X, Z); + Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y)); + + // First quotient/remainder refinement. + Value *Cond = Builder.CreateICmpUGE(R, Y); + if (IsDiv) + Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q); + R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R); + + // Second quotient/remainder refinement. 
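For reference, a self-contained version of the scheme sketched in the pseudocode comment above; an exact 1.0f / y stands in for v_rcp_f32 (whose worst-case error the 2^32 - 512 scale factor is chosen to absorb) and a 64-bit multiply stands in for umulh. This is an illustrative host-side sketch, not the generated code:

#include <cassert>
#include <cstdint>

static uint32_t umulh(uint32_t A, uint32_t B) {
  return static_cast<uint32_t>((static_cast<uint64_t>(A) * B) >> 32);
}

// Unsigned 32-bit division following the expansion above; requires Y != 0.
static uint32_t udiv32(uint32_t X, uint32_t Y) {
  // Initial estimate of 2^32 / Y, scaled so it is a lower bound even with a
  // slightly inaccurate reciprocal.
  uint32_t Z = static_cast<uint32_t>((4294967296.0 - 512.0) *
                                     (1.0f / static_cast<float>(Y)));
  // One Newton-Raphson round to improve the estimate.
  Z += umulh(Z, (0u - Y) * Z);
  // Quotient/remainder estimate, then two refinement rounds.
  uint32_t Q = umulh(X, Z);
  uint32_t R = X - Q * Y;
  if (R >= Y) { ++Q; R -= Y; }
  if (R >= Y) { ++Q; R -= Y; }
  return Q;
}

int main() {
  for (uint32_t X : {0u, 1u, 7u, 100u, 12345678u, 0xFFFFFFFFu})
    for (uint32_t Y : {1u, 2u, 3u, 7u, 1000u, 0x80000000u, 0xFFFFFFFFu})
      assert(udiv32(X, Y) == X / Y);
  return 0;
}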
+ Cond = Builder.CreateICmpUGE(R, Y); + Value *Res; + if (IsDiv) + Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q); + else + Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R); - // Remainder = Num - Num_S_Remainder - Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder); + if (IsSigned) { + Res = Builder.CreateXor(Res, Sign); + Res = Builder.CreateSub(Res, Sign); + } - // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) - Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den); - Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero); + Res = Builder.CreateTrunc(Res, Ty); - // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) - Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder); - Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, - MinusOne, Zero); + return Res; +} - // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero - Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero); - Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero); +Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder, + BinaryOperator &I, + Value *Num, Value *Den) const { + if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den)) + return nullptr; // Keep it for later optimization. - Value *Res; - if (IsDiv) { - // Quotient_A_One = Quotient + 1 - Value *Quotient_A_One = Builder.CreateAdd(Quotient, One); + Instruction::BinaryOps Opc = I.getOpcode(); - // Quotient_S_One = Quotient - 1 - Value *Quotient_S_One = Builder.CreateSub(Quotient, One); + bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv; + bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem; - // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) - Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One); + int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned); + if (NumDivBits == -1) + return nullptr; - // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) - Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One); - } else { - // Remainder_S_Den = Remainder - Den - Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den); + Value *Narrowed = nullptr; + if (NumDivBits <= 24) { + Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits, + IsDiv, IsSigned); + } else if (NumDivBits <= 32) { + Narrowed = expandDivRem32(Builder, I, Num, Den); + } - // Remainder_A_Den = Remainder + Den - Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den); + if (Narrowed) { + return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) : + Builder.CreateZExt(Narrowed, Num->getType()); + } - // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) - Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den); + return nullptr; +} - // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem) - Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den); +void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const { + Instruction::BinaryOps Opc = I.getOpcode(); + // Do the general expansion. 
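shrinkDivRem64 above rests on the observation that when the sign bits show both 64-bit operands fit in a narrower type, the division can be done at 32 (or 24) bits and the result extended. A small host-side illustration of that equivalence for the signed 32-bit case; not LLVM code:

#include <cassert>
#include <cstdint>

int main() {
  // When both i64 operands are in i32 range, a 32-bit sdiv plus sign
  // extension matches the full 64-bit sdiv. (The INT32_MIN / -1 corner case
  // is left out of the sample values.)
  const int64_t Values[] = {-2147483647LL, -100000, -7, -1, 1, 3, 99, 2147483647LL};
  for (int64_t Num : Values)
    for (int64_t Den : Values) {
      int64_t Wide = Num / Den;
      int64_t Narrow = static_cast<int64_t>(static_cast<int32_t>(Num) /
                                            static_cast<int32_t>(Den));
      assert(Wide == Narrow);
    }
  return 0;
}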
+ if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) { + expandDivisionUpTo64Bits(&I); + return; } - if (IsSigned) { - Res = Builder.CreateXor(Res, Sign); - Res = Builder.CreateSub(Res, Sign); + if (Opc == Instruction::URem || Opc == Instruction::SRem) { + expandRemainderUpTo64Bits(&I); + return; } - Res = Builder.CreateTrunc(Res, Ty); - - return Res; + llvm_unreachable("not a division"); } bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { + if (foldBinOpIntoSelect(I)) + return true; + if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && DA->isUniform(&I) && promoteUniformOpToI32(I)) return true; @@ -895,27 +1204,54 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { Instruction::BinaryOps Opc = I.getOpcode(); Type *Ty = I.getType(); Value *NewDiv = nullptr; + unsigned ScalarSize = Ty->getScalarSizeInBits(); + + SmallVector<BinaryOperator *, 8> Div64ToExpand; + if ((Opc == Instruction::URem || Opc == Instruction::UDiv || Opc == Instruction::SRem || Opc == Instruction::SDiv) && - Ty->getScalarSizeInBits() <= 32) { + ScalarSize <= 64 && + !DisableIDivExpand) { Value *Num = I.getOperand(0); Value *Den = I.getOperand(1); IRBuilder<> Builder(&I); Builder.SetCurrentDebugLocation(I.getDebugLoc()); - if (VectorType *VT = dyn_cast<VectorType>(Ty)) { + if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { NewDiv = UndefValue::get(VT); for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) { Value *NumEltN = Builder.CreateExtractElement(Num, N); Value *DenEltN = Builder.CreateExtractElement(Den, N); - Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN); - if (!NewElt) - NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); + + Value *NewElt; + if (ScalarSize <= 32) { + NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN); + if (!NewElt) + NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); + } else { + // See if this 64-bit division can be shrunk to 32/24-bits before + // producing the general expansion. + NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN); + if (!NewElt) { + // The general 64-bit expansion introduces control flow and doesn't + // return the new value. Just insert a scalar copy and defer + // expanding it. + NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); + Div64ToExpand.push_back(cast<BinaryOperator>(NewElt)); + } + } + NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N); } } else { - NewDiv = expandDivRem32(Builder, I, Num, Den); + if (ScalarSize <= 32) + NewDiv = expandDivRem32(Builder, I, Num, Den); + else { + NewDiv = shrinkDivRem64(Builder, I, Num, Den); + if (!NewDiv) + Div64ToExpand.push_back(&I); + } } if (NewDiv) { @@ -925,6 +1261,14 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { } } + if (ExpandDiv64InIR) { + // TODO: We get much worse code in specially handled constant cases. + for (BinaryOperator *Div : Div64ToExpand) { + expandDivRem64(*Div); + Changed = true; + } + } + return Changed; } @@ -1033,16 +1377,36 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { ST = &TM.getSubtarget<GCNSubtarget>(F); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); DA = &getAnalysis<LegacyDivergenceAnalysis>(); + + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? 
&DTWP->getDomTree() : nullptr; + HasUnsafeFPMath = hasUnsafeFPMath(F); - HasFP32Denormals = ST->hasFP32Denormals(F); + + AMDGPU::SIModeRegisterDefaults Mode(F); + HasFP32Denormals = Mode.allFP32Denormals(); bool MadeChange = false; - for (BasicBlock &BB : F) { + Function::iterator NextBB; + for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) { + BasicBlock *BB = &*FI; + NextBB = std::next(FI); + BasicBlock::iterator Next; - for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) { Next = std::next(I); + MadeChange |= visit(*I); + + if (Next != E) { // Control flow changed + BasicBlock *NextInstBB = Next->getParent(); + if (NextInstBB != BB) { + BB = NextInstBB; + E = BB->end(); + FE = F.end(); + } + } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td new file mode 100644 index 0000000000000..faaf9168d0dd8 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -0,0 +1,69 @@ +//=- AMDGPUCombine.td - Define AMDGPU Combine Rules ----------*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +include "llvm/Target/GlobalISel/Combine.td" + +// TODO: This really belongs after legalization after scalarization. +// TODO: GICombineRules should accept subtarget predicates + +def fmin_fmax_legacy_matchdata : GIDefMatchData<"FMinFMaxLegacyInfo">; + +def fcmp_select_to_fmin_fmax_legacy : GICombineRule< + (defs root:$select, fmin_fmax_legacy_matchdata:$matchinfo), + (match (wip_match_opcode G_SELECT):$select, + [{ return matchFMinFMaxLegacy(*${select}, MRI, *MF, ${matchinfo}); }]), + (apply [{ applySelectFCmpToFMinToFMaxLegacy(*${select}, ${matchinfo}); }])>; + + +def uchar_to_float : GICombineRule< + (defs root:$itofp), + (match (wip_match_opcode G_UITOFP, G_SITOFP):$itofp, + [{ return matchUCharToFloat(*${itofp}, MRI, *MF, Helper); }]), + (apply [{ applyUCharToFloat(*${itofp}); }])>; + +def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">; + +def cvt_f32_ubyteN : GICombineRule< + (defs root:$cvt_f32_ubyteN, cvt_f32_ubyteN_matchdata:$matchinfo), + (match (wip_match_opcode G_AMDGPU_CVT_F32_UBYTE0, + G_AMDGPU_CVT_F32_UBYTE1, + G_AMDGPU_CVT_F32_UBYTE2, + G_AMDGPU_CVT_F32_UBYTE3):$cvt_f32_ubyteN, + [{ return matchCvtF32UByteN(*${cvt_f32_ubyteN}, MRI, *MF, ${matchinfo}); }]), + (apply [{ applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>; + +// Combines which should only apply on SI/VI +def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; + + +def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper< + "AMDGPUGenPreLegalizerCombinerHelper", [all_combines, + elide_br_by_inverting_cond]> { + let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule"; +} + + +// FIXME: combines_for_extload can introduce illegal extloads which +// aren't re-legalized. +// FIXME: Is there a way to remove a single item from all_combines? 
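+// Until that is possible, the group below simply lists the wanted generic
+// combines by name; any new upstream combine has to be added here explicitly
+// for the post-legalizer combiner to pick it up.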
+def all_combines_minus_extload : GICombineGroup<[trivial_combines, + ptr_add_immed_chain, combine_indexed_load_store, undef_combines, + identity_combines] +>; + +def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< + "AMDGPUGenPostLegalizerCombinerHelper", + [all_combines_minus_extload, gfx6gfx7_combines, + uchar_to_float, cvt_f32_ubyteN]> { + let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule"; +} + +def AMDGPURegBankCombinerHelper : GICombinerHelper< + "AMDGPUGenRegBankCombinerHelper", []> { + let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp new file mode 100644 index 0000000000000..25c82ed61fc2e --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp @@ -0,0 +1,150 @@ +//===--- AMDGPUExportClusting.cpp - AMDGPU Export Clustering -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains a DAG scheduling mutation to cluster shader +/// exports. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUExportClustering.h" +#include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" + +using namespace llvm; + +namespace { + +class ExportClustering : public ScheduleDAGMutation { +public: + ExportClustering() {} + void apply(ScheduleDAGInstrs *DAG) override; +}; + +static bool isExport(const SUnit &SU) { + const MachineInstr *MI = SU.getInstr(); + return MI->getOpcode() == AMDGPU::EXP || + MI->getOpcode() == AMDGPU::EXP_DONE; +} + +static bool isPositionExport(const SIInstrInfo *TII, SUnit *SU) { + const MachineInstr *MI = SU->getInstr(); + int Imm = TII->getNamedOperand(*MI, AMDGPU::OpName::tgt)->getImm(); + return Imm >= 12 && Imm <= 15; +} + +static void sortChain(const SIInstrInfo *TII, SmallVector<SUnit *, 8> &Chain, + unsigned PosCount) { + if (!PosCount || PosCount == Chain.size()) + return; + + // Position exports should occur as soon as possible in the shader + // for optimal performance. This moves position exports before + // other exports while preserving the order within different export + // types (pos or other). + SmallVector<SUnit *, 8> Copy(Chain); + unsigned PosIdx = 0; + unsigned OtherIdx = PosCount; + for (SUnit *SU : Copy) { + if (isPositionExport(TII, SU)) + Chain[PosIdx++] = SU; + else + Chain[OtherIdx++] = SU; + } +} + +static void buildCluster(ArrayRef<SUnit *> Exports, ScheduleDAGInstrs *DAG) { + SUnit *ChainHead = Exports.front(); + + // Now construct cluster from chain by adding new edges. + for (unsigned Idx = 0, End = Exports.size() - 1; Idx < End; ++Idx) { + SUnit *SUa = Exports[Idx]; + SUnit *SUb = Exports[Idx + 1]; + + // Copy all dependencies to the head of the chain to avoid any + // computation being inserted into the chain. 
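+    // Predecessors that are themselves exports, or that are only weak edges,
+    // are skipped; everything else gets an artificial edge to the chain head.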
+ for (const SDep &Pred : SUb->Preds) { + SUnit *PredSU = Pred.getSUnit(); + if (!isExport(*PredSU) && !Pred.isWeak()) + DAG->addEdge(ChainHead, SDep(PredSU, SDep::Artificial)); + } + + // New barrier edge ordering exports + DAG->addEdge(SUb, SDep(SUa, SDep::Barrier)); + // Also add cluster edge + DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)); + } +} + +static void removeExportDependencies(ScheduleDAGInstrs *DAG, SUnit &SU) { + SmallVector<SDep, 2> ToAdd, ToRemove; + + for (const SDep &Pred : SU.Preds) { + SUnit *PredSU = Pred.getSUnit(); + if (Pred.isBarrier() && isExport(*PredSU)) { + ToRemove.push_back(Pred); + if (isExport(SU)) + continue; + + // If we remove a barrier we need to copy dependencies + // from the predecessor to maintain order. + for (const SDep &ExportPred : PredSU->Preds) { + SUnit *ExportPredSU = ExportPred.getSUnit(); + if (ExportPred.isBarrier() && !isExport(*ExportPredSU)) + ToAdd.push_back(SDep(ExportPredSU, SDep::Barrier)); + } + } + } + + for (SDep Pred : ToRemove) + SU.removePred(Pred); + for (SDep Pred : ToAdd) + DAG->addEdge(&SU, Pred); +} + +void ExportClustering::apply(ScheduleDAGInstrs *DAG) { + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII); + + SmallVector<SUnit *, 8> Chain; + + // Pass through DAG gathering a list of exports and removing barrier edges + // creating dependencies on exports. Freeing exports of successor edges + // allows more scheduling freedom, and nothing should be order dependent + // on exports. Edges will be added later to order the exports. + unsigned PosCount = 0; + for (SUnit &SU : DAG->SUnits) { + if (!isExport(SU)) + continue; + + Chain.push_back(&SU); + if (isPositionExport(TII, &SU)) + PosCount++; + + removeExportDependencies(DAG, SU); + + SmallVector<SDep, 4> Succs(SU.Succs); + for (SDep Succ : Succs) + removeExportDependencies(DAG, *Succ.getSUnit()); + } + + // Apply clustering if there are multiple exports + if (Chain.size() > 1) { + sortChain(TII, Chain, PosCount); + buildCluster(Chain, DAG); + } +} + +} // end namespace + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> createAMDGPUExportClusteringDAGMutation() { + return std::make_unique<ExportClustering>(); +} + +} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h new file mode 100644 index 0000000000000..58491d0671e4c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h @@ -0,0 +1,15 @@ +//===- AMDGPUExportClustering.h - AMDGPU Export Clustering ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> createAMDGPUExportClusteringDAGMutation(); + +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td index ea3952c316e4d..db00f8f711a33 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -18,15 +18,6 @@ def FeatureFMA : SubtargetFeature<"fmaf", "Enable single precision FMA (not as fast as mul+add, but fused)" >; -// Some instructions do not support denormals despite this flag. 
Using -// fp32 denormals also causes instructions to run at the double -// precision rate for the device. -def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", - "FP32Denormals", - "true", - "Enable single precision denormal handling" ->; - class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< "localmemorysize"#Value, "LocalMemorySize", @@ -38,16 +29,16 @@ def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; -class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature< - "wavefrontsize"#Value, - "WavefrontSize", - !cast<string>(Value), +class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature< + "wavefrontsize"#!shl(1, ValueLog2), + "WavefrontSizeLog2", + !cast<string>(ValueLog2), "The number of threads per wavefront" >; -def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; -def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; -def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; +def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<4>; +def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<5>; +def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<6>; class SubtargetFeatureGeneration <string Value, string FeatureName, string Subtarget, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp index 9ba04d113c702..ea6c6d0fd212b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/Pass.h" #include "llvm/Transforms/Utils/CallPromotionUtils.h" using namespace llvm; @@ -31,12 +32,13 @@ class AMDGPUFixFunctionBitcasts final bool Modified; public: - void visitCallSite(CallSite CS) { - if (CS.getCalledFunction()) + void visitCallBase(CallBase &CB) { + if (CB.getCalledFunction()) return; - auto Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); - if (Callee && isLegalToPromote(CS, Callee)) { - promoteCall(CS, Callee); + auto *Callee = + dyn_cast<Function>(CB.getCalledOperand()->stripPointerCasts()); + if (Callee && isLegalToPromote(CB, Callee)) { + promoteCall(CB, Callee); Modified = true; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 92e256cf2829f..260a18e278cf2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -26,7 +26,7 @@ namespace llvm { class AMDGPUFrameLowering : public TargetFrameLowering { public: AMDGPUFrameLowering(StackDirection D, Align StackAl, int LAO, - Align TransAl = Align::None()); + Align TransAl = Align(1)); ~AMDGPUFrameLowering() override; /// \returns The number of 32-bit sub-registers that are used when storing diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index d420aa02ac28b..3f12addbcc79b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// include "AMDGPU.td" +include "AMDGPUCombine.td" def sd_vsrc0 : ComplexPattern<i32, 1, "">; def gi_vsrc0 : @@ -30,6 +31,10 @@ def gi_vop3mods : GIComplexOperandMatcher<s32, "selectVOP3Mods">, GIComplexPatternEquiv<VOP3Mods>; +def 
gi_vop3_no_mods : + GIComplexOperandMatcher<s32, "selectVOP3NoMods">, + GIComplexPatternEquiv<VOP3NoMods>; + def gi_vop3mods_nnan : GIComplexOperandMatcher<s32, "selectVOP3Mods_nnan">, GIComplexPatternEquiv<VOP3Mods_nnan>; @@ -38,9 +43,9 @@ def gi_vop3omods : GIComplexOperandMatcher<s32, "selectVOP3OMods">, GIComplexPatternEquiv<VOP3OMods>; -def gi_vop3opselmods0 : - GIComplexOperandMatcher<s32, "selectVOP3OpSelMods0">, - GIComplexPatternEquiv<VOP3OpSelMods0>; +def gi_vop3pmods : + GIComplexOperandMatcher<s32, "selectVOP3PMods">, + GIComplexPatternEquiv<VOP3PMods>; def gi_vop3opselmods : GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">, @@ -83,6 +88,33 @@ def gi_ds_1addr_1offset : GIComplexOperandMatcher<s32, "selectDS1Addr1Offset">, GIComplexPatternEquiv<DS1Addr1Offset>; +def gi_ds_64bit_4byte_aligned : + GIComplexOperandMatcher<s64, "selectDS64Bit4ByteAligned">, + GIComplexPatternEquiv<DS64Bit4ByteAligned>; + +def gi_mubuf_addr64 : + GIComplexOperandMatcher<s64, "selectMUBUFAddr64">, + GIComplexPatternEquiv<MUBUFAddr64>; + +def gi_mubuf_offset : + GIComplexOperandMatcher<s64, "selectMUBUFOffset">, + GIComplexPatternEquiv<MUBUFOffset>; + +def gi_mubuf_addr64_atomic : + GIComplexOperandMatcher<s64, "selectMUBUFAddr64Atomic">, + GIComplexPatternEquiv<MUBUFAddr64Atomic>; + +def gi_mubuf_offset_atomic : + GIComplexOperandMatcher<s64, "selectMUBUFOffsetAtomic">, + GIComplexPatternEquiv<MUBUFOffsetAtomic>; + +def gi_smrd_buffer_imm : + GIComplexOperandMatcher<s64, "selectSMRDBufferImm">, + GIComplexPatternEquiv<SMRDBufferImm>; + +def gi_smrd_buffer_imm32 : + GIComplexOperandMatcher<s64, "selectSMRDBufferImm32">, + GIComplexPatternEquiv<SMRDBufferImm32>; // Separate load nodes are defined to glue m0 initialization in // SelectionDAG. The GISel selector can just insert m0 initialization @@ -116,9 +148,54 @@ def : GINodeEquiv<G_ATOMICRMW_UMIN, atomic_load_umin_glue>; def : GINodeEquiv<G_ATOMICRMW_UMAX, atomic_load_umax_glue>; def : GINodeEquiv<G_ATOMICRMW_FADD, atomic_load_fadd_glue>; -def : GINodeEquiv<G_AMDGPU_FFBH_U32, AMDGPUffbh_u32>; -def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>; +def : GINodeEquiv<G_AMDGPU_FFBH_U32, AMDGPUffbh_u32_impl>; +def : GINodeEquiv<G_AMDGPU_FMIN_LEGACY, AMDGPUfmin_legacy>; +def : GINodeEquiv<G_AMDGPU_FMAX_LEGACY, AMDGPUfmax_legacy>; +def : GINodeEquiv<G_AMDGPU_RCP_IFLAG, AMDGPUrcp_iflag>; +def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE0, AMDGPUcvt_f32_ubyte0>; +def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE1, AMDGPUcvt_f32_ubyte1>; +def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>; +def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>; + +def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>; +def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>; +def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>; +def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_UBYTE, SIbuffer_load_ubyte>; +def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SSHORT, SIbuffer_load_short>; +def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SBYTE, SIbuffer_load_byte>; +def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT, SIbuffer_load_format>; +def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT_D16, SIbuffer_load_format_d16>; +def : GINodeEquiv<G_AMDGPU_TBUFFER_LOAD_FORMAT, SItbuffer_load>; +def : GINodeEquiv<G_AMDGPU_TBUFFER_LOAD_FORMAT_D16, SItbuffer_load_d16>; +def : GINodeEquiv<G_AMDGPU_BUFFER_STORE, SIbuffer_store>; +def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_SHORT, SIbuffer_store_short>; +def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_BYTE, SIbuffer_store_byte>; +def : 
GINodeEquiv<G_AMDGPU_BUFFER_STORE_FORMAT, SIbuffer_store_format>; +def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_FORMAT_D16, SIbuffer_store_format_d16>; +def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT, SItbuffer_store>; +def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>; + +// FIXME: Check MMO is atomic +def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, SIatomic_inc>; +def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, SIatomic_dec>; +def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, atomic_inc_glue>; +def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, atomic_dec_glue>; + +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SWAP, SIbuffer_atomic_swap>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_ADD, SIbuffer_atomic_add>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SUB, SIbuffer_atomic_sub>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SMIN, SIbuffer_atomic_smin>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_UMIN, SIbuffer_atomic_umin>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SMAX, SIbuffer_atomic_smax>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_UMAX, SIbuffer_atomic_umax>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_AND, SIbuffer_atomic_and>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_OR, SIbuffer_atomic_or>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>; +def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>; class GISelSop2Pat < SDPatternOperator node, @@ -188,16 +265,13 @@ multiclass GISelVop2IntrPat < def : GISelVop2Pat <node, inst, dst_vt, src_vt>; - // FIXME: Intrinsics aren't marked as commutable, so we need to add an explcit + // FIXME: Intrinsics aren't marked as commutable, so we need to add an explicit // pattern to handle commuting. This is another reason why legalizing to a // generic machine instruction may be better that matching the intrinsic // directly. def : GISelVop2CommutePat <node, inst, dst_vt, src_vt>; } -def : GISelSop2Pat <or, S_OR_B32, i32>; -def : GISelVop2Pat <or, V_OR_B32_e32, i32>; - // Since GlobalISel is more flexible then SelectionDAG, I think we can get // away with adding patterns for integer types and not legalizing all // loads and stores to vector types. 
This should help simplify the load/store @@ -206,12 +280,18 @@ foreach Ty = [i64, p0, p1, p4] in { defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>; } -def gi_as_i32timm : GICustomOperandRenderer<"renderTruncImm32">, +def gi_as_i32timm : GICustomOperandRenderer<"renderTruncTImm32">, GISDNodeXFormEquiv<as_i32timm>; -def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm">, +def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm16">, GISDNodeXFormEquiv<as_i16timm>; +def gi_as_i8timm : GICustomOperandRenderer<"renderTruncTImm8">, + GISDNodeXFormEquiv<as_i8timm>; + +def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm1">, + GISDNodeXFormEquiv<as_i1timm>; + def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">, GISDNodeXFormEquiv<NegateImm>; @@ -220,3 +300,15 @@ def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastImm">, def gi_IMMPopCount : GICustomOperandRenderer<"renderPopcntImm">, GISDNodeXFormEquiv<IMMPopCount>; + +def gi_extract_glc : GICustomOperandRenderer<"renderExtractGLC">, + GISDNodeXFormEquiv<extract_glc>; + +def gi_extract_slc : GICustomOperandRenderer<"renderExtractSLC">, + GISDNodeXFormEquiv<extract_slc>; + +def gi_extract_dlc : GICustomOperandRenderer<"renderExtractDLC">, + GISDNodeXFormEquiv<extract_dlc>; + +def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">, + GISDNodeXFormEquiv<extract_swz>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def index 2e92ae51660b7..600b351f9ea1c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -132,7 +132,8 @@ const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] { }; -// For some instructions which can operate 64-bit only for the scalar version. +// For some instructions which can operate 64-bit only for the scalar +// version. Otherwise, these need to be split into 2 32-bit operations. 
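+// Both getValueMappingSGPR64Only and getValueMappingSplit64 below return
+// pointers into this table, picking the entry that matches the register bank
+// and whether the value stays 64-bit or is broken into two 32-bit halves.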
const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] { /*32-bit sgpr*/ {&SGPROnly64BreakDown[0], 1}, /*2 x 32-bit sgpr*/ {&SGPROnly64BreakDown[1], 2}, @@ -207,75 +208,16 @@ const RegisterBankInfo::ValueMapping *getValueMappingSGPR64Only(unsigned BankID, return &ValMappingsSGPR64OnlyVGPR32[2]; } -const RegisterBankInfo::PartialMapping LoadSGPROnlyBreakDown[] { - /* 256-bit load */ {0, 256, SGPRRegBank}, - /* 512-bit load */ {0, 512, SGPRRegBank}, - /* 8 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank}, - {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank}, - {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank}, - {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank}, - /* 16 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank}, - {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank}, - {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank}, - {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank}, - {256, 32, VGPRRegBank}, {288, 32, VGPRRegBank}, - {320, 32, VGPRRegBank}, {352, 32, VGPRRegBank}, - {384, 32, VGPRRegBank}, {416, 32, VGPRRegBank}, - {448, 32, VGPRRegBank}, {480, 32, VGPRRegBank}, - /* 4 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank}, - {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank}, - /* 8 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank}, - {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank}, - {256, 64, VGPRRegBank}, {320, 64, VGPRRegBank}, - {384, 64, VGPRRegBank}, {448, 64, VGPRRegBank}, - - /* FIXME: The generic register bank select does not support complex - * break downs where the number of vector elements does not equal the - * number of breakdowns. - * FIXME: register bank select now tries to handle complex break downs, - * but it emits an illegal instruction: - * %1:vgpr(<8 x s32>) = G_CONCAT_VECTORS %2:vgpr(s128), %3:vgpr(s128) - */ - /* 2 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank}, - /* 4 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank}, - {256, 128, VGPRRegBank}, {384, 128, VGPRRegBank} -}; - -const RegisterBankInfo::ValueMapping ValMappingsLoadSGPROnly[] { - /* 256-bit load */ {&LoadSGPROnlyBreakDown[0], 1}, - /* 512-bit load */ {&LoadSGPROnlyBreakDown[1], 1}, - /* <8 x i32> load */ {&LoadSGPROnlyBreakDown[2], 8}, - /* <16 x i32> load */ {&LoadSGPROnlyBreakDown[10], 16}, - /* <4 x i64> load */ {&LoadSGPROnlyBreakDown[26], 4}, - /* <8 x i64> load */ {&LoadSGPROnlyBreakDown[30], 8} -}; - -const RegisterBankInfo::ValueMapping * -getValueMappingLoadSGPROnly(unsigned BankID, LLT SizeTy) { - unsigned Size = SizeTy.getSizeInBits(); - if (Size < 256 || BankID == AMDGPU::SGPRRegBankID) - return getValueMapping(BankID, Size); - - assert((Size == 256 || Size == 512) && BankID == AMDGPU::VGPRRegBankID); - - // Default to using the non-split ValueMappings, we will use these if - // the register bank is SGPR or if we don't know how to handle the vector - // type. - unsigned Idx = Size == 256 ? 0 : 1; - - // We need to split this load if it has a vgpr pointer. - if (BankID == AMDGPU::VGPRRegBankID) { - if (SizeTy == LLT::vector(8, 32)) - Idx = 2; - else if (SizeTy == LLT::vector(16, 32)) - Idx = 3; - else if (SizeTy == LLT::vector(4, 64)) - Idx = 4; - else if (SizeTy == LLT::vector(8, 64)) - Idx = 5; - } +/// Split any 64-bit value into 2 32-bit pieces. Unlike +/// getValueMappingSGPR64Only, this splits both VGPRs and SGPRs. 
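+/// Concretely, this returns entry 4 of ValMappingsSGPR64OnlyVGPR32 (two
+/// 32-bit VGPRs) for the VGPR bank and entry 1 (two 32-bit SGPRs) for the
+/// SGPR bank.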
+const RegisterBankInfo::ValueMapping *getValueMappingSplit64(unsigned BankID, + unsigned Size) { + assert(Size == 64); + if (BankID == AMDGPU::VGPRRegBankID) + return &ValMappingsSGPR64OnlyVGPR32[4]; - return &ValMappingsLoadSGPROnly[Idx]; + assert(BankID == AMDGPU::SGPRRegBankID); + return &ValMappingsSGPR64OnlyVGPR32[1]; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp index 16d7f2c4f9e59..989937a597fb2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -43,3 +43,12 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { return std::make_tuple(Reg, 0, Def); } + +bool AMDGPU::isLegalVOP3PShuffleMask(ArrayRef<int> Mask) { + assert(Mask.size() == 2); + + // If one half is undef, the other is trivially in the same reg. + if (Mask[0] == -1 || Mask[1] == -1) + return true; + return (Mask[0] & 2) == (Mask[1] & 2); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h index 1507ade795479..766750758efc2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H +#include "AMDGPUInstrInfo.h" #include "llvm/CodeGen/Register.h" #include <tuple> @@ -23,6 +24,38 @@ namespace AMDGPU { std::tuple<Register, unsigned, MachineInstr *> getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg); +bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask); + +/// Return number of address arguments, and the number of gradients for an image +/// intrinsic. +inline std::pair<int, int> +getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr, + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode) { + const AMDGPU::MIMGDimInfo *DimInfo + = AMDGPU::getMIMGDimInfo(ImageDimIntr->Dim); + + int NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0; + int NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0; + int NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0; + int NumVAddr = BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM; + return {NumVAddr, NumGradients}; +} + +/// Return index of dmask in an gMIR image intrinsic +inline int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode, + int NumDefs) { + assert(!BaseOpcode->Atomic); + return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0); +} + +/// Return first address operand index in a gMIR image intrinsic. +inline int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode, + int NumDefs) { + if (BaseOpcode->Atomic) + return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 
2 : 1); + return getDMaskIdx(BaseOpcode, NumDefs) + 1; +} + } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 511d62943189d..c6f6a3b84e367 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -47,7 +47,7 @@ void MetadataStreamerV2::verify(StringRef HSAMetadataString) const { errs() << "AMDGPU HSA Metadata Parser Test: "; HSAMD::Metadata FromHSAMetadataString; - if (fromString(HSAMetadataString, FromHSAMetadataString)) { + if (fromString(std::string(HSAMetadataString), FromHSAMetadataString)) { errs() << "FAIL\n"; return; } @@ -127,38 +127,6 @@ ValueKind MetadataStreamerV2::getValueKind(Type *Ty, StringRef TypeQual, ValueKind::ByValue); } -ValueType MetadataStreamerV2::getValueType(Type *Ty, StringRef TypeName) const { - switch (Ty->getTypeID()) { - case Type::IntegerTyID: { - auto Signed = !TypeName.startswith("u"); - switch (Ty->getIntegerBitWidth()) { - case 8: - return Signed ? ValueType::I8 : ValueType::U8; - case 16: - return Signed ? ValueType::I16 : ValueType::U16; - case 32: - return Signed ? ValueType::I32 : ValueType::U32; - case 64: - return Signed ? ValueType::I64 : ValueType::U64; - default: - return ValueType::Struct; - } - } - case Type::HalfTyID: - return ValueType::F16; - case Type::FloatTyID: - return ValueType::F32; - case Type::DoubleTyID: - return ValueType::F64; - case Type::PointerTyID: - return getValueType(Ty->getPointerElementType(), TypeName); - case Type::VectorTyID: - return getValueType(Ty->getVectorElementType(), TypeName); - default: - return ValueType::Struct; - } -} - std::string MetadataStreamerV2::getTypeName(Type *Ty, bool Signed) const { switch (Ty->getTypeID()) { case Type::IntegerTyID: { @@ -185,10 +153,10 @@ std::string MetadataStreamerV2::getTypeName(Type *Ty, bool Signed) const { return "float"; case Type::DoubleTyID: return "double"; - case Type::VectorTyID: { - auto VecTy = cast<VectorType>(Ty); + case Type::FixedVectorTyID: { + auto VecTy = cast<FixedVectorType>(Ty); auto ElTy = VecTy->getElementType(); - auto NumElements = VecTy->getVectorNumElements(); + auto NumElements = VecTy->getNumElements(); return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str(); } default: @@ -259,7 +227,8 @@ void MetadataStreamerV2::emitPrintf(const Module &Mod) { for (auto Op : Node->operands()) if (Op->getNumOperands()) - Printf.push_back(cast<MDString>(Op->getOperand(0))->getString()); + Printf.push_back( + std::string(cast<MDString>(Op->getOperand(0))->getString())); } void MetadataStreamerV2::emitKernelLanguage(const Function &Func) { @@ -345,12 +314,11 @@ void MetadataStreamerV2::emitKernelArg(const Argument &Arg) { Type *Ty = Arg.getType(); const DataLayout &DL = Func->getParent()->getDataLayout(); - unsigned PointeeAlign = 0; + MaybeAlign PointeeAlign; if (auto PtrTy = dyn_cast<PointerType>(Ty)) { if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - PointeeAlign = Arg.getParamAlignment(); - if (PointeeAlign == 0) - PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType()); + PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(), + PtrTy->getElementType()); } } @@ -360,20 +328,19 @@ void MetadataStreamerV2::emitKernelArg(const Argument &Arg) { void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind, - unsigned PointeeAlign, StringRef Name, + MaybeAlign PointeeAlign, StringRef Name, StringRef TypeName, StringRef BaseTypeName, StringRef 
AccQual, StringRef TypeQual) { HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata()); auto &Arg = HSAMetadata.mKernels.back().mArgs.back(); - Arg.mName = Name; - Arg.mTypeName = TypeName; + Arg.mName = std::string(Name); + Arg.mTypeName = std::string(TypeName); Arg.mSize = DL.getTypeAllocSize(Ty); - Arg.mAlign = DL.getABITypeAlignment(Ty); + Arg.mAlign = DL.getABITypeAlign(Ty).value(); Arg.mValueKind = ValueKind; - Arg.mValueType = getValueType(Ty, BaseTypeName); - Arg.mPointeeAlign = PointeeAlign; + Arg.mPointeeAlign = PointeeAlign ? PointeeAlign->value() : 0; if (auto PtrTy = dyn_cast<PointerType>(Ty)) Arg.mAddrSpaceQual = getAddressSpaceQualifier(PtrTy->getAddressSpace()); @@ -479,7 +446,7 @@ void MetadataStreamerV2::emitKernel(const MachineFunction &MF, HSAMetadata.mKernels.push_back(Kernel::Metadata()); auto &Kernel = HSAMetadata.mKernels.back(); - Kernel.mName = Func.getName(); + Kernel.mName = std::string(Func.getName()); Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str(); emitKernelLanguage(Func); emitKernelAttrs(Func); @@ -573,38 +540,6 @@ StringRef MetadataStreamerV3::getValueKind(Type *Ty, StringRef TypeQual, : "by_value"); } -StringRef MetadataStreamerV3::getValueType(Type *Ty, StringRef TypeName) const { - switch (Ty->getTypeID()) { - case Type::IntegerTyID: { - auto Signed = !TypeName.startswith("u"); - switch (Ty->getIntegerBitWidth()) { - case 8: - return Signed ? "i8" : "u8"; - case 16: - return Signed ? "i16" : "u16"; - case 32: - return Signed ? "i32" : "u32"; - case 64: - return Signed ? "i64" : "u64"; - default: - return "struct"; - } - } - case Type::HalfTyID: - return "f16"; - case Type::FloatTyID: - return "f32"; - case Type::DoubleTyID: - return "f64"; - case Type::PointerTyID: - return getValueType(Ty->getPointerElementType(), TypeName); - case Type::VectorTyID: - return getValueType(Ty->getVectorElementType(), TypeName); - default: - return "struct"; - } -} - std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const { switch (Ty->getTypeID()) { case Type::IntegerTyID: { @@ -631,10 +566,10 @@ std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const { return "float"; case Type::DoubleTyID: return "double"; - case Type::VectorTyID: { - auto VecTy = cast<VectorType>(Ty); + case Type::FixedVectorTyID: { + auto VecTy = cast<FixedVectorType>(Ty); auto ElTy = VecTy->getElementType(); - auto NumElements = VecTy->getVectorNumElements(); + auto NumElements = VecTy->getNumElements(); return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str(); } default: @@ -767,12 +702,11 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset, Type *Ty = Arg.getType(); const DataLayout &DL = Func->getParent()->getDataLayout(); - unsigned PointeeAlign = 0; + MaybeAlign PointeeAlign; if (auto PtrTy = dyn_cast<PointerType>(Ty)) { if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - PointeeAlign = Arg.getParamAlignment(); - if (PointeeAlign == 0) - PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType()); + PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(), + PtrTy->getElementType()); } } @@ -785,7 +719,7 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset, void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind, unsigned &Offset, msgpack::ArrayDocNode Args, - unsigned PointeeAlign, StringRef Name, + MaybeAlign PointeeAlign, StringRef Name, StringRef TypeName, StringRef BaseTypeName, StringRef AccQual, 
StringRef TypeQual) { @@ -796,16 +730,14 @@ void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty, if (!TypeName.empty()) Arg[".type_name"] = Arg.getDocument()->getNode(TypeName, /*Copy=*/true); auto Size = DL.getTypeAllocSize(Ty); - auto Align = DL.getABITypeAlignment(Ty); + Align Alignment = DL.getABITypeAlign(Ty); Arg[".size"] = Arg.getDocument()->getNode(Size); - Offset = alignTo(Offset, Align); + Offset = alignTo(Offset, Alignment); Arg[".offset"] = Arg.getDocument()->getNode(Offset); Offset += Size; Arg[".value_kind"] = Arg.getDocument()->getNode(ValueKind, /*Copy=*/true); - Arg[".value_type"] = - Arg.getDocument()->getNode(getValueType(Ty, BaseTypeName), /*Copy=*/true); if (PointeeAlign) - Arg[".pointee_align"] = Arg.getDocument()->getNode(PointeeAlign); + Arg[".pointee_align"] = Arg.getDocument()->getNode(PointeeAlign->value()); if (auto PtrTy = dyn_cast<PointerType>(Ty)) if (auto Qualifier = getAddressSpaceQualifier(PtrTy->getAddressSpace())) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 80ac8ca67bcd1..9534fffd228d3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -20,6 +20,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/MsgPackDocument.h" #include "llvm/Support/AMDGPUMetadata.h" +#include "llvm/Support/Alignment.h" namespace llvm { @@ -27,6 +28,7 @@ class AMDGPUTargetStreamer; class Argument; class DataLayout; class Function; +class MachineFunction; class MDNode; class Module; struct SIProgramInfo; @@ -65,8 +67,6 @@ private: StringRef getValueKind(Type *Ty, StringRef TypeQual, StringRef BaseTypeName) const; - StringRef getValueType(Type *Ty, StringRef TypeName) const; - std::string getTypeName(Type *Ty, bool Signed) const; msgpack::ArrayDocNode getWorkGroupDimensions(MDNode *Node) const; @@ -89,7 +89,7 @@ private: void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind, unsigned &Offset, msgpack::ArrayDocNode Args, - unsigned PointeeAlign = 0, StringRef Name = "", + MaybeAlign PointeeAlign = None, StringRef Name = "", StringRef TypeName = "", StringRef BaseTypeName = "", StringRef AccQual = "", StringRef TypeQual = ""); @@ -133,8 +133,6 @@ private: ValueKind getValueKind(Type *Ty, StringRef TypeQual, StringRef BaseTypeName) const; - ValueType getValueType(Type *Ty, StringRef TypeName) const; - std::string getTypeName(Type *Ty, bool Signed) const; std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const; @@ -159,10 +157,9 @@ private: void emitKernelArg(const Argument &Arg); void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind, - unsigned PointeeAlign = 0, - StringRef Name = "", StringRef TypeName = "", - StringRef BaseTypeName = "", StringRef AccQual = "", - StringRef TypeQual = ""); + MaybeAlign PointeeAlign = None, StringRef Name = "", + StringRef TypeName = "", StringRef BaseTypeName = "", + StringRef AccQual = "", StringRef TypeQual = ""); void emitHiddenKernelArgs(const Function &Func); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 2b6308dc1549e..aaf448346b533 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -16,7 +16,6 @@ #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPUInstrInfo.h" #include "AMDGPUPerfHintAnalysis.h" -#include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include 
"AMDGPUTargetMachine.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -29,6 +28,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -252,7 +252,6 @@ private: bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3Mods_f32(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src) const; @@ -265,16 +264,10 @@ private: SDValue &Clamp, SDValue &Omod) const; bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods, - SDValue &Clamp) const; bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods, - SDValue &Clamp) const; bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods, - SDValue &Clamp) const; bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const; bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; @@ -286,7 +279,6 @@ private: void SelectAddcSubb(SDNode *N); void SelectUADDO_USUBO(SDNode *N); void SelectDIV_SCALE(SDNode *N); - void SelectDIV_FMAS(SDNode *N); void SelectMAD_64_32(SDNode *N); void SelectFMA_W_CHAIN(SDNode *N); void SelectFMUL_W_CHAIN(SDNode *N); @@ -301,6 +293,7 @@ private: void SelectATOMIC_CMP_SWAP(SDNode *N); void SelectDSAppendConsume(SDNode *N, unsigned IntrID); void SelectDS_GWS(SDNode *N, unsigned IntrID); + void SelectInterpP1F16(SDNode *N); void SelectINTRINSIC_W_CHAIN(SDNode *N); void SelectINTRINSIC_WO_CHAIN(SDNode *N); void SelectINTRINSIC_VOID(SDNode *N); @@ -409,7 +402,7 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { } #endif Subtarget = &MF.getSubtarget<GCNSubtarget>(); - Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction(), *Subtarget); + Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction()); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -655,29 +648,6 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops); } -static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { - switch (NumVectorElts) { - case 1: - return AMDGPU::SReg_32RegClassID; - case 2: - return AMDGPU::SReg_64RegClassID; - case 3: - return AMDGPU::SGPR_96RegClassID; - case 4: - return AMDGPU::SGPR_128RegClassID; - case 5: - return AMDGPU::SGPR_160RegClassID; - case 8: - return AMDGPU::SReg_256RegClassID; - case 16: - return AMDGPU::SReg_512RegClassID; - case 32: - return AMDGPU::SReg_1024RegClassID; - } - - llvm_unreachable("invalid vector size"); -} - void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); @@ -698,6 +668,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { // 1 = Vector Register Class SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); + bool IsGCN = 
CurDAG->getSubtarget().getTargetTriple().getArch() == + Triple::amdgcn; RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); bool IsRegSeq = true; unsigned NOps = N->getNumOperands(); @@ -707,7 +679,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { IsRegSeq = false; break; } - unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i); + unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i) + : R600RegisterInfo::getSubRegFromChannel(i); RegSeqArgs[1 + (2 * i)] = N->getOperand(i); RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32); } @@ -717,7 +690,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, EltVT); for (unsigned i = NOps; i < NumVectorElts; ++i) { - unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i); + unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i) + : R600RegisterInfo::getSubRegFromChannel(i); RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32); @@ -742,7 +716,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC || Opc == ISD::ATOMIC_LOAD_FADD || Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || - Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) { + Opc == AMDGPUISD::ATOMIC_LOAD_FMAX || + Opc == AMDGPUISD::ATOMIC_LOAD_CSUB)) { N = glueCopyToM0LDSInit(N); SelectCode(N); return; @@ -801,7 +776,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { } assert(VT.getVectorElementType().bitsEq(MVT::i32)); - unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts); + unsigned RegClassID = + SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID(); SelectBuildVector(N, RegClassID); return; } @@ -874,10 +850,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectDIV_SCALE(N); return; } - case AMDGPUISD::DIV_FMAS: { - SelectDIV_FMAS(N); - return; - } case AMDGPUISD::MAD_I64_I32: case AMDGPUISD::MAD_U64_U32: { SelectMAD_64_32(N); @@ -1020,8 +992,14 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); - unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; - unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + static const unsigned OpcMap[2][2][2] = { + {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32}, + {AMDGPU::V_SUB_I32_e32, AMDGPU::V_ADD_I32_e32}}, + {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32}, + {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}}; + + unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd]; + unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd]; SDNode *AddLo; if (!ConsumeCarry) { @@ -1063,24 +1041,51 @@ void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) { SDValue RHS = N->getOperand(1); SDValue CI = N->getOperand(2); - unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64 - : AMDGPU::V_SUBB_U32_e64; - CurDAG->SelectNodeTo( - N, Opc, N->getVTList(), - {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/}); + if (N->isDivergent()) { + unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64 + : AMDGPU::V_SUBB_U32_e64; + CurDAG->SelectNodeTo( + N, Opc, N->getVTList(), + {LHS, RHS, CI, + CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/}); + } else { + unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? 
AMDGPU::S_ADD_CO_PSEUDO + : AMDGPU::S_SUB_CO_PSEUDO; + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI}); + } } void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned // carry out despite the _i32 name. These were renamed in VI to _U32. // FIXME: We should probably rename the opcodes here. - unsigned Opc = N->getOpcode() == ISD::UADDO ? - AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; + bool IsAdd = N->getOpcode() == ISD::UADDO; + bool IsVALU = N->isDivergent(); + + for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E; + ++UI) + if (UI.getUse().getResNo() == 1) { + if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) || + (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) { + IsVALU = true; + break; + } + } + + if (IsVALU) { + unsigned Opc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; + + CurDAG->SelectNodeTo( + N, Opc, N->getVTList(), + {N->getOperand(0), N->getOperand(1), + CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/}); + } else { + unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO + : AMDGPU::S_USUBO_PSEUDO; - CurDAG->SelectNodeTo( - N, Opc, N->getVTList(), - {N->getOperand(0), N->getOperand(1), - CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/}); + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), + {N->getOperand(0), N->getOperand(1)}); + } } void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) { @@ -1125,35 +1130,6 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } -void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) { - const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget); - const SIRegisterInfo *TRI = ST->getRegisterInfo(); - - SDLoc SL(N); - EVT VT = N->getValueType(0); - - assert(VT == MVT::f32 || VT == MVT::f64); - - unsigned Opc - = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32; - - SDValue CarryIn = N->getOperand(3); - // V_DIV_FMAS implicitly reads VCC. - SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL, - TRI->getVCC(), CarryIn, SDValue()); - - SDValue Ops[10]; - - SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); - SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); - SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); - - Ops[8] = VCC; - Ops[9] = VCC.getValue(1); - - CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); -} - // We need to handle this here because tablegen doesn't support matching // instructions with multiple outputs. void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { @@ -1343,6 +1319,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &TFE, SDValue &DLC, SDValue &SWZ) const { // Subtarget prefers to use flat instruction + // FIXME: This should be a pattern predicate and not reach here if (Subtarget->useFlatForGlobal()) return false; @@ -1438,6 +1415,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue Ptr, Offen, Idxen, Addr64; // addr64 bit was removed for volcanic islands. 
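+  // When the check below fails, this ComplexPattern simply declines to match
+  // and instruction selection falls back to the other addressing patterns.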
+ // FIXME: This should be a pattern predicate and not reach here if (!Subtarget->hasAddr64()) return false; @@ -1475,6 +1453,7 @@ static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { } std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { + SDLoc DL(N); const MachineFunction &MF = CurDAG->getMachineFunction(); const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); @@ -1489,9 +1468,8 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const } // If we don't know this private access is a local stack object, it needs to - // be relative to the entry point's scratch wave offset register. - return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(), - MVT::i32)); + // be relative to the entry point's scratch wave offset. + return std::make_pair(N, CurDAG->getTargetConstant(0, DL, MVT::i32)); } bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, @@ -1506,22 +1484,26 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { - unsigned Imm = CAddr->getZExtValue(); - - SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32); - MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - DL, MVT::i32, HighBits); - VAddr = SDValue(MovHighBits, 0); - - // In a call sequence, stores to the argument stack area are relative to the - // stack pointer. - const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo(); - unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ? - Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg(); - - SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32); - ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16); - return true; + int64_t Imm = CAddr->getSExtValue(); + const int64_t NullPtr = + AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS); + // Don't fold null pointer. + if (Imm != NullPtr) { + SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32); + MachineSDNode *MovHighBits = CurDAG->getMachineNode( + AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits); + VAddr = SDValue(MovHighBits, 0); + + // In a call sequence, stores to the argument stack area are relative to the + // stack pointer. + const MachinePointerInfo &PtrInfo + = cast<MemSDNode>(Parent)->getPointerInfo(); + SOffset = isStackPtrRelative(PtrInfo) + ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32) + : CurDAG->getTargetConstant(0, DL, MVT::i32); + ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16); + return true; + } } if (CurDAG->isBaseWithConstantOffset(Addr)) { @@ -1577,12 +1559,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo(); - unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ? - Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg(); // FIXME: Get from MachinePointerInfo? We should only be using the frame // offset if we know this is in a call sequence. - SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32); + SOffset = isStackPtrRelative(PtrInfo) + ? 
CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32) + : CurDAG->getTargetConstant(0, DL, MVT::i32); Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); return true; @@ -1646,6 +1628,37 @@ static MemSDNode* findMemSDNode(SDNode *N) { llvm_unreachable("cannot find MemSDNode in the pattern!"); } +static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, + SDValue &N0, SDValue &N1) { + if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST && + Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e. + // (i64 (bitcast (v2i32 (build_vector + // (or (extract_vector_elt V, 0), OFFSET), + // (extract_vector_elt V, 1))))) + SDValue Lo = Addr.getOperand(0).getOperand(0); + if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) { + SDValue BaseLo = Lo.getOperand(0); + SDValue BaseHi = Addr.getOperand(0).getOperand(1); + // Check that split base (Lo and Hi) are extracted from the same one. + if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + BaseLo.getOperand(0) == BaseHi.getOperand(0) && + // Lo is statically extracted from index 0. + isa<ConstantSDNode>(BaseLo.getOperand(1)) && + BaseLo.getConstantOperandVal(1) == 0 && + // Hi is statically extracted from index 0. + isa<ConstantSDNode>(BaseHi.getOperand(1)) && + BaseHi.getConstantOperandVal(1) == 1) { + N0 = BaseLo.getOperand(0).getOperand(0); + N1 = Lo.getOperand(1); + return true; + } + } + } + return false; +} + template <bool IsSigned> bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr, @@ -1656,84 +1669,91 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, if (Subtarget->hasFlatInstOffsets() && (!Subtarget->hasFlatSegmentOffsetBug() || - findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) && - CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); - - const SIInstrInfo *TII = Subtarget->getInstrInfo(); - unsigned AS = findMemSDNode(N)->getAddressSpace(); - if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) { - Addr = N0; - OffsetVal = COffsetVal; - } else { - // If the offset doesn't fit, put the low bits into the offset field and - // add the rest. - - SDLoc DL(N); - uint64_t ImmField; - const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned); - if (IsSigned) { - ImmField = SignExtend64(COffsetVal, NumBits); - - // Don't use a negative offset field if the base offset is positive. - // Since the scheduler currently relies on the offset field, doing so - // could result in strange scheduling decisions. - - // TODO: Should we not do this in the opposite direction as well? 
- if (static_cast<int64_t>(COffsetVal) > 0) { - if (static_cast<int64_t>(ImmField) < 0) { - const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1); - ImmField = COffsetVal & OffsetMask; + findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) { + SDValue N0, N1; + if (CurDAG->isBaseWithConstantOffset(Addr)) { + N0 = Addr.getOperand(0); + N1 = Addr.getOperand(1); + } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) { + assert(N0 && N1 && isa<ConstantSDNode>(N1)); + } + if (N0 && N1) { + uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); + + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + unsigned AS = findMemSDNode(N)->getAddressSpace(); + if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) { + Addr = N0; + OffsetVal = COffsetVal; + } else { + // If the offset doesn't fit, put the low bits into the offset field and + // add the rest. + + SDLoc DL(N); + uint64_t ImmField; + const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned); + if (IsSigned) { + ImmField = SignExtend64(COffsetVal, NumBits); + + // Don't use a negative offset field if the base offset is positive. + // Since the scheduler currently relies on the offset field, doing so + // could result in strange scheduling decisions. + + // TODO: Should we not do this in the opposite direction as well? + if (static_cast<int64_t>(COffsetVal) > 0) { + if (static_cast<int64_t>(ImmField) < 0) { + const uint64_t OffsetMask = + maskTrailingOnes<uint64_t>(NumBits - 1); + ImmField = COffsetVal & OffsetMask; + } } + } else { + // TODO: Should we do this for a negative offset? + const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits); + ImmField = COffsetVal & OffsetMask; } - } else { - // TODO: Should we do this for a negative offset? - const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits); - ImmField = COffsetVal & OffsetMask; - } - uint64_t RemainderOffset = COffsetVal - ImmField; + uint64_t RemainderOffset = COffsetVal - ImmField; - assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned)); - assert(RemainderOffset + ImmField == COffsetVal); + assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned)); + assert(RemainderOffset + ImmField == COffsetVal); - OffsetVal = ImmField; + OffsetVal = ImmField; - // TODO: Should this try to use a scalar add pseudo if the base address is - // uniform and saddr is usable? - SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + // TODO: Should this try to use a scalar add pseudo if the base address + // is uniform and saddr is usable? 
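+        // For now the remainder is always added on the VALU side: the base is
+        // split with EXTRACT_SUBREG, added via V_ADD_I32 / V_ADDC_U32, and the
+        // two halves are recombined with a REG_SEQUENCE into a 64-bit VReg.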
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub0); - SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub1); + SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, + MVT::i32, N0, Sub0); + SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, + MVT::i32, N0, Sub1); - SDValue AddOffsetLo - = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); - SDValue AddOffsetHi - = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); + SDValue AddOffsetLo = + getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SDValue AddOffsetHi = + getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); - SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); - SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); + SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); - SDNode *Add = CurDAG->getMachineNode( - AMDGPU::V_ADD_I32_e64, DL, VTs, - {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); + SDNode *Add = + CurDAG->getMachineNode(AMDGPU::V_ADD_I32_e64, DL, VTs, + {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); - SDNode *Addc = CurDAG->getMachineNode( - AMDGPU::V_ADDC_U32_e64, DL, VTs, - {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); + SDNode *Addc = CurDAG->getMachineNode( + AMDGPU::V_ADDC_U32_e64, DL, VTs, + {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); - SDValue RegSequenceArgs[] = { - CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), - SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1 - }; + SDValue RegSequenceArgs[] = { + CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), + SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; - Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::i64, RegSequenceArgs), 0); + Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs), + 0); + } } } @@ -1761,35 +1781,52 @@ bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N, bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const { - - // FIXME: Handle non-constant offsets. ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); - if (!C) + if (!C) { + if (ByteOffsetNode.getValueType().isScalarInteger() && + ByteOffsetNode.getValueType().getSizeInBits() == 32) { + Offset = ByteOffsetNode; + Imm = false; + return true; + } + if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) { + if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) { + Offset = ByteOffsetNode.getOperand(0); + Imm = false; + return true; + } + } return false; + } SDLoc SL(ByteOffsetNode); - GCNSubtarget::Generation Gen = Subtarget->getGeneration(); + // GFX9 and GFX10 have signed byte immediate offsets. int64_t ByteOffset = C->getSExtValue(); - int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset); - - if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) { - Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); + Optional<int64_t> EncodedOffset = + AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false); + if (EncodedOffset) { + Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); Imm = true; return true; } - if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset)) + // SGPR and literal offsets are unsigned. 
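+  // A negative constant can therefore only be used via the signed immediate
+  // form, and that was already attempted above, so give up here.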
+ if (ByteOffset < 0) return false; - if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) { - // 32-bit Immediates are supported on Sea Islands. - Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); - } else { - SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); - Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, - C32Bit), 0); + EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset); + if (EncodedOffset) { + Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); + return true; } - Imm = false; + + if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset)) + return false; + + SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); + Offset = SDValue( + CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0); + return true; } @@ -1825,14 +1862,21 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, // A 32-bit (address + offset) should not cause unsigned 32-bit integer // wraparound, because s_load instructions perform the addition in 64 bits. if ((Addr.getValueType() != MVT::i32 || - Addr->getFlags().hasNoUnsignedWrap()) && - CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - - if (SelectSMRDOffset(N1, Offset, Imm)) { - SBase = Expand32BitAddress(N0); - return true; + Addr->getFlags().hasNoUnsignedWrap())) { + SDValue N0, N1; + // Extract the base and offset if possible. + if (CurDAG->isBaseWithConstantOffset(Addr) || + Addr.getOpcode() == ISD::ADD) { + N0 = Addr.getOperand(0); + N1 = Addr.getOperand(1); + } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) { + assert(N0 && N1 && isa<ConstantSDNode>(N1)); + } + if (N0 && N1) { + if (SelectSMRDOffset(N1, Offset, Imm)) { + SBase = Expand32BitAddress(N0); + return true; + } } } SBase = Expand32BitAddress(Addr); @@ -1843,17 +1887,16 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - bool Imm; + bool Imm = false; return SelectSMRD(Addr, SBase, Offset, Imm) && Imm; } bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS) - return false; + assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); - bool Imm; + bool Imm = false; if (!SelectSMRD(Addr, SBase, Offset, Imm)) return false; @@ -1862,27 +1905,38 @@ bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - bool Imm; + bool Imm = false; return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm && !isa<ConstantSDNode>(Offset); } bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const { - bool Imm; - return SelectSMRDOffset(Addr, Offset, Imm) && Imm; + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) { + // The immediate offset for S_BUFFER instructions is unsigned. 
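As a side note, the SMRD offset handling above reduces to a short classification: try the instruction's immediate field, reject negative offsets (the SGPR and literal forms are unsigned), then fall back to the 32-bit literal form or to an offset materialized into an SGPR with S_MOV_B32. A rough sketch, where EncodedImm and EncodedLiteral stand in for the subtarget-dependent results of AMDGPU::getSMRDEncodedOffset and AMDGPU::getSMRDEncodedLiteralOffset32:

#include <cstdint>
#include <optional>

enum class SMRDOffsetKind { Imm, Literal32, SGPR, Unselectable };

// Mirrors the decision ladder of SelectSMRDOffset above, for constant offsets.
static SMRDOffsetKind
classifySMRDOffset(int64_t ByteOffset, std::optional<int64_t> EncodedImm,
                   std::optional<int64_t> EncodedLiteral) {
  if (EncodedImm)     // fits the immediate offset field
    return SMRDOffsetKind::Imm;
  if (ByteOffset < 0) // SGPR and literal offsets are unsigned
    return SMRDOffsetKind::Unselectable;
  if (EncodedLiteral) // 32-bit literal offset form
    return SMRDOffsetKind::Literal32;
  if (uint64_t(ByteOffset) <= UINT32_MAX) // materialize with S_MOV_B32
    return SMRDOffsetKind::SGPR;
  return SMRDOffsetKind::Unselectable;
}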
+ if (auto Imm = + AMDGPU::getSMRDEncodedOffset(*Subtarget, C->getZExtValue(), true)) { + Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32); + return true; + } + } + + return false; } bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const { - if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS) - return false; + assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); - bool Imm; - if (!SelectSMRDOffset(Addr, Offset, Imm)) - return false; + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) { + if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, + C->getZExtValue())) { + Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32); + return true; + } + } - return !Imm && isa<ConstantSDNode>(Offset); + return false; } bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, @@ -1898,7 +1952,9 @@ bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, // (add n0, c0) // Don't peel off the offset (c0) if doing so could possibly lead // the base (n0) to be negative. - if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) { + // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset. + if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) || + (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) { Base = N0; Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32); return true; @@ -2066,7 +2122,7 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N); unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ; - unsigned CondReg = UseSCCBr ? (unsigned)AMDGPU::SCC : TRI->getVCC(); + Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC(); SDLoc SL(N); if (!UseSCCBr) { @@ -2121,7 +2177,7 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) { bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods); bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods); - assert((IsFMA || !Mode.FP32Denormals) && + assert((IsFMA || !Mode.allFP32Denormals()) && "fmad selected with denormals enabled"); // TODO: We can select this with f32 denormals enabled if all the sources are // converted from f16 (in which case fmad isn't legal). @@ -2338,6 +2394,64 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); } +void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) { + if (Subtarget->getLDSBankCount() != 16) { + // This is a single instruction with a pattern. + SelectCode(N); + return; + } + + SDLoc DL(N); + + // This requires 2 instructions. It is possible to write a pattern to support + // this, but the generated isel emitter doesn't correctly deal with multiple + // output instructions using the same physical register input. The copy to m0 + // is incorrectly placed before the second instruction. + // + // TODO: Match source modifiers. 
+ // + // def : Pat < + // (int_amdgcn_interp_p1_f16 + // (VOP3Mods f32:$src0, i32:$src0_modifiers), + // (i32 timm:$attrchan), (i32 timm:$attr), + // (i1 timm:$high), M0), + // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr, + // timm:$attrchan, 0, + // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> { + // let Predicates = [has16BankLDS]; + // } + + // 16 bank LDS + SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0, + N->getOperand(5), SDValue()); + + SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other); + + SDNode *InterpMov = + CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, { + CurDAG->getTargetConstant(2, DL, MVT::i32), // P0 + N->getOperand(3), // Attr + N->getOperand(2), // Attrchan + ToM0.getValue(1) // In glue + }); + + SDNode *InterpP1LV = + CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, { + CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers + N->getOperand(1), // Src0 + N->getOperand(3), // Attr + N->getOperand(2), // Attrchan + CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers + SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high + N->getOperand(4), // high + CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp + CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod + SDValue(InterpMov, 1) + }); + + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0)); +} + void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); switch (IntrID) { @@ -2366,6 +2480,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { case Intrinsic::amdgcn_wwm: Opcode = AMDGPU::WWM; break; + case Intrinsic::amdgcn_interp_p1_f16: + SelectInterpP1F16(N); + return; default: SelectCode(N); return; @@ -2428,15 +2545,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, return isNoNanSrc(Src); } -bool AMDGPUDAGToDAGISel::SelectVOP3Mods_f32(SDValue In, SDValue &Src, - SDValue &SrcMods) const { - if (In.getValueType() == MVT::f32) - return SelectVOP3Mods(In, Src, SrcMods); - Src = In; - SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);; - return true; -} - bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const { if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG) return false; @@ -2520,17 +2628,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, return true; } -bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src, - SDValue &SrcMods, - SDValue &Clamp) const { - SDLoc SL(In); - - // FIXME: Handle clamp and op_sel - Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32); - - return SelectVOP3PMods(In, Src, SrcMods); -} - bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; @@ -2539,34 +2636,12 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, return true; } -bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src, - SDValue &SrcMods, - SDValue &Clamp) const { - SDLoc SL(In); - - // FIXME: Handle clamp - Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32); - - return SelectVOP3OpSel(In, Src, SrcMods); -} - bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const { // FIXME: Handle op_sel return SelectVOP3Mods(In, Src, SrcMods); } -bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src, - SDValue &SrcMods, - SDValue &Clamp) const { - SDLoc SL(In); - - // FIXME: 
Handle clamp - Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32); - - return SelectVOP3OpSelMods(In, Src, SrcMods); -} - // The return value is not whether the match is possible (which it always is), // but whether or not it a conversion is really used. bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, @@ -2705,7 +2780,7 @@ bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const { ( Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && - !Ld->isVolatile() && + Ld->isSimple() && !N->isDivergent() && static_cast<const SITargetLowering *>( getTargetLowering())->isMemOpHasNoClobberedMemOperand(N) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 23cc9404532d5..940ec6f31c698 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -16,7 +16,6 @@ #include "AMDGPU.h" #include "AMDGPUCallLowering.h" #include "AMDGPUFrameLowering.h" -#include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "Utils/AMDGPUBaseInfo.h" @@ -38,6 +37,11 @@ using namespace llvm; #include "AMDGPUGenCallingConv.inc" +static cl::opt<bool> AMDGPUBypassSlowDiv( + "amdgpu-bypass-slow-div", + cl::desc("Skip 64-bit divide for dynamic 32-bit values"), + cl::init(true)); + // Find a larger type to do a load / store of a vector with. EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { unsigned StoreSize = VT.getStoreSizeInBits(); @@ -103,6 +107,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v2f64, Promote); AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32); + setOperationAction(ISD::LOAD, MVT::v4i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32); + + setOperationAction(ISD::LOAD, MVT::v4f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32); + + setOperationAction(ISD::LOAD, MVT::v8i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32); + + setOperationAction(ISD::LOAD, MVT::v8f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32); + + setOperationAction(ISD::LOAD, MVT::v16i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32); + + setOperationAction(ISD::LOAD, MVT::v16f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32); + // There are no 64-bit extloads. These should be done as a 32-bit extload and // an extension to 64-bit. 
for (MVT VT : MVT::integer_valuetypes()) { @@ -161,11 +183,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand); setOperationAction(ISD::STORE, MVT::f32, Promote); AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); @@ -203,6 +227,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v2f64, Promote); AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32); + setOperationAction(ISD::STORE, MVT::v4i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32); + + setOperationAction(ISD::STORE, MVT::v4f64, Promote); + AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32); + + setOperationAction(ISD::STORE, MVT::v8i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32); + + setOperationAction(ISD::STORE, MVT::v8f64, Promote); + AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32); + + setOperationAction(ISD::STORE, MVT::v16i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32); + + setOperationAction(ISD::STORE, MVT::v16f64, Promote); + AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32); + setTruncStoreAction(MVT::i64, MVT::i1, Expand); setTruncStoreAction(MVT::i64, MVT::i8, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); @@ -227,12 +269,21 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand); + setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand); setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand); setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand); setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand); setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand); + setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand); + setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand); + setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand); + setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand); + setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand); + setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand); + setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand); setOperationAction(ISD::Constant, MVT::i32, Legal); setOperationAction(ISD::Constant, MVT::i64, Legal); @@ -297,6 +348,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i64, Custom); + 
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f64, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i64, Custom); setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); @@ -329,6 +388,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SUBE, VT, Legal); } + // The hardware supports 32-bit FSHR, but not FSHL. + setOperationAction(ISD::FSHR, MVT::i32, Legal); + // The hardware supports 32-bit ROTR, but not ROTL. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); @@ -381,7 +443,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Custom); + setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); @@ -483,6 +545,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MaxStoresPerMemmove = 0xffffffff; MaxStoresPerMemset = 0xffffffff; + // The expansion for 64-bit division is enormous. + if (AMDGPUBypassSlowDiv) + addBypassSlowDiv(64, 32); + setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); @@ -609,6 +675,17 @@ bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N, return true; } +EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const { + assert(!VT.isVector() && "only scalar expected"); + + // Round to the next multiple of 32-bits. + unsigned Size = VT.getSizeInBits(); + if (Size <= 32) + return MVT::i32; + return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32)); +} + MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const { return MVT::i32; } @@ -641,8 +718,9 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, unsigned NewSize = NewVT.getStoreSizeInBits(); - // If we are reducing to a 32-bit load, this is always better. - if (NewSize == 32) + // If we are reducing to a 32-bit load or a smaller multi-dword load, + // this is always better. + if (NewSize >= 32) return true; EVT OldVT = N->getValueType(0); @@ -733,6 +811,26 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const { } } +SDValue AMDGPUTargetLowering::getNegatedExpression( + SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, + NegatibleCost &Cost, unsigned Depth) const { + + switch (Op.getOpcode()) { + case ISD::FMA: + case ISD::FMAD: { + // Negating a fma is not free if it has users without source mods. 
+ if (!allUsesHaveSourceMods(Op.getNode())) + return SDValue(); + break; + } + default: + break; + } + + return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations, + ForCodeSize, Cost, Depth); +} + //===---------------------------------------------------------------------===// // Target Properties //===---------------------------------------------------------------------===// @@ -912,7 +1010,7 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn); CallingConv::ID CC = Fn.getCallingConv(); - unsigned MaxAlign = 1; + Align MaxAlign = Align(1); uint64_t ExplicitArgOffset = 0; const DataLayout &DL = Fn.getParent()->getDataLayout(); @@ -920,12 +1018,12 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( for (const Argument &Arg : Fn.args()) { Type *BaseArgTy = Arg.getType(); - unsigned Align = DL.getABITypeAlignment(BaseArgTy); - MaxAlign = std::max(Align, MaxAlign); + Align Alignment = DL.getABITypeAlign(BaseArgTy); + MaxAlign = std::max(Alignment, MaxAlign); unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy); - uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset; - ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize; + uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize; // We're basically throwing away everything passed into us and starting over // to get accurate in-memory offsets. The "PartOffset" is completely useless @@ -999,6 +1097,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( assert(MemVT.getVectorNumElements() == 3 || MemVT.getVectorNumElements() == 5); MemVT = MemVT.getPow2VectorType(State.getContext()); + } else if (!MemVT.isSimple() && !MemVT.isVector()) { + MemVT = MemVT.getRoundIntegerType(State.getContext()); } unsigned PartOffset = 0; @@ -1140,7 +1240,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); case ISD::FLOG: - return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef); + return LowerFLOG(Op, DAG, numbers::ln2f); case ISD::FLOG10: return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f); case ISD::FEXP: @@ -1196,10 +1296,23 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) { if (!MFI->isEntryFunction()) { + SDLoc DL(Op); const Function &Fn = DAG.getMachineFunction().getFunction(); DiagnosticInfoUnsupported BadLDSDecl( - Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc()); + Fn, "local memory global used by non-kernel function", + DL.getDebugLoc(), DS_Warning); DAG.getContext()->diagnose(BadLDSDecl); + + // We currently don't have a way to correctly allocate LDS objects that + // aren't directly associated with a kernel. We do force inlining of + // functions that use local objects. However, if these dead functions are + // not eliminated, we don't want a compile time error. Just emit a warning + // and a trap, since there should be no callable path here. + SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode()); + SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + Trap, DAG.getRoot()); + DAG.setRoot(OutputChain); + return DAG.getUNDEF(Op.getValueType()); } // XXX: What does the value of G->getOffset() mean? 
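One note on the FLOG lowering above: the hardware exposes log2, so ln(x) is produced as log2(x) * ln(2) and log10(x) as log2(x) * (ln(2)/ln(10)). The old scale 1.0F / numbers::log2ef and the new numbers::ln2f are the same mathematical constant, since log2(e) = 1 / ln(2); as floats they agree up to at most the final ulp of rounding. A quick check, with the constants written out by hand:

#include <cstdio>

int main() {
  const float OldScale = 1.0f / 1.44269504088896340736f; // 1 / log2(e)
  const float NewScale = 0.693147180559945309417f;       // ln(2)
  std::printf("old = %.9g, new = %.9g\n", OldScale, NewScale);
  return 0;
}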
@@ -1208,7 +1321,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, // TODO: We could emit code to handle the initialization somewhere. if (!hasDefinedInitializer(GV)) { - unsigned Offset = MFI->allocateLDSGlobal(DL, *GV); + unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV)); return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType()); } } @@ -1383,12 +1496,11 @@ AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL, (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <= N.getValueType().getVectorNumElements() && "More vector elements requested than available!"); - auto IdxTy = getVectorIdxTy(DAG.getDataLayout()); SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, - DAG.getConstant(0, DL, IdxTy)); + DAG.getVectorIdxConstant(0, DL)); SDValue Hi = DAG.getNode( HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL, - HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy)); + HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL)); return std::make_pair(Lo, Hi); } @@ -1433,18 +1545,17 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign, Load->getMemOperand()->getFlags()); - auto IdxTy = getVectorIdxTy(DAG.getDataLayout()); SDValue Join; if (LoVT == HiVT) { // This is the case that the vector is power of two so was evenly split. Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad); } else { Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad, - DAG.getConstant(0, SL, IdxTy)); - Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR - : ISD::INSERT_VECTOR_ELT, - SL, VT, Join, HiLoad, - DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy)); + DAG.getVectorIdxConstant(0, SL)); + Join = DAG.getNode( + HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL, + VT, Join, HiLoad, + DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL)); } SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other, @@ -1474,7 +1585,7 @@ SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op, WideMemVT, BaseAlign, Load->getMemOperand()->getFlags()); return DAG.getMergeValues( {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad, - DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))), + DAG.getVectorIdxConstant(0, SL)), WideLoad.getValue(1)}, SL); } @@ -1588,9 +1699,11 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); // float fr = mad(fqneg, fb, fa); - unsigned OpCode = MFI->getMode().FP32Denormals ? - (unsigned)AMDGPUISD::FMAD_FTZ : - (unsigned)ISD::FMAD; + unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? + (unsigned)ISD::FMA : + !MFI->getMode().allFP32Denormals() ? + (unsigned)ISD::FMAD : + (unsigned)AMDGPUISD::FMAD_FTZ; SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa); // int iq = (int)fq; @@ -1673,9 +1786,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // Compute denominator reciprocal. - unsigned FMAD = MFI->getMode().FP32Denormals ? - (unsigned)AMDGPUISD::FMAD_FTZ : - (unsigned)ISD::FMAD; + unsigned FMAD = !Subtarget->hasMadMacF32Insts() ? + (unsigned)ISD::FMA : + !MFI->getMode().allFP32Denormals() ? 
+ (unsigned)ISD::FMAD : + (unsigned)AMDGPUISD::FMAD_FTZ; SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo); SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi); @@ -1861,103 +1976,43 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, return Res; } - SDValue Num = Op.getOperand(0); - SDValue Den = Op.getOperand(1); - - // RCP = URECIP(Den) = 2^32 / Den + e - // e is rounding error. - SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); - - // RCP_LO = mul(RCP, Den) */ - SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den); - - // RCP_HI = mulhu (RCP, Den) */ - SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); - - // NEG_RCP_LO = -RCP_LO - SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), - RCP_LO); - - // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) - SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), - NEG_RCP_LO, RCP_LO, - ISD::SETEQ); - // Calculate the rounding error from the URECIP instruction - // E = mulhu(ABS_RCP_LO, RCP) - SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); - - // RCP_A_E = RCP + E - SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); - - // RCP_S_E = RCP - E - SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); - - // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) - SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), - RCP_A_E, RCP_S_E, - ISD::SETEQ); - // Quotient = mulhu(Tmp0, Num) - SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); - - // Num_S_Remainder = Quotient * Den - SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den); - - // Remainder = Num - Num_S_Remainder - SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); - - // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) - SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, - DAG.getConstant(-1, DL, VT), - DAG.getConstant(0, DL, VT), - ISD::SETUGE); - // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) - SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, - Num_S_Remainder, - DAG.getConstant(-1, DL, VT), - DAG.getConstant(0, DL, VT), - ISD::SETUGE); - // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero - SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, - Remainder_GE_Zero); - - // Calculate Division result: - - // Quotient_A_One = Quotient + 1 - SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, - DAG.getConstant(1, DL, VT)); - - // Quotient_S_One = Quotient - 1 - SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, - DAG.getConstant(1, DL, VT)); - - // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) - SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), - Quotient, Quotient_A_One, ISD::SETEQ); - - // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) - Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), - Quotient_S_One, Div, ISD::SETEQ); - - // Calculate Rem result: - - // Remainder_S_Den = Remainder - Den - SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); - - // Remainder_A_Den = Remainder + Den - SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); - - // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) - SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), - Remainder, Remainder_S_Den, ISD::SETEQ); - - // Rem = (Remainder_GE_Zero == 0 ? 
Remainder_A_Den : Rem) - Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), - Remainder_A_Den, Rem, ISD::SETEQ); - SDValue Ops[2] = { - Div, - Rem - }; - return DAG.getMergeValues(Ops, DL); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + + // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the + // algorithm used here. + + // Initial estimate of inv(y). + SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y); + + // One round of UNR. + SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y); + SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z); + Z = DAG.getNode(ISD::ADD, DL, VT, Z, + DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ)); + + // Quotient/remainder estimate. + SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z); + SDValue R = + DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y)); + + // First quotient/remainder refinement. + EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + SDValue One = DAG.getConstant(1, DL, VT); + SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE); + Q = DAG.getNode(ISD::SELECT, DL, VT, Cond, + DAG.getNode(ISD::ADD, DL, VT, Q, One), Q); + R = DAG.getNode(ISD::SELECT, DL, VT, Cond, + DAG.getNode(ISD::SUB, DL, VT, R, Y), R); + + // Second quotient/remainder refinement. + Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE); + Q = DAG.getNode(ISD::SELECT, DL, VT, Cond, + DAG.getNode(ISD::ADD, DL, VT, Q, One), Q); + R = DAG.getNode(ISD::SELECT, DL, VT, Cond, + DAG.getNode(ISD::SUB, DL, VT, R, Y), R); + + return DAG.getMergeValues({Q, R}, DL); } SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, @@ -2164,8 +2219,7 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con // Don't handle v2f16. The extra instructions to scalarize and repack around the // compare and vselect end up producing worse code than scalarizing the whole // operation. 
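For reference, the 32-bit unsigned div/rem expansion built above (and described in AMDGPUCodeGenPrepare::expandDivRem32) is a reciprocal-estimate scheme: a URECIP estimate of 2^32/y, one Newton-Raphson step, a mulhu-based quotient/remainder estimate, then two conditional corrections. A scalar model follows; the hardware URECIP is approximated here with an ordinary float reciprocal, so this is an illustrative sketch rather than the exact DAG expansion.

#include <cassert>
#include <cstdint>

static uint32_t mulhu(uint32_t A, uint32_t B) {
  return uint32_t((uint64_t(A) * B) >> 32);
}

static void udivrem32(uint32_t X, uint32_t Y, uint32_t &Q, uint32_t &R) {
  assert(Y != 0 && "division by zero not handled here");

  // Initial estimate Z ~= 2^32 / Y (stand-in for AMDGPUISD::URECIP).
  uint32_t Z = uint32_t((1.0f / float(Y)) * 4294966784.0f); // scale just below 2^32

  // One round of UNR (Newton-Raphson): Z += mulhu(Z, -Y * Z).
  uint32_t NegYZ = (0u - Y) * Z;
  Z += mulhu(Z, NegYZ);

  // Quotient/remainder estimate.
  Q = mulhu(X, Z);
  R = X - Q * Y;

  // Two refinement rounds, as in the SelectionDAG code above.
  for (int Round = 0; Round < 2; ++Round) {
    if (R >= Y) {
      Q += 1;
      R -= Y;
    }
  }
}

Roughly speaking, the two corrections allow the initial quotient estimate to undershoot the exact result by up to two, and the single Newton-Raphson refinement of the reciprocal keeps it within that bound.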
-SDValue AMDGPUTargetLowering::LowerFROUND_LegalFTRUNC(SDValue Op, - SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue X = Op.getOperand(0); EVT VT = Op.getValueType(); @@ -2194,75 +2248,6 @@ SDValue AMDGPUTargetLowering::LowerFROUND_LegalFTRUNC(SDValue Op, return DAG.getNode(ISD::FADD, SL, VT, T, Sel); } -SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue X = Op.getOperand(0); - - SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); - - const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); - const SDValue One = DAG.getConstant(1, SL, MVT::i32); - const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32); - const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32); - EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); - - SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); - - SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); - - SDValue Exp = extractF64Exponent(Hi, SL, DAG); - - const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL, - MVT::i64); - - SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); - SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, - DAG.getConstant(INT64_C(0x0008000000000000), SL, - MVT::i64), - Exp); - - SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); - SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, - DAG.getConstant(0, SL, MVT::i64), Tmp0, - ISD::SETNE); - - SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, - D, DAG.getConstant(0, SL, MVT::i64)); - SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); - - K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); - K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); - - SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); - SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); - SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); - - SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, - ExpEqNegOne, - DAG.getConstantFP(1.0, SL, MVT::f64), - DAG.getConstantFP(0.0, SL, MVT::f64)); - - SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); - - K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); - K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); - - return K; -} - -SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - - if (isOperationLegal(ISD::FTRUNC, VT)) - return LowerFROUND_LegalFTRUNC(Op, DAG); - - if (VT == MVT::f64) - return LowerFROUND64(Op, DAG); - - llvm_unreachable("unhandled type"); -} - SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); @@ -2793,6 +2778,7 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) { static SDValue simplifyI24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN; SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0); @@ -2806,11 +2792,11 @@ static SDValue simplifyI24(SDNode *Node24, APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24); - // First try to simplify using GetDemandedBits which allows the operands to - // have other uses, but will only perform simplifications that involve - // bypassing some nodes for this user. 
- SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded); - SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded); + // First try to simplify using SimplifyMultipleUseDemandedBits which allows + // the operands to have other uses, but will only perform simplifications that + // involve bypassing some nodes for this user. + SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG); + SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG); if (DemandedLHS || DemandedRHS) return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(), DemandedLHS ? DemandedLHS : LHS, @@ -2818,7 +2804,6 @@ static SDValue simplifyI24(SDNode *Node24, // Now try SimplifyDemandedBits which can simplify the nodes used by our // operands if this node is the only user. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI)) return SDValue(Node24, 0); if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI)) @@ -2877,7 +2862,7 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, return SDValue(); LoadSDNode *LN = cast<LoadSDNode>(N); - if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN)) + if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN)) return SDValue(); SDLoc SL(N); @@ -2885,16 +2870,17 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, EVT VT = LN->getMemoryVT(); unsigned Size = VT.getStoreSize(); - unsigned Align = LN->getAlignment(); - if (Align < Size && isTypeLegal(VT)) { + Align Alignment = LN->getAlign(); + if (Alignment < Size && isTypeLegal(VT)) { bool IsFast; unsigned AS = LN->getAddressSpace(); // Expand unaligned loads earlier than legalization. Due to visitation order // problems during legalization, the emitted instructions to pack and unpack // the bytes again are not eliminated in the case of an unaligned copy. - if (!allowsMisalignedMemoryAccesses( - VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) { + if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(), + LN->getMemOperand()->getFlags(), + &IsFast)) { SDValue Ops[2]; if (VT.isVector()) @@ -2931,7 +2917,7 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, return SDValue(); StoreSDNode *SN = cast<StoreSDNode>(N); - if (SN->isVolatile() || !ISD::isNormalStore(SN)) + if (!SN->isSimple() || !ISD::isNormalStore(SN)) return SDValue(); EVT VT = SN->getMemoryVT(); @@ -2939,8 +2925,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, SDLoc SL(N); SelectionDAG &DAG = DCI.DAG; - unsigned Align = SN->getAlignment(); - if (Align < Size && isTypeLegal(VT)) { + Align Alignment = SN->getAlign(); + if (Alignment < Size && isTypeLegal(VT)) { bool IsFast; unsigned AS = SN->getAddressSpace(); @@ -2948,8 +2934,9 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, // order problems during legalization, the emitted instructions to pack and // unpack the bytes again are not eliminated in the case of an unaligned // copy. 
- if (!allowsMisalignedMemoryAccesses( - VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) { + if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(), + SN->getMemOperand()->getFlags(), + &IsFast)) { if (VT.isVector()) return scalarizeVectorStore(SN, DAG); @@ -3012,6 +2999,16 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( case Intrinsic::amdgcn_mul_i24: case Intrinsic::amdgcn_mul_u24: return simplifyI24(N, DCI); + case Intrinsic::amdgcn_fract: + case Intrinsic::amdgcn_rsq: + case Intrinsic::amdgcn_rcp_legacy: + case Intrinsic::amdgcn_rsq_legacy: + case Intrinsic::amdgcn_rsq_clamp: + case Intrinsic::amdgcn_ldexp: { + // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted + SDValue Src = N->getOperand(1); + return Src.isUndef() ? Src : SDValue(); + } default: return SDValue(); } @@ -3465,24 +3462,24 @@ SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue C ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); SDValue CmpLHS = Cond.getOperand(0); - unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : - AMDGPUISD::FFBH_U32; - // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x if (CCOpcode == ISD::SETEQ && (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && - RHS.getOperand(0) == CmpLHS && - isNegativeOne(LHS)) { + RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) { + unsigned Opc = + isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32; return getFFBX_U32(DAG, CmpLHS, SL, Opc); } // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x if (CCOpcode == ISD::SETNE && - (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && - LHS.getOperand(0) == CmpLHS && - isNegativeOne(RHS)) { + (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) && + LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) { + unsigned Opc = + isCttzOpc(LHS.getOpcode()) ? 
AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32; + return getFFBX_U32(DAG, CmpLHS, SL, Opc); } @@ -4117,12 +4114,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, - unsigned Reg, EVT VT, + Register Reg, EVT VT, const SDLoc &SL, bool RawReg) const { MachineFunction &MF = DAG.getMachineFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned VReg; + Register VReg; if (!MRI.isLiveIn(Reg)) { VReg = MRI.createVirtualRegister(RC); @@ -4266,11 +4263,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(DIV_FMAS) NODE_NAME_CASE(DIV_FIXUP) NODE_NAME_CASE(FMAD_FTZ) - NODE_NAME_CASE(TRIG_PREOP) NODE_NAME_CASE(RCP) NODE_NAME_CASE(RSQ) NODE_NAME_CASE(RCP_LEGACY) - NODE_NAME_CASE(RSQ_LEGACY) NODE_NAME_CASE(RCP_IFLAG) NODE_NAME_CASE(FMUL_LEGACY) NODE_NAME_CASE(RSQ_CLAMP) @@ -4298,8 +4293,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(MAD_U64_U32) NODE_NAME_CASE(PERM) NODE_NAME_CASE(TEXTURE_FETCH) - NODE_NAME_CASE(EXPORT) - NODE_NAME_CASE(EXPORT_DONE) NODE_NAME_CASE(R600_EXPORT) NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) @@ -4323,12 +4316,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) NODE_NAME_CASE(LDS) - NODE_NAME_CASE(KILL) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; - NODE_NAME_CASE(INTERP_P1LL_F16) - NODE_NAME_CASE(INTERP_P1LV_F16) - NODE_NAME_CASE(INTERP_P2_F16) NODE_NAME_CASE(LOAD_D16_HI) NODE_NAME_CASE(LOAD_D16_LO) NODE_NAME_CASE(LOAD_D16_HI_I8) @@ -4347,6 +4336,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(ATOMIC_DEC) NODE_NAME_CASE(ATOMIC_LOAD_FMIN) NODE_NAME_CASE(ATOMIC_LOAD_FMAX) + NODE_NAME_CASE(ATOMIC_LOAD_CSUB) NODE_NAME_CASE(BUFFER_LOAD) NODE_NAME_CASE(BUFFER_LOAD_UBYTE) NODE_NAME_CASE(BUFFER_LOAD_USHORT) @@ -4373,6 +4363,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_INC) NODE_NAME_CASE(BUFFER_ATOMIC_DEC) NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) + NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) NODE_NAME_CASE(BUFFER_ATOMIC_FADD) NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD) NODE_NAME_CASE(ATOMIC_PK_FADD) @@ -4539,11 +4530,10 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( } case AMDGPUISD::LDS: { auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode()); - unsigned Align = GA->getGlobal()->getAlignment(); + Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout()); Known.Zero.setHighBits(16); - if (Align) - Known.Zero.setLowBits(Log2_32(Align)); + Known.Zero.setLowBits(Log2(Alignment)); break; } case ISD::INTRINSIC_WO_CHAIN: { @@ -4607,6 +4597,29 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( } } +unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr( + GISelKnownBits &Analysis, Register R, + const APInt &DemandedElts, const MachineRegisterInfo &MRI, + unsigned Depth) const { + const MachineInstr *MI = MRI.getVRegDef(R); + if (!MI) + return 1; + + // TODO: Check range metadata on MMO. 
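A brief note on the ctlz/cttz select combine in performCtlz_CttzCombine above: it folds select (x == 0), -1, ctlz_zero_undef(x) into ffbh_u32(x) (and the cttz/ffbl analogue), relying on the hardware find-first-bit instructions returning -1 for a zero input, which makes the guarding select redundant. A scalar model of FFBH_U32 under that assumption:

#include <cstdint>

static uint32_t ffbh_u32(uint32_t X) {
  if (X == 0)
    return UINT32_MAX; // the value the select's -1 arm would have produced
  uint32_t N = 0;      // otherwise, count leading zeros from the MSB
  while ((X & 0x80000000u) == 0) {
    X <<= 1;
    ++N;
  }
  return N;
}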
+ switch (MI->getOpcode()) { + case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: + return 25; + case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: + return 17; + case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: + return 24; + case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: + return 16; + default: + return 1; + } +} + bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN, @@ -4648,7 +4661,6 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, case AMDGPUISD::RCP: case AMDGPUISD::RSQ: case AMDGPUISD::RCP_LEGACY: - case AMDGPUISD::RSQ_LEGACY: case AMDGPUISD::RSQ_CLAMP: { if (SNaN) return true; @@ -4665,7 +4677,6 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, case AMDGPUISD::DIV_SCALE: case AMDGPUISD::DIV_FMAS: case AMDGPUISD::DIV_FIXUP: - case AMDGPUISD::TRIG_PREOP: // TODO: Refine on operands. return SNaN; case AMDGPUISD::SIN_HW: @@ -4692,6 +4703,18 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); } + case Intrinsic::amdgcn_rcp: + case Intrinsic::amdgcn_rsq: + case Intrinsic::amdgcn_rcp_legacy: + case Intrinsic::amdgcn_rsq_legacy: + case Intrinsic::amdgcn_rsq_clamp: { + if (SNaN) + return true; + + // TODO: Need is known positive check. + return false; + } + case Intrinsic::amdgcn_trig_preop: case Intrinsic::amdgcn_fdot2: // TODO: Refine on operand return SNaN; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index a90b7f5653dcc..85f23c81db170 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -18,6 +18,7 @@ #include "AMDGPU.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" namespace llvm { @@ -52,8 +53,6 @@ protected: SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFROUND_LegalFTRUNC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLOG(SDValue Op, SelectionDAG &DAG, @@ -172,8 +171,16 @@ public: bool isZExtFree(EVT Src, EVT Dest) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; + SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, bool ForCodeSize, + NegatibleCost &Cost, + unsigned Depth) const override; + bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const override; + MVT getVectorIdxTy(const DataLayout &) const override; bool isSelectSupported(SelectSupportKind) const override; @@ -264,6 +271,12 @@ public: const SelectionDAG &DAG, unsigned Depth = 0) const override; + unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis, + Register R, + const APInt &DemandedElts, + const MachineRegisterInfo &MRI, + unsigned Depth = 0) const override; + bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN = false, @@ -276,19 +289,19 @@ public: /// a copy from the register. 
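The constants in computeNumSignBitsForTargetInstr above follow from the load widths: a value sign-extended from N bits into a 32-bit register repeats its sign bit across the top 32 - N positions, giving 32 - N + 1 known sign bits (25 for a byte, 17 for a short), while a zero-extended load has 32 - N known-zero top bits (24 and 16). A small helper to sanity-check the arithmetic, illustrative only:

#include <cstdint>

// Leading bits of V equal to its sign bit, the sign bit itself included;
// the scalar analogue of ComputeNumSignBits for a single value.
static unsigned numSignBits32(uint32_t V) {
  uint32_t Sign = V >> 31;
  unsigned N = 1;
  while (N < 32 && ((V >> (31 - N)) & 1) == Sign)
    ++N;
  return N;
}

// For any byte B:   numSignBits32(uint32_t(int32_t(int8_t(B)))) >= 25
// For any ushort U: numSignBits32(uint32_t(U) & 0xFFFF)         >= 16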
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, - unsigned Reg, EVT VT, + Register Reg, EVT VT, const SDLoc &SL, bool RawReg = false) const; SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { + Register Reg, EVT VT) const { return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode())); } // Returns the raw live in register rather than a copy from it. SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG, const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { + Register Reg, EVT VT) const { return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true); } @@ -398,14 +411,12 @@ enum NodeType : unsigned { // For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is // treated as an illegal operation. FMAD_FTZ, - TRIG_PREOP, // 1 ULP max error for f64 // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. // For f64, max error 2^29 ULP, handles denormals. RCP, RSQ, RCP_LEGACY, - RSQ_LEGACY, RCP_IFLAG, FMUL_LEGACY, RSQ_CLAMP, @@ -433,8 +444,6 @@ enum NodeType : unsigned { MUL_LOHI_U24, PERM, TEXTURE_FETCH, - EXPORT, // exp on SI+ - EXPORT_DONE, // exp on SI+ with done bit set R600_EXPORT, CONST_ADDRESS, REGISTER_LOAD, @@ -476,12 +485,8 @@ enum NodeType : unsigned { BUILD_VERTICAL_VECTOR, /// Pointer to the start of the shader's constant data. CONST_DATA_PTR, - INTERP_P1LL_F16, - INTERP_P1LV_F16, - INTERP_P2_F16, PC_ADD_REL_OFFSET, LDS, - KILL, DUMMY_CHAIN, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, LOAD_D16_HI, @@ -503,6 +508,7 @@ enum NodeType : unsigned { ATOMIC_DEC, ATOMIC_LOAD_FMIN, ATOMIC_LOAD_FMAX, + ATOMIC_LOAD_CSUB, BUFFER_LOAD, BUFFER_LOAD_UBYTE, BUFFER_LOAD_USHORT, @@ -529,6 +535,7 @@ enum NodeType : unsigned { BUFFER_ATOMIC_INC, BUFFER_ATOMIC_DEC, BUFFER_ATOMIC_CMPSWAP, + BUFFER_ATOMIC_CSUB, BUFFER_ATOMIC_FADD, BUFFER_ATOMIC_PK_FADD, ATOMIC_PK_FADD, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp index 64d761997b0cc..3b5d91133a2f8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp @@ -23,7 +23,6 @@ #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -67,9 +66,9 @@ public: static char ID; // Pass identification, replacement for typeid - unsigned getInlineThreshold(CallSite CS) const; + unsigned getInlineThreshold(CallBase &CB) const; - InlineCost getInlineCost(CallSite CS) override; + InlineCost getInlineCost(CallBase &CB) override; bool runOnSCC(CallGraphSCC &SCC) override; @@ -106,13 +105,13 @@ void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const { LegacyInlinerBase::getAnalysisUsage(AU); } -unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { +unsigned AMDGPUInliner::getInlineThreshold(CallBase &CB) const { int Thres = Params.DefaultThreshold; - Function *Caller = CS.getCaller(); + Function *Caller = CB.getCaller(); // Listen to the inlinehint attribute when it would increase the threshold // and the caller does not need to minimize its size. 
- Function *Callee = CS.getCalledFunction(); + Function *Callee = CB.getCalledFunction(); bool InlineHint = Callee && !Callee->isDeclaration() && Callee->hasFnAttribute(Attribute::InlineHint); if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres @@ -129,7 +128,7 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { // Increase the inline threshold to allow inliniting in this case. uint64_t AllocaSize = 0; SmallPtrSet<const AllocaInst *, 8> AIVisited; - for (Value *PtrArg : CS.args()) { + for (Value *PtrArg : CB.args()) { PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType()); if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS && Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) @@ -156,8 +155,8 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { // Check if call is just a wrapper around another call. // In this case we only have call and ret instructions. -static bool isWrapperOnlyCall(CallSite CS) { - Function *Callee = CS.getCalledFunction(); +static bool isWrapperOnlyCall(CallBase &CB) { + Function *Callee = CB.getCalledFunction(); if (!Callee || Callee->size() != 1) return false; const BasicBlock &BB = Callee->getEntryBlock(); @@ -174,32 +173,32 @@ static bool isWrapperOnlyCall(CallSite CS) { return false; } -InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { - Function *Callee = CS.getCalledFunction(); - Function *Caller = CS.getCaller(); +InlineCost AMDGPUInliner::getInlineCost(CallBase &CB) { + Function *Callee = CB.getCalledFunction(); + Function *Caller = CB.getCaller(); if (!Callee || Callee->isDeclaration()) return llvm::InlineCost::getNever("undefined callee"); - if (CS.isNoInline()) + if (CB.isNoInline()) return llvm::InlineCost::getNever("noinline"); TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); if (!TTI.areInlineCompatible(Caller, Callee)) return llvm::InlineCost::getNever("incompatible"); - if (CS.hasFnAttr(Attribute::AlwaysInline)) { + if (CB.hasFnAttr(Attribute::AlwaysInline)) { auto IsViable = isInlineViable(*Callee); - if (IsViable) + if (IsViable.isSuccess()) return llvm::InlineCost::getAlways("alwaysinline viable"); - return llvm::InlineCost::getNever(IsViable.message); + return llvm::InlineCost::getNever(IsViable.getFailureReason()); } - if (isWrapperOnlyCall(CS)) + if (isWrapperOnlyCall(CB)) return llvm::InlineCost::getAlways("wrapper-only call"); InlineParams LocalParams = Params; - LocalParams.DefaultThreshold = (int)getInlineThreshold(CS); + LocalParams.DefaultThreshold = (int)getInlineThreshold(CB); bool RemarksEnabled = false; const auto &BBs = Caller->getBasicBlockList(); if (!BBs.empty()) { @@ -209,14 +208,13 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { } OptimizationRemarkEmitter ORE(Caller); - std::function<AssumptionCache &(Function &)> GetAssumptionCache = - [this](Function &F) -> AssumptionCache & { + auto GetAssumptionCache = [this](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; - auto IC = llvm::getInlineCost(cast<CallBase>(*CS.getInstruction()), Callee, - LocalParams, TTI, GetAssumptionCache, None, PSI, - RemarksEnabled ? &ORE : nullptr); + auto IC = llvm::getInlineCost(CB, Callee, LocalParams, TTI, + GetAssumptionCache, GetTLI, nullptr, PSI, + RemarksEnabled ? 
&ORE : nullptr); if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) { // Single BB does not increase total BB amount, thus subtract 1 diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 9951cbf2326e3..6c13bc8599dbb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "AMDGPUInstrInfo.h" -#include "AMDGPURegisterInfo.h" #include "AMDGPUTargetMachine.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineFrameInfo.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 698189e14c21e..61b78acad3f4b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -55,6 +55,9 @@ struct ImageDimIntrinsicInfo { }; const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr); +const ImageDimIntrinsicInfo *getImageDimInstrinsicByBaseOpcode(unsigned BaseOpcode, + unsigned Dim); + } // end AMDGPU namespace } // End llvm namespace diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 50c451be4b867..894677ec68b60 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file contains DAG node defintions for the AMDGPU target. +// This file contains DAG node definitions for the AMDGPU target. // //===----------------------------------------------------------------------===// @@ -18,10 +18,6 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> ]>; -def AMDGPUTrigPreOp : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] ->; - def AMDGPULdExpOp : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] >; @@ -121,8 +117,6 @@ def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; -// out = 1.0 / sqrt(a) -def AMDGPUrsq_legacy_impl : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; def AMDGPUrcp_legacy_impl : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>; def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>; @@ -151,7 +145,7 @@ def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, [] >; -def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp, +def AMDGPUfmul_legacy_impl : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp, [SDNPCommutative, SDNPAssociative] >; @@ -204,13 +198,6 @@ def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>; -def AMDGPUSetRegOp : SDTypeProfile<0, 2, [ - SDTCisInt<0>, SDTCisInt<1> -]>; - -def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [ - SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; - def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; @@ -238,7 +225,7 @@ def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>; // Special case divide FMA with scale and flags (src0 = Quotient, // src1 = Denominator, src2 = Numerator). 
-def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp, +def AMDGPUdiv_fmas_impl : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp, [SDNPOptInGlue]>; // Single or double precision division fixup. @@ -248,9 +235,6 @@ def AMDGPUdiv_fixup_impl : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; def AMDGPUfmad_ftz_impl : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>; -// Look Up 2.0 / pi src0 with segment select src1[4:0] -def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>; - def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>, [SDNPHasChain, SDNPMayLoad]>; @@ -278,18 +262,18 @@ def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP", def AMDGPUround : SDNode<"ISD::FROUND", SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>; -def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; -def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; +def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; +def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; -def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; -def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>; +def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>; +def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>; -def AMDGPUffbl_b32 : SDNode<"AMDGPUISD::FFBL_B32", SDTIntUnaryOp>; +def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>; // Signed and unsigned 24-bit multiply. The highest 8-bits are ignore -// when performing the mulitply. The result is a 32-bit value. +// when performing the multiply. The result is a 32-bit value. 
def AMDGPUmul_u24_impl : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, [SDNPCommutative, SDNPAssociative] >; @@ -321,7 +305,7 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; -def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", +def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2", SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, SDTCisFP<0>, SDTCisVec<1>, SDTCisInt<4>]>, @@ -329,21 +313,6 @@ def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; -def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16", - SDTypeProfile<1, 7, [SDTCisFP<0>]>, - [SDNPInGlue, SDNPOutGlue]>; - -def AMDGPUinterp_p1lv_f16 : SDNode<"AMDGPUISD::INTERP_P1LV_F16", - SDTypeProfile<1, 9, [SDTCisFP<0>]>, - [SDNPInGlue, SDNPOutGlue]>; - -def AMDGPUinterp_p2_f16 : SDNode<"AMDGPUISD::INTERP_P2_F16", - SDTypeProfile<1, 8, [SDTCisFP<0>]>, - [SDNPInGlue]>; - -def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT, - [SDNPHasChain, SDNPSideEffect]>; - // SI+ export def AMDGPUExportOp : SDTypeProfile<0, 8, [ SDTCisInt<0>, // i8 tgt @@ -358,12 +327,6 @@ def AMDGPUExportOp : SDTypeProfile<0, 8, [ ]>; -def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp, - [SDNPHasChain, SDNPMayStore]>; - -def AMDGPUexport_done: SDNode<"AMDGPUISD::EXPORT_DONE", AMDGPUExportOp, - [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>; - def R600ExportOp : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>; @@ -398,7 +361,7 @@ def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPt //===----------------------------------------------------------------------===// -// Intrinsic/Custom node compatability PatFrags +// Intrinsic/Custom node compatibility PatFrags //===----------------------------------------------------------------------===// def AMDGPUrcp : PatFrags<(ops node:$src), [(int_amdgcn_rcp node:$src), @@ -406,9 +369,6 @@ def AMDGPUrcp : PatFrags<(ops node:$src), [(int_amdgcn_rcp node:$src), def AMDGPUrcp_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rcp_legacy node:$src), (AMDGPUrcp_legacy_impl node:$src)]>; -def AMDGPUrsq_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rsq_legacy node:$src), - (AMDGPUrsq_legacy_impl node:$src)]>; - def AMDGPUrsq : PatFrags<(ops node:$src), [(int_amdgcn_rsq node:$src), (AMDGPUrsq_impl node:$src)]>; @@ -442,6 +402,14 @@ def AMDGPUffbh_i32 : PatFrags<(ops node:$src), [(int_amdgcn_sffbh node:$src), (AMDGPUffbh_i32_impl node:$src)]>; +def AMDGPUffbh_u32 : PatFrags<(ops node:$src), + [(ctlz_zero_undef node:$src), + (AMDGPUffbh_u32_impl node:$src)]>; + +def AMDGPUffbl_b32 : PatFrags<(ops node:$src), + [(cttz_zero_undef node:$src), + (AMDGPUffbl_b32_impl node:$src)]>; + def AMDGPUpkrtz_f16_f32 : PatFrags<(ops node:$src0, node:$src1), [(int_amdgcn_cvt_pkrtz node:$src0, node:$src1), (AMDGPUpkrtz_f16_f32_impl node:$src0, node:$src1)]>; @@ -473,3 +441,23 @@ def AMDGPUmul_u24 : PatFrags<(ops node:$src0, node:$src1), def AMDGPUmul_i24 : PatFrags<(ops node:$src0, node:$src1), [(int_amdgcn_mul_i24 node:$src0, node:$src1), (AMDGPUmul_i24_impl node:$src0, node:$src1)]>; + +def AMDGPUbfe_i32 : PatFrags<(ops node:$src0, node:$src1, node:$src2), + [(int_amdgcn_sbfe node:$src0, node:$src1, node:$src2), + (AMDGPUbfe_i32_impl node:$src0, node:$src1, node:$src2)]>; + +def AMDGPUbfe_u32 : PatFrags<(ops node:$src0, node:$src1, node:$src2), + [(int_amdgcn_ubfe node:$src0, node:$src1, node:$src2), + (AMDGPUbfe_u32_impl node:$src0, node:$src1, node:$src2)]>; + 
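For reference, a minimal C++ sketch (an editorial illustration, not part of this change) of what the signed/unsigned bitfield-extract patterns above compute, assuming the (src, offset, width) operand order of the llvm.amdgcn.ubfe/sbfe intrinsics and the 5-bit offset/width fields of the V_BFE machine instructions:

  #include <cstdint>

  // Unsigned bitfield extract: 'Width' bits of 'Src' starting at bit 'Offset'.
  static uint32_t bfe_u32(uint32_t Src, uint32_t Offset, uint32_t Width) {
    Offset &= 31;
    Width &= 31;           // a width field of 0 yields 0
    if (Width == 0)
      return 0;
    return (Src >> Offset) & ((1u << Width) - 1u);
  }

  // Signed variant: same extraction, then sign-extend from the top extracted bit.
  static int32_t bfe_i32(int32_t Src, uint32_t Offset, uint32_t Width) {
    Width &= 31;
    if (Width == 0)
      return 0;
    uint32_t U = bfe_u32(static_cast<uint32_t>(Src), Offset, Width);
    uint32_t SignBit = 1u << (Width - 1);
    return static_cast<int32_t>((U ^ SignBit) - SignBit);
  }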
+def AMDGPUfmul_legacy : PatFrags<(ops node:$src0, node:$src1), + [(int_amdgcn_fmul_legacy node:$src0, node:$src1), + (AMDGPUfmul_legacy_impl node:$src0, node:$src1)]>; + +def AMDGPUfdot2 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$clamp), + [(int_amdgcn_fdot2 node:$src0, node:$src1, node:$src2, node:$clamp), + (AMDGPUfdot2_impl node:$src0, node:$src1, node:$src2, node:$clamp)]>; + +def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc), + [(int_amdgcn_div_fmas node:$src0, node:$src1, node:$src2, node:$vcc), + (AMDGPUdiv_fmas_impl node:$src0, node:$src1, node:$src2, node:$vcc)]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index c0ea35817ec8e..2025c0fa5d21b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -15,7 +15,6 @@ #include "AMDGPUInstrInfo.h" #include "AMDGPUGlobalISelUtils.h" #include "AMDGPURegisterBankInfo.h" -#include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -40,6 +39,12 @@ using namespace llvm; using namespace MIPatternMatch; +static cl::opt<bool> AllowRiskySelect( + "amdgpu-global-isel-risky-select", + cl::desc("Allow GlobalISel to select cases that are likely to not work yet"), + cl::init(false), + cl::ReallyHidden); + #define GET_GLOBALISEL_IMPL #define AMDGPUSubtarget GCNSubtarget #include "AMDGPUGenGlobalISel.inc" @@ -88,6 +93,30 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg, return RB->getID() == AMDGPU::VCCRegBankID; } +bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, + unsigned NewOpc) const { + MI.setDesc(TII.get(NewOpc)); + MI.RemoveOperand(1); // Remove intrinsic ID. + MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); + + MachineOperand &Dst = MI.getOperand(0); + MachineOperand &Src = MI.getOperand(1); + + // TODO: This should be legalized to s32 if needed + if (MRI->getType(Dst.getReg()) == LLT::scalar(1)) + return false; + + const TargetRegisterClass *DstRC + = TRI.getConstrainedRegClassForOperand(Dst, *MRI); + const TargetRegisterClass *SrcRC + = TRI.getConstrainedRegClassForOperand(Src, *MRI); + if (!DstRC || DstRC != SrcRC) + return false; + + return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && + RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); +} + bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock *BB = I.getParent(); @@ -173,6 +202,14 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { const Register DefReg = I.getOperand(0).getReg(); const LLT DefTy = MRI->getType(DefReg); + if (DefTy == LLT::scalar(1)) { + if (!AllowRiskySelect) { + LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n"); + } // TODO: Verify this doesn't have insane operands (i.e. 
VGPR to SGPR copy) @@ -261,6 +298,11 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), RC == &AMDGPU::SReg_64RegClass); I.setDesc(TII.get(InstOpc)); + // Dead implicit-def of scc + I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef + true, // isImp + false, // isKill + true)); // isDead // FIXME: Hack to avoid turning the register bank into a register class. // The selector for G_ICMP relies on seeing the register bank for the result @@ -295,7 +337,11 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { MachineFunction *MF = BB->getParent(); Register DstReg = I.getOperand(0).getReg(); const DebugLoc &DL = I.getDebugLoc(); - unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); + LLT Ty = MRI->getType(DstReg); + if (Ty.isVector()) + return false; + + unsigned Size = Ty.getSizeInBits(); const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; @@ -445,6 +491,7 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( return true; } +// TODO: We should probably legalize these to only using 32-bit results. bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); Register DstReg = I.getOperand(0).getReg(); @@ -452,11 +499,21 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { LLT DstTy = MRI->getType(DstReg); LLT SrcTy = MRI->getType(SrcReg); const unsigned SrcSize = SrcTy.getSizeInBits(); - const unsigned DstSize = DstTy.getSizeInBits(); + unsigned DstSize = DstTy.getSizeInBits(); // TODO: Should handle any multiple of 32 offset. unsigned Offset = I.getOperand(2).getImm(); - if (Offset % DstSize != 0) + if (Offset % 32 != 0 || DstSize > 128) + return false; + + // 16-bit operations really use 32-bit registers. + // FIXME: Probably should not allow 16-bit G_EXTRACT results. 
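  // For illustration (not from the original change): with the widening below,
  // an s16 G_EXTRACT at bit offset 32 of an s64 source is selected as a plain
  // COPY of the sub1 half of the source register; only the low 16 bits of that
  // 32-bit copy are subsequently used.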
+ if (DstSize == 16) + DstSize = 32; + + const TargetRegisterClass *DstRC = + TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI); + if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) return false; const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); @@ -464,20 +521,18 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); if (!SrcRC) return false; + unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32, + DstSize / 32); + SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg); + if (!SrcRC) + return false; - ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); - + SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I, + *SrcRC, I.getOperand(1)); const DebugLoc &DL = I.getDebugLoc(); - MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) - .addReg(SrcReg, 0, SubRegs[Offset / DstSize]); + BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) + .addReg(SrcReg, 0, SubReg); - for (const MachineOperand &MO : Copy->operands()) { - const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, *MRI); - if (!RC) - continue; - RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); - } I.eraseFromParent(); return true; } @@ -563,6 +618,90 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { return true; } +static bool isZero(Register Reg, const MachineRegisterInfo &MRI) { + int64_t Val; + return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0; +} + +bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( + MachineInstr &MI) const { + if (selectImpl(MI, *CoverageInfo)) + return true; + + const LLT S32 = LLT::scalar(32); + const LLT V2S16 = LLT::vector(2, 16); + + Register Dst = MI.getOperand(0).getReg(); + if (MRI->getType(Dst) != V2S16) + return false; + + const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI); + if (DstBank->getID() != AMDGPU::SGPRRegBankID) + return false; + + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); + if (MRI->getType(Src0) != S32) + return false; + + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock *BB = MI.getParent(); + + // TODO: This should probably be a combine somewhere + // (build_vector_trunc $src0, undef -> copy $src0 + MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI); + if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) { + MI.setDesc(TII.get(AMDGPU::COPY)); + MI.RemoveOperand(2); + return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) && + RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI); + } + + Register ShiftSrc0; + Register ShiftSrc1; + int64_t ShiftAmt; + + // With multiple uses of the shift, this will duplicate the shift and + // increase register pressure. 
+ // + // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16) + // => (S_PACK_HH_B32_B16 $src0, $src1) + // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16)) + // => (S_PACK_LH_B32_B16 $src0, $src1) + // (build_vector_trunc $src0, $src1) + // => (S_PACK_LL_B32_B16 $src0, $src1) + + // FIXME: This is an inconvenient way to check a specific value + bool Shift0 = mi_match( + Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) && + ShiftAmt == 16; + + bool Shift1 = mi_match( + Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) && + ShiftAmt == 16; + + unsigned Opc = AMDGPU::S_PACK_LL_B32_B16; + if (Shift0 && Shift1) { + Opc = AMDGPU::S_PACK_HH_B32_B16; + MI.getOperand(1).setReg(ShiftSrc0); + MI.getOperand(2).setReg(ShiftSrc1); + } else if (Shift1) { + Opc = AMDGPU::S_PACK_LH_B32_B16; + MI.getOperand(2).setReg(ShiftSrc1); + } else if (Shift0 && isZero(Src1, *MRI)) { + // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 + auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) + .addReg(ShiftSrc0) + .addImm(16); + + MI.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + } + + MI.setDesc(TII.get(Opc)); + return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); +} + bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const { return selectG_ADD_SUB(I); } @@ -594,7 +733,9 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { unsigned InsSize = Src1Ty.getSizeInBits(); int64_t Offset = I.getOperand(3).getImm(); - if (Offset % 32 != 0) + + // FIXME: These cases should have been illegal and unnecessary to check here. + if (Offset % 32 != 0 || InsSize % 32 != 0) return false; unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); @@ -617,7 +758,7 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { // Deal with weird cases where the class only partially supports the subreg // index. Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg); - if (!Src0RC) + if (!Src0RC || !Src1RC) return false; if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || @@ -635,6 +776,85 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { return true; } +bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const { + if (STI.getLDSBankCount() != 16) + return selectImpl(MI, *CoverageInfo); + + Register Dst = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(2).getReg(); + Register M0Val = MI.getOperand(6).getReg(); + if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) || + !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) || + !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI)) + return false; + + // This requires 2 instructions. It is possible to write a pattern to support + // this, but the generated isel emitter doesn't correctly deal with multiple + // output instructions using the same physical register input. The copy to m0 + // is incorrectly placed before the second instruction. + // + // TODO: Match source modifiers. 
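  // The hand-emitted sequence below is, schematically (illustration only):
  //   $m0         = COPY %m0val
  //   %interp_mov = V_INTERP_MOV_F32 2 /*p0*/, $attr, $attrchan      ; reads m0
  //   %dst        = V_INTERP_P1LV_F16 %src0, $attr, $attrchan,
  //                                   %interp_mov, $high             ; reads m0
  // Both instructions must observe the same m0 value, which is why the copy
  // cannot be allowed to drift between them.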
+ + Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock *MBB = MI.getParent(); + + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) + .addReg(M0Val); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov) + .addImm(2) + .addImm(MI.getOperand(4).getImm()) // $attr + .addImm(MI.getOperand(3).getImm()); // $attrchan + + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst) + .addImm(0) // $src0_modifiers + .addReg(Src0) // $src0 + .addImm(MI.getOperand(4).getImm()) // $attr + .addImm(MI.getOperand(3).getImm()) // $attrchan + .addImm(0) // $src2_modifiers + .addReg(InterpMov) // $src2 - 2 f16 values selected by high + .addImm(MI.getOperand(5).getImm()) // $high + .addImm(0) // $clamp + .addImm(0); // $omod + + MI.eraseFromParent(); + return true; +} + +// We need to handle this here because tablegen doesn't support matching +// instructions with multiple outputs. +bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const { + Register Dst0 = MI.getOperand(0).getReg(); + Register Dst1 = MI.getOperand(1).getReg(); + + LLT Ty = MRI->getType(Dst0); + unsigned Opc; + if (Ty == LLT::scalar(32)) + Opc = AMDGPU::V_DIV_SCALE_F32; + else if (Ty == LLT::scalar(64)) + Opc = AMDGPU::V_DIV_SCALE_F64; + else + return false; + + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock *MBB = MI.getParent(); + + Register Numer = MI.getOperand(3).getReg(); + Register Denom = MI.getOperand(4).getReg(); + unsigned ChooseDenom = MI.getOperand(5).getImm(); + + Register Src0 = ChooseDenom != 0 ? Numer : Denom; + + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0) + .addDef(Dst1) + .addUse(Src0) + .addUse(Denom) + .addUse(Numer); + + MI.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); +} + bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { unsigned IntrinsicID = I.getIntrinsicID(); switch (IntrinsicID) { @@ -659,6 +879,20 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { return true; } + case Intrinsic::amdgcn_interp_p1_f16: + return selectInterpP1F16(I); + case Intrinsic::amdgcn_wqm: + return constrainCopyLikeIntrin(I, AMDGPU::WQM); + case Intrinsic::amdgcn_softwqm: + return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM); + case Intrinsic::amdgcn_wwm: + return constrainCopyLikeIntrin(I, AMDGPU::WWM); + case Intrinsic::amdgcn_div_scale: + return selectDivScale(I); + case Intrinsic::amdgcn_icmp: + return selectIntrinsicIcmp(I); + case Intrinsic::amdgcn_ballot: + return selectBallot(I); default: return selectImpl(I, *CoverageInfo); } @@ -779,247 +1013,79 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const { return Ret; } -static MachineInstr * -buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt, - unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3, - unsigned VM, bool Compr, unsigned Enabled, bool Done) { - const DebugLoc &DL = Insert->getDebugLoc(); - MachineBasicBlock &BB = *Insert->getParent(); - unsigned Opcode = Done ? 
AMDGPU::EXP_DONE : AMDGPU::EXP; - return BuildMI(BB, Insert, DL, TII.get(Opcode)) - .addImm(Tgt) - .addReg(Reg0) - .addReg(Reg1) - .addReg(Reg2) - .addReg(Reg3) - .addImm(VM) - .addImm(Compr) - .addImm(Enabled); -} - -static bool isZero(Register Reg, MachineRegisterInfo &MRI) { - int64_t C; - if (mi_match(Reg, MRI, m_ICst(C)) && C == 0) - return true; - - // FIXME: matcher should ignore copies - return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0; -} +bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const { + Register Dst = I.getOperand(0).getReg(); + if (isVCC(Dst, *MRI)) + return false; -static unsigned extractGLC(unsigned AuxiliaryData) { - return AuxiliaryData & 1; -} + if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize()) + return false; -static unsigned extractSLC(unsigned AuxiliaryData) { - return (AuxiliaryData >> 1) & 1; -} + MachineBasicBlock *BB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + Register SrcReg = I.getOperand(2).getReg(); + unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); + auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm()); -static unsigned extractDLC(unsigned AuxiliaryData) { - return (AuxiliaryData >> 2) & 1; -} + int Opcode = getV_CMPOpcode(Pred, Size); + if (Opcode == -1) + return false; -static unsigned extractSWZ(unsigned AuxiliaryData) { - return (AuxiliaryData >> 3) & 1; + MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst) + .add(I.getOperand(2)) + .add(I.getOperand(3)); + RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(), + *MRI); + bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); + I.eraseFromParent(); + return Ret; } -static unsigned getBufferStoreOpcode(LLT Ty, - const unsigned MemSize, - const bool Offen) { - const int Size = Ty.getSizeInBits(); - switch (8 * MemSize) { - case 8: - return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : - AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; - case 16: - return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : - AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; - default: - unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : - AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; - if (Size > 32) - Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); - return Opc; - } -} - -static unsigned getBufferStoreFormatOpcode(LLT Ty, - const unsigned MemSize, - const bool Offen) { - bool IsD16Packed = Ty.getScalarSizeInBits() == 16; - bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits(); - int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; - - if (IsD16Packed) { - switch (NumElts) { - case 1: - return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact : - AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact; - case 2: - return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact : - AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact; - case 3: - return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact : - AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact; - case 4: - return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact : - AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact; - default: - return -1; - } - } - - if (IsD16Unpacked) { - switch (NumElts) { - case 1: - return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact : - AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact; - case 2: - return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact : - AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact; - case 3: - return Offen ? 
AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact : - AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact; - case 4: - return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact : - AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact; - default: - return -1; - } - } - - switch (NumElts) { - case 1: - return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact : - AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact; - case 2: - return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact : - AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact; - case 3: - return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact : - AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact; - case 4: - return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact : - AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact; - default: - return -1; - } +bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + Register DstReg = I.getOperand(0).getReg(); + const unsigned Size = MRI->getType(DstReg).getSizeInBits(); + const bool Is64 = Size == 64; - llvm_unreachable("unhandled buffer store"); -} - -// TODO: Move this to combiner -// Returns base register, imm offset, total constant offset. -std::tuple<Register, unsigned, unsigned> -AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B, - Register OrigOffset) const { - const unsigned MaxImm = 4095; - Register BaseReg; - unsigned TotalConstOffset; - MachineInstr *OffsetDef; - - std::tie(BaseReg, TotalConstOffset, OffsetDef) - = AMDGPU::getBaseWithConstantOffset(*MRI, OrigOffset); - - unsigned ImmOffset = TotalConstOffset; - - // If the immediate value is too big for the immoffset field, put the value - // and -4096 into the immoffset field so that the value that is copied/added - // for the voffset field is a multiple of 4096, and it stands more chance - // of being CSEd with the copy/add for another similar load/store.f - // However, do not do that rounding down to a multiple of 4096 if that is a - // negative number, as it appears to be illegal to have a negative offset - // in the vgpr, even if adding the immediate offset makes it positive. - unsigned Overflow = ImmOffset & ~MaxImm; - ImmOffset -= Overflow; - if ((int32_t)Overflow < 0) { - Overflow += ImmOffset; - ImmOffset = 0; - } - - if (Overflow != 0) { - // In case this is in a waterfall loop, insert offset code at the def point - // of the offset, not inside the loop. - MachineBasicBlock::iterator OldInsPt = B.getInsertPt(); - MachineBasicBlock &OldMBB = B.getMBB(); - B.setInstr(*OffsetDef); - - if (!BaseReg) { - BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - B.buildInstr(AMDGPU::V_MOV_B32_e32) - .addDef(BaseReg) - .addImm(Overflow); - } else { - Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - B.buildInstr(AMDGPU::V_MOV_B32_e32) - .addDef(OverflowVal) - .addImm(Overflow); - - Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg) - .addReg(BaseReg) - .addReg(OverflowVal, RegState::Kill) - .addImm(0); - BaseReg = NewBaseReg; - } + if (Size != STI.getWavefrontSize()) + return false; - B.setInsertPt(OldMBB, OldInsPt); + Optional<ValueAndVReg> Arg = + getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true); + + if (Arg.hasValue()) { + const int64_t Value = Arg.getValue().Value; + if (Value == 0) { + unsigned Opcode = Is64 ? 
AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; + BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); + } else if (Value == -1) { // all ones + Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO; + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg); + } else + return false; + } else { + Register SrcReg = I.getOperand(2).getReg(); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg); } - return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); + I.eraseFromParent(); + return true; } -bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, - bool IsFormat) const { - MachineIRBuilder B(MI); - MachineFunction &MF = B.getMF(); - Register VData = MI.getOperand(1).getReg(); - LLT Ty = MRI->getType(VData); - - int Size = Ty.getSizeInBits(); - if (Size % 32 != 0) - return false; - - // FIXME: Verifier should enforce 1 MMO for these intrinsics. - MachineMemOperand *MMO = *MI.memoperands_begin(); - const int MemSize = MMO->getSize(); - - Register RSrc = MI.getOperand(2).getReg(); - Register VOffset = MI.getOperand(3).getReg(); - Register SOffset = MI.getOperand(4).getReg(); - unsigned AuxiliaryData = MI.getOperand(5).getImm(); - unsigned ImmOffset; - unsigned TotalOffset; - - std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); - if (TotalOffset != 0) - MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize); - - const bool Offen = !isZero(VOffset, *MRI); - - int Opc = IsFormat ? getBufferStoreFormatOpcode(Ty, MemSize, Offen) : - getBufferStoreOpcode(Ty, MemSize, Offen); - if (Opc == -1) - return false; - - MachineInstrBuilder MIB = B.buildInstr(Opc) - .addUse(VData); - - if (Offen) - MIB.addUse(VOffset); - - MIB.addUse(RSrc) - .addUse(SOffset) - .addImm(ImmOffset) - .addImm(extractGLC(AuxiliaryData)) - .addImm(extractSLC(AuxiliaryData)) - .addImm(0) // tfe: FIXME: Remove from inst - .addImm(extractDLC(AuxiliaryData)) - .addImm(extractSWZ(AuxiliaryData)) - .addMemOperand(MMO); +bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const { + // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick + // SelectionDAG uses for wave32 vs wave64. 
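  // SI_END_CF consumes the exec mask saved when the divergent region was
  // entered, so if the operand has not been assigned a register class yet it
  // is constrained to the wave-mask class below (a 32-bit or 64-bit SGPR
  // class, depending on the wavefront size).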
+ MachineBasicBlock *BB = MI.getParent(); + BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) + .add(MI.getOperand(1)); + Register Reg = MI.getOperand(1).getReg(); MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + if (!MRI->getRegClassOrNull(Reg)) + MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); + return true; } static unsigned getDSShaderTypeValue(const MachineFunction &MF) { @@ -1106,70 +1172,458 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( return Ret; } -bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( - MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - unsigned IntrinsicID = I.getIntrinsicID(); - switch (IntrinsicID) { - case Intrinsic::amdgcn_exp: { - int64_t Tgt = I.getOperand(1).getImm(); - int64_t Enabled = I.getOperand(2).getImm(); - int64_t Done = I.getOperand(7).getImm(); - int64_t VM = I.getOperand(8).getImm(); - - MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(), - I.getOperand(4).getReg(), - I.getOperand(5).getReg(), - I.getOperand(6).getReg(), - VM, false, Enabled, Done); +static unsigned gwsIntrinToOpcode(unsigned IntrID) { + switch (IntrID) { + case Intrinsic::amdgcn_ds_gws_init: + return AMDGPU::DS_GWS_INIT; + case Intrinsic::amdgcn_ds_gws_barrier: + return AMDGPU::DS_GWS_BARRIER; + case Intrinsic::amdgcn_ds_gws_sema_v: + return AMDGPU::DS_GWS_SEMA_V; + case Intrinsic::amdgcn_ds_gws_sema_br: + return AMDGPU::DS_GWS_SEMA_BR; + case Intrinsic::amdgcn_ds_gws_sema_p: + return AMDGPU::DS_GWS_SEMA_P; + case Intrinsic::amdgcn_ds_gws_sema_release_all: + return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; + default: + llvm_unreachable("not a gws intrinsic"); + } +} - I.eraseFromParent(); - return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI); +bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, + Intrinsic::ID IID) const { + if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && + !STI.hasGWSSemaReleaseAll()) + return false; + + // intrinsic ID, vsrc, offset + const bool HasVSrc = MI.getNumOperands() == 3; + assert(HasVSrc || MI.getNumOperands() == 2); + + Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg(); + const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI); + if (OffsetRB->getID() != AMDGPU::SGPRRegBankID) + return false; + + MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); + assert(OffsetDef); + + unsigned ImmOffset; + + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + MachineInstr *Readfirstlane = nullptr; + + // If we legalized the VGPR input, strip out the readfirstlane to analyze the + // incoming offset, in case there's an add of a constant. We'll have to put it + // back later. 
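  // Illustrative example: for an offset defined as
  //   %off = V_READFIRSTLANE_B32 (%base + 16)
  // the code below analyzes the add underneath the readfirstlane as base %base
  // plus constant 16, and later re-attaches the readfirstlane to %base alone
  // so the constant part can be encoded in the instruction's offset field.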
+ if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) { + Readfirstlane = OffsetDef; + BaseOffset = OffsetDef->getOperand(1).getReg(); + OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); } - case Intrinsic::amdgcn_exp_compr: { - const DebugLoc &DL = I.getDebugLoc(); - int64_t Tgt = I.getOperand(1).getImm(); - int64_t Enabled = I.getOperand(2).getImm(); - Register Reg0 = I.getOperand(3).getReg(); - Register Reg1 = I.getOperand(4).getReg(); - Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - int64_t Done = I.getOperand(5).getImm(); - int64_t VM = I.getOperand(6).getImm(); - - BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); - MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM, - true, Enabled, Done); - I.eraseFromParent(); - return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI); + if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) { + // If we have a constant offset, try to use the 0 in m0 as the base. + // TODO: Look into changing the default m0 initialization value. If the + // default -1 only set the low 16-bits, we could leave it as-is and add 1 to + // the immediate offset. + + ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue(); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addImm(0); + } else { + std::tie(BaseOffset, ImmOffset, OffsetDef) + = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset); + + if (Readfirstlane) { + // We have the constant offset now, so put the readfirstlane back on the + // variable component. + if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI)) + return false; + + Readfirstlane->getOperand(1).setReg(BaseOffset); + BaseOffset = Readfirstlane->getOperand(0).getReg(); + } else { + if (!RBI.constrainGenericRegister(BaseOffset, + AMDGPU::SReg_32RegClass, *MRI)) + return false; + } + + Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) + .addReg(BaseOffset) + .addImm(16); + + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) + .addReg(M0Base); } - case Intrinsic::amdgcn_end_cf: { - // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick - // SelectionDAG uses for wave32 vs wave64. - BuildMI(*BB, &I, I.getDebugLoc(), - TII.get(AMDGPU::SI_END_CF)) - .add(I.getOperand(1)); - Register Reg = I.getOperand(1).getReg(); - I.eraseFromParent(); + // The resource id offset is computed as (<isa opaque base> + M0[21:16] + + // offset field) % 64. Some versions of the programming guide omit the m0 + // part, or claim it's from offset 0. + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); - if (!MRI->getRegClassOrNull(Reg)) - MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); - return true; + if (HasVSrc) { + Register VSrc = MI.getOperand(1).getReg(); + MIB.addReg(VSrc); + if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) + return false; + } + + MIB.addImm(ImmOffset) + .addImm(-1) // $gds + .cloneMemRefs(MI); + + MI.eraseFromParent(); + return true; +} + +bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, + bool IsAppend) const { + Register PtrBase = MI.getOperand(2).getReg(); + LLT PtrTy = MRI->getType(PtrBase); + bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS; + + unsigned Offset; + std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2)); + + // TODO: Should this try to look through readfirstlane like GWS? 
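  // The DS_APPEND/DS_CONSUME forms used below take their base address in m0
  // plus a 16-bit unsigned immediate offset; if the folded constant does not
  // satisfy isDSOffsetLegal for that field, fall back to the original pointer
  // with an offset of zero.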
+ if (!isDSOffsetLegal(PtrBase, Offset, 16)) { + PtrBase = MI.getOperand(2).getReg(); + Offset = 0; + } + + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; + + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) + .addReg(PtrBase); + BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) + .addImm(Offset) + .addImm(IsGDS ? -1 : 0) + .cloneMemRefs(MI); + MI.eraseFromParent(); + return true; +} + +static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, + bool &IsTexFail) { + if (TexFailCtrl) + IsTexFail = true; + + TFE = (TexFailCtrl & 0x1) ? 1 : 0; + TexFailCtrl &= ~(uint64_t)0x1; + LWE = (TexFailCtrl & 0x2) ? 1 : 0; + TexFailCtrl &= ~(uint64_t)0x2; + + return TexFailCtrl == 0; +} + +static bool parseCachePolicy(uint64_t Value, + bool *GLC, bool *SLC, bool *DLC) { + if (GLC) { + *GLC = (Value & 0x1) ? 1 : 0; + Value &= ~(uint64_t)0x1; + } + if (SLC) { + *SLC = (Value & 0x2) ? 1 : 0; + Value &= ~(uint64_t)0x2; + } + if (DLC) { + *DLC = (Value & 0x4) ? 1 : 0; + Value &= ~(uint64_t)0x4; + } + + return Value == 0; +} + +bool AMDGPUInstructionSelector::selectImageIntrinsic( + MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + + const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); + const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = + AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode); + const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo = + AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode); + unsigned IntrOpcode = Intr->BaseOpcode; + const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10; + + const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode, + MI.getNumExplicitDefs()); + int NumVAddr, NumGradients; + std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode); + + Register VDataIn, VDataOut; + LLT VDataTy; + int NumVDataDwords = -1; + bool IsD16 = false; + + // XXX - Can we just get the second to last argument for ctrl? + unsigned CtrlIdx; // Index of texfailctrl argument + bool Unorm; + if (!BaseOpcode->Sampler) { + Unorm = true; + CtrlIdx = VAddrIdx + NumVAddr + 1; + } else { + Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0; + CtrlIdx = VAddrIdx + NumVAddr + 3; + } + + bool TFE; + bool LWE; + bool IsTexFail = false; + if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail)) + return false; + + const int Flags = MI.getOperand(CtrlIdx + 2).getImm(); + const bool IsA16 = (Flags & 1) != 0; + const bool IsG16 = (Flags & 2) != 0; + + // A16 implies 16 bit gradients + if (IsA16 && !IsG16) + return false; + + unsigned DMask = 0; + unsigned DMaskLanes = 0; + + if (BaseOpcode->Atomic) { + VDataOut = MI.getOperand(0).getReg(); + VDataIn = MI.getOperand(2).getReg(); + LLT Ty = MRI->getType(VDataIn); + + // Be careful to allow atomic swap on 16-bit element vectors. + const bool Is64Bit = BaseOpcode->AtomicX2 ? + Ty.getSizeInBits() == 128 : + Ty.getSizeInBits() == 64; + + if (BaseOpcode->AtomicX2) { + assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister); + + DMask = Is64Bit ? 0xf : 0x3; + NumVDataDwords = Is64Bit ? 4 : 2; + } else { + DMask = Is64Bit ? 0x3 : 0x1; + NumVDataDwords = Is64Bit ? 2 : 1; + } + } else { + const int DMaskIdx = 2; // Input/output + intrinsic ID. 
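    // Worked example of the sizing logic below (illustration only): a non-TFE
    // image load of <3 x half> with dmask = 0b0111 has DMaskLanes = 3 and a
    // 6-byte memory operand, so (8 * 6) / 3 = 16 < 32 and the D16 opcode
    // variant is chosen; on subtargets with packed D16 VMEM, NumVDataDwords
    // then becomes (3 + 1) / 2 = 2.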
+ + DMask = MI.getOperand(DMaskIdx).getImm(); + DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask); + + if (BaseOpcode->Store) { + VDataIn = MI.getOperand(1).getReg(); + VDataTy = MRI->getType(VDataIn); + NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32; + } else { + VDataOut = MI.getOperand(0).getReg(); + VDataTy = MRI->getType(VDataOut); + NumVDataDwords = DMaskLanes; + + // One memoperand is mandatory, except for getresinfo. + // FIXME: Check this in verifier. + if (!MI.memoperands_empty()) { + const MachineMemOperand *MMO = *MI.memoperands_begin(); + + // Infer d16 from the memory size, as the register type will be mangled by + // unpacked subtargets, or by TFE. + IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32; + + if (IsD16 && !STI.hasUnpackedD16VMem()) + NumVDataDwords = (DMaskLanes + 1) / 2; + } + } + } + + // Optimize _L to _LZ when _L is zero + if (LZMappingInfo) { + // The legalizer replaced the register with an immediate 0 if we need to + // change the opcode. + const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1); + if (Lod.isImm()) { + assert(Lod.getImm() == 0); + IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l + } + } + + // Optimize _mip away, when 'lod' is zero + if (MIPMappingInfo) { + const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1); + if (Lod.isImm()) { + assert(Lod.getImm() == 0); + IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip + } + } + + // Set G16 opcode + if (IsG16 && !IsA16) { + const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = + AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); + assert(G16MappingInfo); + IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16 + } + + // TODO: Check this in verifier. + assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); + + bool GLC = false; + bool SLC = false; + bool DLC = false; + if (BaseOpcode->Atomic) { + GLC = true; // TODO no-return optimization + if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC, + IsGFX10 ? &DLC : nullptr)) + return false; + } else { + if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC, + IsGFX10 ? &DLC : nullptr)) + return false; + } + + int NumVAddrRegs = 0; + int NumVAddrDwords = 0; + for (int I = 0; I < NumVAddr; ++I) { + // Skip the $noregs and 0s inserted during legalization. + MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I); + if (!AddrOp.isReg()) + continue; // XXX - Break? + + Register Addr = AddrOp.getReg(); + if (!Addr) + break; + + ++NumVAddrRegs; + NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32; + } + + // The legalizer preprocessed the intrinsic arguments. If we aren't using + // NSA, these should have beeen packed into a single value in the first + // address register + const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs; + if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) { + LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n"); + return false; + } + + if (IsTexFail) + ++NumVDataDwords; + + int Opcode = -1; + if (IsGFX10) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, + UseNSA ? 
AMDGPU::MIMGEncGfx10NSA + : AMDGPU::MIMGEncGfx10Default, + NumVDataDwords, NumVAddrDwords); + } else { + if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, + NumVDataDwords, NumVAddrDwords); + if (Opcode == -1) + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, + NumVDataDwords, NumVAddrDwords); + } + assert(Opcode != -1); + + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode)) + .cloneMemRefs(MI); + + if (VDataOut) { + if (BaseOpcode->AtomicX2) { + const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64; + + Register TmpReg = MRI->createVirtualRegister( + Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); + unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; + + MIB.addDef(TmpReg); + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut) + .addReg(TmpReg, RegState::Kill, SubReg); + + } else { + MIB.addDef(VDataOut); // vdata output + } } - case Intrinsic::amdgcn_raw_buffer_store: - return selectStoreIntrinsic(I, false); - case Intrinsic::amdgcn_raw_buffer_store_format: - return selectStoreIntrinsic(I, true); + + if (VDataIn) + MIB.addReg(VDataIn); // vdata input + + for (int i = 0; i != NumVAddrRegs; ++i) { + MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i); + if (SrcOp.isReg()) { + assert(SrcOp.getReg() != 0); + MIB.addReg(SrcOp.getReg()); + } + } + + MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc + if (BaseOpcode->Sampler) + MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler + + MIB.addImm(DMask); // dmask + + if (IsGFX10) + MIB.addImm(DimInfo->Encoding); + MIB.addImm(Unorm); + if (IsGFX10) + MIB.addImm(DLC); + + MIB.addImm(GLC); + MIB.addImm(SLC); + MIB.addImm(IsA16 && // a16 or r128 + STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0); + if (IsGFX10) + MIB.addImm(IsA16 ? -1 : 0); + + MIB.addImm(TFE); // tfe + MIB.addImm(LWE); // lwe + if (!IsGFX10) + MIB.addImm(DimInfo->DA ? -1 : 0); + if (BaseOpcode->HasD16) + MIB.addImm(IsD16 ? 
-1 : 0); + + MI.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); +} + +bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( + MachineInstr &I) const { + unsigned IntrinsicID = I.getIntrinsicID(); + switch (IntrinsicID) { + case Intrinsic::amdgcn_end_cf: + return selectEndCfIntrinsic(I); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: return selectDSOrderedIntrinsic(I, IntrinsicID); - default: + case Intrinsic::amdgcn_ds_gws_init: + case Intrinsic::amdgcn_ds_gws_barrier: + case Intrinsic::amdgcn_ds_gws_sema_v: + case Intrinsic::amdgcn_ds_gws_sema_br: + case Intrinsic::amdgcn_ds_gws_sema_p: + case Intrinsic::amdgcn_ds_gws_sema_release_all: + return selectDSGWSIntrinsic(I, IntrinsicID); + case Intrinsic::amdgcn_ds_append: + return selectDSAppendConsume(I, true); + case Intrinsic::amdgcn_ds_consume: + return selectDSAppendConsume(I, false); + default: { return selectImpl(I, *CoverageInfo); } + } } bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { + if (selectImpl(I, *CoverageInfo)) + return true; + MachineBasicBlock *BB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); @@ -1247,9 +1701,6 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { Register SrcReg = I.getOperand(1).getReg(); const LLT DstTy = MRI->getType(DstReg); const LLT SrcTy = MRI->getType(SrcReg); - if (!DstTy.isScalar()) - return false; - const LLT S1 = LLT::scalar(1); const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); @@ -1264,6 +1715,8 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { return false; } + const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; + unsigned DstSize = DstTy.getSizeInBits(); unsigned SrcSize = SrcTy.getSizeInBits(); @@ -1271,6 +1724,73 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI); const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI); + if (!SrcRC || !DstRC) + return false; + + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || + !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { + LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); + return false; + } + + if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) { + MachineBasicBlock *MBB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + + Register LoReg = MRI->createVirtualRegister(DstRC); + Register HiReg = MRI->createVirtualRegister(DstRC); + BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) + .addReg(SrcReg, 0, AMDGPU::sub0); + BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) + .addReg(SrcReg, 0, AMDGPU::sub1); + + if (IsVALU && STI.hasSDWA()) { + // Write the low 16-bits of the high element into the high 16-bits of the + // low element. 
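      // In scalar terms the whole v2s32 -> v2s16 truncation computes
      //   Dst = (Lo & 0xffff) | (Hi << 16)
      // which is exactly what the non-SDWA fallback further down spells out
      // with explicit shift/and/or instructions.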
+ MachineInstr *MovSDWA = + BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) + .addImm(0) // $src0_modifiers + .addReg(HiReg) // $src0 + .addImm(0) // $clamp + .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel + .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused + .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel + .addReg(LoReg, RegState::Implicit); + MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); + } else { + Register TmpReg0 = MRI->createVirtualRegister(DstRC); + Register TmpReg1 = MRI->createVirtualRegister(DstRC); + Register ImmReg = MRI->createVirtualRegister(DstRC); + if (IsVALU) { + BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) + .addImm(16) + .addReg(HiReg); + } else { + BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) + .addReg(HiReg) + .addImm(16); + } + + unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; + unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; + unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; + + BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) + .addImm(0xffff); + BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) + .addReg(LoReg) + .addReg(ImmReg); + BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) + .addReg(TmpReg0) + .addReg(TmpReg1); + } + + I.eraseFromParent(); + return true; + } + + if (!DstTy.isScalar()) + return false; if (SrcSize > 32) { int SubRegIdx = sizeToSubRegIndex(DstSize); @@ -1279,17 +1799,17 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { // Deal with weird cases where the class only partially supports the subreg // index. - SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); - if (!SrcRC) + const TargetRegisterClass *SrcWithSubRC + = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); + if (!SrcWithSubRC) return false; - I.getOperand(1).setSubReg(SubRegIdx); - } + if (SrcWithSubRC != SrcRC) { + if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) + return false; + } - if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || - !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { - LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); - return false; + I.getOperand(1).setSubReg(SubRegIdx); } I.setDesc(TII.get(TargetOpcode::COPY)); @@ -1318,7 +1838,8 @@ const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( } bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { - bool Signed = I.getOpcode() == AMDGPU::G_SEXT; + bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; + bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; const DebugLoc &DL = I.getDebugLoc(); MachineBasicBlock &MBB = *I.getParent(); const Register DstReg = I.getOperand(0).getReg(); @@ -1326,7 +1847,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { const LLT DstTy = MRI->getType(DstReg); const LLT SrcTy = MRI->getType(SrcReg); - const unsigned SrcSize = SrcTy.getSizeInBits(); + const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? + I.getOperand(2).getImm() : SrcTy.getSizeInBits(); const unsigned DstSize = DstTy.getSizeInBits(); if (!DstTy.isScalar()) return false; @@ -1362,7 +1884,9 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { } if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { - if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) + const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 
+ AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; + if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) return false; if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { @@ -1378,13 +1902,15 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. - if (DstSize > 32 && SrcSize <= 32) { + if (DstSize > 32 && (SrcSize <= 32 || InReg)) { // We need a 64-bit register source, but the high bits don't matter. Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned SubReg = InReg ? AMDGPU::sub0 : 0; + BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) - .addReg(SrcReg) + .addReg(SrcReg, 0, SubReg) .addImm(AMDGPU::sub0) .addReg(UndefReg) .addImm(AMDGPU::sub1); @@ -1487,6 +2013,103 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); } +bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { + // Only manually handle the f64 SGPR case. + // + // FIXME: This is a workaround for 2.5 different tablegen problems. Because + // the bit ops theoretically have a second result due to the implicit def of + // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing + // that is easy by disabling the check. The result works, but uses a + // nonsensical sreg32orlds_and_sreg_1 regclass. + // + // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to + // the variadic REG_SEQUENCE operands. + + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); + if (DstRB->getID() != AMDGPU::SGPRRegBankID || + MRI->getType(Dst) != LLT::scalar(64)) + return false; + + Register Src = MI.getOperand(1).getReg(); + MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); + if (Fabs) + Src = Fabs->getOperand(1).getReg(); + + if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || + !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) + return false; + + MachineBasicBlock *BB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) + .addReg(Src, 0, AMDGPU::sub0); + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) + .addReg(Src, 0, AMDGPU::sub1); + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) + .addImm(0x80000000); + + // Set or toggle sign bit. + unsigned Opc = Fabs ? 
AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32; + BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg) + .addReg(HiReg) + .addReg(ConstReg); + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) + .addReg(LoReg) + .addImm(AMDGPU::sub0) + .addReg(OpReg) + .addImm(AMDGPU::sub1); + MI.eraseFromParent(); + return true; +} + +// FIXME: This is a workaround for the same tablegen problems as G_FNEG +bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const { + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); + if (DstRB->getID() != AMDGPU::SGPRRegBankID || + MRI->getType(Dst) != LLT::scalar(64)) + return false; + + Register Src = MI.getOperand(1).getReg(); + MachineBasicBlock *BB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + + if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || + !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) + return false; + + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) + .addReg(Src, 0, AMDGPU::sub0); + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) + .addReg(Src, 0, AMDGPU::sub1); + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) + .addImm(0x7fffffff); + + // Clear sign bit. + // TODO: Should this used S_BITSET0_*? + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg) + .addReg(HiReg) + .addReg(ConstReg); + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) + .addReg(LoReg) + .addImm(AMDGPU::sub0) + .addReg(OpReg) + .addImm(AMDGPU::sub1); + + MI.eraseFromParent(); + return true; +} + static bool isConstant(const MachineInstr &MI) { return MI.getOpcode() == TargetOpcode::G_CONSTANT; } @@ -1573,6 +2196,65 @@ bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const { return selectImpl(I, *CoverageInfo); } +// TODO: No rtn optimization. +bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG( + MachineInstr &MI) const { + Register PtrReg = MI.getOperand(1).getReg(); + const LLT PtrTy = MRI->getType(PtrReg); + if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS || + STI.useFlatForGlobal()) + return selectImpl(MI, *CoverageInfo); + + Register DstReg = MI.getOperand(0).getReg(); + const LLT Ty = MRI->getType(DstReg); + const bool Is64 = Ty.getSizeInBits() == 64; + const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; + Register TmpReg = MRI->createVirtualRegister( + Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); + + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock *BB = MI.getParent(); + + Register VAddr, RSrcReg, SOffset; + int64_t Offset = 0; + + unsigned Opcode; + if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) { + Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN : + AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN; + } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr, + RSrcReg, SOffset, Offset)) { + Opcode = Is64 ? 
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN : + AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN; + } else + return selectImpl(MI, *CoverageInfo); + + auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg) + .addReg(MI.getOperand(2).getReg()); + + if (VAddr) + MIB.addReg(VAddr); + + MIB.addReg(RSrcReg); + if (SOffset) + MIB.addReg(SOffset); + else + MIB.addImm(0); + + MIB.addImm(Offset); + MIB.addImm(0); // slc + MIB.cloneMemRefs(MI); + + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg) + .addReg(TmpReg, RegState::Kill, SubReg); + + MI.eraseFromParent(); + + MRI->setRegClass( + DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); +} + bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineOperand &CondOp = I.getOperand(0); @@ -1619,7 +2301,8 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { return true; } -bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const { +bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE( + MachineInstr &I) const { Register DstReg = I.getOperand(0).getReg(); const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; @@ -1631,67 +2314,134 @@ bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const { DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); } -bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const { - uint64_t Align = I.getOperand(2).getImm(); - const uint64_t Mask = ~((UINT64_C(1) << Align) - 1); - - MachineBasicBlock *BB = I.getParent(); - +bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { Register DstReg = I.getOperand(0).getReg(); Register SrcReg = I.getOperand(1).getReg(); + Register MaskReg = I.getOperand(2).getReg(); + LLT Ty = MRI->getType(DstReg); + LLT MaskTy = MRI->getType(MaskReg); const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); + const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; + if (DstRB != SrcRB) // Should only happen for hand written MIR. + return false; + unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; - unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; const TargetRegisterClass &RegRC = IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; - LLT Ty = MRI->getType(DstReg); - const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB, *MRI); const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB, *MRI); + const TargetRegisterClass *MaskRC = + TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI); + if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || - !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) + !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || + !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) return false; + MachineBasicBlock *BB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - Register ImmReg = MRI->createVirtualRegister(&RegRC); - BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg) - .addImm(Mask); - if (Ty.getSizeInBits() == 32) { + assert(MaskTy.getSizeInBits() == 32 && + "ptrmask should have been narrowed during legalize"); + BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) .addReg(SrcReg) - .addReg(ImmReg); + .addReg(MaskReg); I.eraseFromParent(); return true; } Register HiReg = MRI->createVirtualRegister(&RegRC); Register LoReg = MRI->createVirtualRegister(&RegRC); - Register MaskLo = MRI->createVirtualRegister(&RegRC); + // Extract the subregisters from the source pointer. BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) .addReg(SrcReg, 0, AMDGPU::sub0); BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) .addReg(SrcReg, 0, AMDGPU::sub1); - BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo) - .addReg(LoReg) - .addReg(ImmReg); + Register MaskedLo, MaskedHi; + + // Try to avoid emitting a bit operation when we only need to touch half of + // the 64-bit pointer. + APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); + + const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); + const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); + if ((MaskOnes & MaskLo32) == MaskLo32) { + // If all the bits in the low half are 1, we only need a copy for it. + MaskedLo = LoReg; + } else { + // Extract the mask subregister and apply the and. + Register MaskLo = MRI->createVirtualRegister(&RegRC); + MaskedLo = MRI->createVirtualRegister(&RegRC); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) + .addReg(MaskReg, 0, AMDGPU::sub0); + BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) + .addReg(LoReg) + .addReg(MaskLo); + } + + if ((MaskOnes & MaskHi32) == MaskHi32) { + // If all the bits in the high half are 1, we only need a copy for it. + MaskedHi = HiReg; + } else { + Register MaskHi = MRI->createVirtualRegister(&RegRC); + MaskedHi = MRI->createVirtualRegister(&RegRC); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) + .addReg(MaskReg, 0, AMDGPU::sub1); + BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) + .addReg(HiReg) + .addReg(MaskHi); + } + BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) - .addReg(MaskLo) + .addReg(MaskedLo) .addImm(AMDGPU::sub0) - .addReg(HiReg) + .addReg(MaskedHi) .addImm(AMDGPU::sub1); I.eraseFromParent(); return true; } +/// Return the register to use for the index value, and the subregister to use +/// for the indirectly accessed register. 
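The G_PTRMASK path above splits the 64-bit pointer into 32-bit halves and consults KnownBits so that a half whose mask bits are all known ones is merely copied. A minimal standalone sketch of that decision, with plain integers standing in for the APInt/KnownBits machinery (this is an illustration, not the in-tree code):

#include <cstdint>

// Returns true if this 32-bit half of a 64-bit ptrmask still needs an AND,
// i.e. not every mask bit in that half is known to be one.
bool halfNeedsAnd(uint64_t knownOneMaskBits, bool highHalf) {
  const uint64_t half = highHalf ? 0xffffffff00000000ull : 0x00000000ffffffffull;
  return (knownOneMaskBits & half) != half;
}

// Example: aligning a pointer down to 4096 bytes masks with ~0xfff.
// halfNeedsAnd(~0xfffull, /*highHalf=*/true)  -> false: sub1 is a plain COPY
// halfNeedsAnd(~0xfffull, /*highHalf=*/false) -> true:  sub0 gets S_AND_B32/V_AND_B32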
+static std::pair<Register, unsigned> +computeIndirectRegIndex(MachineRegisterInfo &MRI, + const SIRegisterInfo &TRI, + const TargetRegisterClass *SuperRC, + Register IdxReg, + unsigned EltSize) { + Register IdxBaseReg; + int Offset; + MachineInstr *Unused; + + std::tie(IdxBaseReg, Offset, Unused) + = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg); + if (IdxBaseReg == AMDGPU::NoRegister) { + // This will happen if the index is a known constant. This should ordinarily + // be legalized out, but handle it as a register just in case. + assert(Offset == 0); + IdxBaseReg = IdxReg; + } + + ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); + + // Skip out of bounds offsets, or else we would end up using an undefined + // register. + if (static_cast<unsigned>(Offset) >= SubRegs.size()) + return std::make_pair(IdxReg, SubRegs[0]); + return std::make_pair(IdxBaseReg, SubRegs[Offset]); +} + bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( MachineInstr &MI) const { Register DstReg = MI.getOperand(0).getReg(); @@ -1714,6 +2464,8 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( *MRI); const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB, *MRI); + if (!SrcRC || !DstRC) + return false; if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) @@ -1723,7 +2475,9 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( const DebugLoc &DL = MI.getDebugLoc(); const bool Is64 = DstTy.getSizeInBits() == 64; - unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; + unsigned SubReg; + std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg, + DstTy.getSizeInBits() / 8); if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { if (DstTy.getSizeInBits() != 32 && !Is64) @@ -1766,6 +2520,237 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( return true; } +// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd +bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( + MachineInstr &MI) const { + Register DstReg = MI.getOperand(0).getReg(); + Register VecReg = MI.getOperand(1).getReg(); + Register ValReg = MI.getOperand(2).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); + + LLT VecTy = MRI->getType(DstReg); + LLT ValTy = MRI->getType(ValReg); + unsigned VecSize = VecTy.getSizeInBits(); + unsigned ValSize = ValTy.getSizeInBits(); + + const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); + const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); + const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); + + assert(VecTy.getElementType() == ValTy); + + // The index must be scalar. If it wasn't RegBankSelect should have moved this + // into a waterfall loop. 
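computeIndirectRegIndex above peels a constant addend off the dynamic index so the constant can be folded into the subregister choice, leaving only the variable part for M0 or S_SET_GPR_IDX_ON. A rough, self-contained model of that arithmetic (assumed names, not the LLVM API):

#include <cstddef>
#include <utility>

// A known constant addend in the index selects the subregister statically;
// only the variable base still has to drive the indirect addressing.
// SubRegCount stands in for TRI.getRegSplitParts(SuperRC, EltSize).size().
std::pair<bool /*UseBase*/, unsigned /*SubRegIdx*/>
foldIndexOffset(long ConstOffset, std::size_t SubRegCount) {
  if (ConstOffset < 0 || static_cast<std::size_t>(ConstOffset) >= SubRegCount)
    return {false, 0}; // out of range: keep the original index and sub0
  return {true, static_cast<unsigned>(ConstOffset)};
}

// e.g. a <4 x s32> in a VReg_128 indexed by (%base + 2):
// foldIndexOffset(2, 4) -> {true, 2}, i.e. index with %base and address sub2.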
+ if (IdxRB->getID() != AMDGPU::SGPRRegBankID) + return false; + + const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, + *MRI); + const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, + *MRI); + + if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || + !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || + !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || + !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) + return false; + + if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) + return false; + + unsigned SubReg; + std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, + ValSize / 8); + + const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && + STI.useVGPRIndexMode(); + + MachineBasicBlock *BB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + if (IndexMode) { + BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) + .addReg(IdxReg) + .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); + } else { + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) + .addReg(IdxReg); + } + + const MCInstrDesc &RegWriteOp + = TII.getIndirectRegWritePseudo(VecSize, ValSize, + VecRB->getID() == AMDGPU::SGPRRegBankID); + BuildMI(*BB, MI, DL, RegWriteOp, DstReg) + .addReg(VecReg) + .addReg(ValReg) + .addImm(SubReg); + + if (IndexMode) + BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); + + MI.eraseFromParent(); + return true; +} + +static bool isZeroOrUndef(int X) { + return X == 0 || X == -1; +} + +static bool isOneOrUndef(int X) { + return X == 1 || X == -1; +} + +static bool isZeroOrOneOrUndef(int X) { + return X == 0 || X == 1 || X == -1; +} + +// Normalize a VOP3P shuffle mask to refer to the low/high half of a single +// 32-bit register. +static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1, + ArrayRef<int> Mask) { + NewMask[0] = Mask[0]; + NewMask[1] = Mask[1]; + if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1])) + return Src0; + + assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1); + assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1); + + // Shift the mask inputs to be 0/1; + NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2; + NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2; + return Src1; +} + +// This is only legal with VOP3P instructions as an aid to op_sel matching. +bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR( + MachineInstr &MI) const { + Register DstReg = MI.getOperand(0).getReg(); + Register Src0Reg = MI.getOperand(1).getReg(); + Register Src1Reg = MI.getOperand(2).getReg(); + ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask(); + + const LLT V2S16 = LLT::vector(2, 16); + if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16) + return false; + + if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask)) + return false; + + assert(ShufMask.size() == 2); + assert(STI.hasSDWA() && "no target has VOP3P but not SDWA"); + + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); + const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; + const TargetRegisterClass &RC = IsVALU ? + AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; + + // Handle the degenerate case which should have folded out. 
+ if (ShufMask[0] == -1 && ShufMask[1] == -1) { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg); + + MI.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, RC, *MRI); + } + + // A legal VOP3P mask only reads one of the sources. + int Mask[2]; + Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask); + + if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) || + !RBI.constrainGenericRegister(SrcVec, RC, *MRI)) + return false; + + // TODO: This also should have been folded out + if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg) + .addReg(SrcVec); + + MI.eraseFromParent(); + return true; + } + + if (Mask[0] == 1 && Mask[1] == -1) { + if (IsVALU) { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) + .addImm(16) + .addReg(SrcVec); + } else { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) + .addReg(SrcVec) + .addImm(16); + } + } else if (Mask[0] == -1 && Mask[1] == 0) { + if (IsVALU) { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg) + .addImm(16) + .addReg(SrcVec); + } else { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg) + .addReg(SrcVec) + .addImm(16); + } + } else if (Mask[0] == 0 && Mask[1] == 0) { + if (IsVALU) { + // Write low half of the register into the high half. + MachineInstr *MovSDWA = + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) + .addImm(0) // $src0_modifiers + .addReg(SrcVec) // $src0 + .addImm(0) // $clamp + .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel + .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused + .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel + .addReg(SrcVec, RegState::Implicit); + MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); + } else { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) + .addReg(SrcVec) + .addReg(SrcVec); + } + } else if (Mask[0] == 1 && Mask[1] == 1) { + if (IsVALU) { + // Write high half of the register into the low half. 
+ MachineInstr *MovSDWA = + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) + .addImm(0) // $src0_modifiers + .addReg(SrcVec) // $src0 + .addImm(0) // $clamp + .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel + .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused + .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel + .addReg(SrcVec, RegState::Implicit); + MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); + } else { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg) + .addReg(SrcVec) + .addReg(SrcVec); + } + } else if (Mask[0] == 1 && Mask[1] == 0) { + if (IsVALU) { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg) + .addReg(SrcVec) + .addReg(SrcVec) + .addImm(16); + } else { + Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg) + .addReg(SrcVec) + .addImm(16); + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg) + .addReg(TmpReg) + .addReg(SrcVec); + } + } else + llvm_unreachable("all shuffle masks should be handled"); + + MI.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::select(MachineInstr &I) { if (I.isPHI()) return selectPHI(I); @@ -1780,9 +2765,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_AND: case TargetOpcode::G_OR: case TargetOpcode::G_XOR: - if (selectG_AND_OR_XOR(I)) + if (selectImpl(I, *CoverageInfo)) return true; - return selectImpl(I, *CoverageInfo); + return selectG_AND_OR_XOR(I); case TargetOpcode::G_ADD: case TargetOpcode::G_SUB: if (selectImpl(I, *CoverageInfo)) @@ -1800,6 +2785,14 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_CONSTANT: case TargetOpcode::G_FCONSTANT: return selectG_CONSTANT(I); + case TargetOpcode::G_FNEG: + if (selectImpl(I, *CoverageInfo)) + return true; + return selectG_FNEG(I); + case TargetOpcode::G_FABS: + if (selectImpl(I, *CoverageInfo)) + return true; + return selectG_FABS(I); case TargetOpcode::G_EXTRACT: return selectG_EXTRACT(I); case TargetOpcode::G_MERGE_VALUES: @@ -1808,6 +2801,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { return selectG_MERGE_VALUES(I); case TargetOpcode::G_UNMERGE_VALUES: return selectG_UNMERGE_VALUES(I); + case TargetOpcode::G_BUILD_VECTOR_TRUNC: + return selectG_BUILD_VECTOR_TRUNC(I); case TargetOpcode::G_PTR_ADD: return selectG_PTR_ADD(I); case TargetOpcode::G_IMPLICIT_DEF: @@ -1836,6 +2831,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_ATOMICRMW_UMAX: case TargetOpcode::G_ATOMICRMW_FADD: return selectG_LOAD_ATOMICRMW(I); + case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: + return selectG_AMDGPU_ATOMIC_CMPXCHG(I); case TargetOpcode::G_SELECT: return selectG_SELECT(I); case TargetOpcode::G_STORE: @@ -1845,17 +2842,34 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_SEXT: case TargetOpcode::G_ZEXT: case TargetOpcode::G_ANYEXT: + case TargetOpcode::G_SEXT_INREG: if (selectImpl(I, *CoverageInfo)) return true; return selectG_SZA_EXT(I); case TargetOpcode::G_BRCOND: return selectG_BRCOND(I); case TargetOpcode::G_FRAME_INDEX: - return selectG_FRAME_INDEX(I); - case TargetOpcode::G_PTR_MASK: - return selectG_PTR_MASK(I); + case TargetOpcode::G_GLOBAL_VALUE: + return selectG_FRAME_INDEX_GLOBAL_VALUE(I); + case TargetOpcode::G_PTRMASK: + return selectG_PTRMASK(I); case TargetOpcode::G_EXTRACT_VECTOR_ELT: return selectG_EXTRACT_VECTOR_ELT(I); + case TargetOpcode::G_INSERT_VECTOR_ELT: + return selectG_INSERT_VECTOR_ELT(I); + 
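selectG_SHUFFLE_VECTOR above (dispatched from the case just below) only has to cover the handful of masks that are legal for VOP3P op_sel matching, and normalizeVOP3PMask has already reduced both lanes to a single 32-bit source. A rough summary of the scalar (SGPR) lowering it picks, written as a standalone table rather than the actual selector; the VALU path uses V_LSHRREV/V_LSHLREV, SDWA moves and V_ALIGNBIT_B32 instead:

#include <string>

// Normalized <2 x s16> shuffle mask (m0, m1), where -1 means undef.
std::string scalarShuffleLowering(int m0, int m1) {
  if (m0 == 1 && m1 == -1) return "s_lshr_b32 dst, src, 16";
  if (m0 == -1 && m1 == 0) return "s_lshl_b32 dst, src, 16";
  if (m0 == 0 && m1 == 0)  return "s_pack_ll_b32_b16 dst, src, src";
  if (m0 == 1 && m1 == 1)  return "s_pack_hh_b32_b16 dst, src, src";
  if (m0 == 1 && m1 == 0)
    return "s_lshr_b32 tmp, src, 16; s_pack_ll_b32_b16 dst, tmp, src";
  return "identity or undef: folded to a COPY or IMPLICIT_DEF earlier";
}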
case TargetOpcode::G_SHUFFLE_VECTOR: + return selectG_SHUFFLE_VECTOR(I); + case AMDGPU::G_AMDGPU_ATOMIC_INC: + case AMDGPU::G_AMDGPU_ATOMIC_DEC: + initM0(I); + return selectImpl(I, *CoverageInfo); + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { + const AMDGPU::ImageDimIntrinsicInfo *Intr + = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); + assert(Intr && "not an image intrinsic with image pseudo"); + return selectImageIntrinsic(I, Intr); + } default: return selectImpl(I, *CoverageInfo); } @@ -1871,15 +2885,16 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { } std::pair<Register, unsigned> -AMDGPUInstructionSelector::selectVOP3ModsImpl( - Register Src) const { +AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const { + Register Src = Root.getReg(); + Register OrigSrc = Src; unsigned Mods = 0; - MachineInstr *MI = MRI->getVRegDef(Src); + MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); if (MI && MI->getOpcode() == AMDGPU::G_FNEG) { Src = MI->getOperand(1).getReg(); Mods |= SISrcMods::NEG; - MI = MRI->getVRegDef(Src); + MI = getDefIgnoringCopies(Src, *MRI); } if (MI && MI->getOpcode() == AMDGPU::G_FABS) { @@ -1887,6 +2902,20 @@ AMDGPUInstructionSelector::selectVOP3ModsImpl( Mods |= SISrcMods::ABS; } + if (Mods != 0 && + RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { + MachineInstr *UseMI = Root.getParent(); + + // If we looked through copies to find source modifiers on an SGPR operand, + // we now have an SGPR register source. To avoid potentially violating the + // constant bus restriction, we need to insert a copy to a VGPR. + Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc); + BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(), + TII.get(AMDGPU::COPY), VGPRSrc) + .addReg(Src); + Src = VGPRSrc; + } + return std::make_pair(Src, Mods); } @@ -1904,7 +2933,7 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, @@ -1927,7 +2956,7 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, @@ -1936,12 +2965,48 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { +AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { + Register Reg = Root.getReg(); + const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); + if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || + Def->getOpcode() == AMDGPU::G_FABS)) + return {}; + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, + }}; +} + +std::pair<Register, unsigned> +AMDGPUInstructionSelector::selectVOP3PModsImpl( + Register Src, const MachineRegisterInfo &MRI) const { + unsigned Mods = 0; + MachineInstr *MI = MRI.getVRegDef(Src); + + if (MI && MI->getOpcode() == AMDGPU::G_FNEG && + // It's possible to see an f32 fneg here, but unlikely. + // TODO: Treat f32 fneg as only high bit. 
+ MRI.getType(Src) == LLT::vector(2, 16)) { + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + } + + // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. + + // Packed instructions do not have abs modifiers. + Mods |= SISrcMods::OP_SEL_1; + + return std::make_pair(Src, Mods); +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { + MachineRegisterInfo &MRI + = Root.getParent()->getParent()->getParent()->getRegInfo(); + Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); - if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) - return None; + std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, @@ -1950,12 +3015,16 @@ AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const { - // FIXME: Handle clamp and op_sel +AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root); + if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) + return None; + return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // clamp + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods }}; } @@ -1977,15 +3046,15 @@ AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { return None; const GEPInfo &GEPInfo = AddrInfo[0]; - - if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm)) + Optional<int64_t> EncodedImm = + AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false); + if (!EncodedImm) return None; unsigned PtrReg = GEPInfo.SgprParts[0]; - int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm); return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); } + [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; } @@ -1998,14 +3067,15 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { return None; const GEPInfo &GEPInfo = AddrInfo[0]; - unsigned PtrReg = GEPInfo.SgprParts[0]; - int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm); - if (!isUInt<32>(EncodedImm)) + Register PtrReg = GEPInfo.SgprParts[0]; + Optional<int64_t> EncodedImm = + AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); + if (!EncodedImm) return None; return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); } + [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; } @@ -2023,14 +3093,15 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { return None; const GEPInfo &GEPInfo = AddrInfo[0]; - if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm)) + // SGPR offset is unsigned. + if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm)) return None; // If we make it this far we have a load with an 32-bit immediate offset. // It is OK to select this using a sgpr offset, because we have already // failed trying to select this load into one of the _IMM variants since // the _IMM Patterns are considered before the _SGPR patterns. 
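The three SMRD renderers above try the hardware-encoded immediate first, then the 32-bit literal form used on Sea Islands, and only then materialize the offset into an SGPR, which additionally requires a non-negative value that fits in 32 bits. A compressed sketch of that ordering (the two flags stand in for the AMDGPU::getSMRDEncodedOffset / getSMRDEncodedLiteralOffset32 helpers succeeding; not the generated patterns):

#include <cstdint>
#include <optional>

std::optional<const char *> pickSmrdForm(int64_t ByteOffset, bool HasEncodedImm,
                                         bool HasLiteralImm32) {
  if (HasEncodedImm)
    return "_IMM";                      // hardware-encoded immediate offset
  if (HasLiteralImm32)
    return "_IMM, 32-bit literal (CI)"; // Sea Islands literal encoding
  if (ByteOffset >= 0 && ByteOffset <= 0xffffffffLL)
    return "_SGPR";                     // S_MOV_B32 the offset into an SGPR
  return std::nullopt;                  // no SMRD form matches here
}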
- unsigned PtrReg = GEPInfo.SgprParts[0]; + Register PtrReg = GEPInfo.SgprParts[0]; Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) .addImm(GEPInfo.Imm); @@ -2099,7 +3170,8 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); int64_t Offset = 0; - if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) { + if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) && + Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) { Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); // TODO: Should this be inside the render function? The iterator seems to @@ -2118,17 +3190,17 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { const MachineMemOperand *MMO = *MI->memoperands_begin(); const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); - Register SOffsetReg = isStackPtrRelative(PtrInfo) - ? Info->getStackPtrOffsetReg() - : Info->getScratchWaveOffsetReg(); - MIB.addReg(SOffsetReg); + if (isStackPtrRelative(PtrInfo)) + MIB.addReg(Info->getStackPtrOffsetReg()); + else + MIB.addImm(0); }, [=](MachineInstrBuilder &MIB) { // offset MIB.addImm(Offset & 4095); }}}; } - assert(Offset == 0); + assert(Offset == 0 || Offset == -1); // Try to fold a frame index directly into the MUBUF vaddr field, and any // offsets. @@ -2158,13 +3230,6 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { } } - // If we don't know this private access is a local stack object, it needs to - // be relative to the entry point's scratch wave offset register. - // TODO: Should split large offsets that don't fit like above. - // TODO: Don't use scratch wave offset just because the offset didn't fit. - Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg() - : Info->getScratchWaveOffsetReg(); - return {{[=](MachineInstrBuilder &MIB) { // rsrc MIB.addReg(Info->getScratchRSrcReg()); }, @@ -2175,15 +3240,22 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { MIB.addReg(VAddr); }, [=](MachineInstrBuilder &MIB) { // soffset - MIB.addReg(SOffset); + // If we don't know this private access is a local stack object, it + // needs to be relative to the entry point's scratch wave offset. + // TODO: Should split large offsets that don't fit like above. + // TODO: Don't use scratch wave offset just because the offset + // didn't fit. + if (!Info->isEntryFunction() && FI.hasValue()) + MIB.addReg(Info->getStackPtrOffsetReg()); + else + MIB.addImm(0); }, [=](MachineInstrBuilder &MIB) { // offset MIB.addImm(Offset); }}}; } -bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI, - const MachineOperand &Base, +bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base, int64_t Offset, unsigned OffsetBits) const { if ((OffsetBits == 16 && !isUInt<16>(Offset)) || @@ -2195,7 +3267,7 @@ bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI, // On Southern Islands instruction with a negative base value and an offset // don't seem to work. 
- return KnownBits->signBitIsZero(Base.getReg()); + return KnownBits->signBitIsZero(Base); } InstructionSelector::ComplexRendererFns @@ -2214,68 +3286,485 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset( const MachineMemOperand *MMO = *MI->memoperands_begin(); const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); - Register SOffsetReg = isStackPtrRelative(PtrInfo) - ? Info->getStackPtrOffsetReg() - : Info->getScratchWaveOffsetReg(); return {{ - [=](MachineInstrBuilder &MIB) { + [=](MachineInstrBuilder &MIB) { // rsrc MIB.addReg(Info->getScratchRSrcReg()); - }, // rsrc - [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset - [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset + }, + [=](MachineInstrBuilder &MIB) { // soffset + if (isStackPtrRelative(PtrInfo)) + MIB.addReg(Info->getStackPtrOffsetReg()); + else + MIB.addImm(0); + }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset }}; } +std::pair<Register, unsigned> +AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { + const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); + if (!RootDef) + return std::make_pair(Root.getReg(), 0); + + int64_t ConstAddr = 0; + + Register PtrBase; + int64_t Offset; + std::tie(PtrBase, Offset) = + getPtrBaseWithConstantOffset(Root.getReg(), *MRI); + + if (Offset) { + if (isDSOffsetLegal(PtrBase, Offset, 16)) { + // (add n0, c0) + return std::make_pair(PtrBase, Offset); + } + } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { + // TODO + + + } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { + // TODO + + } + + return std::make_pair(Root.getReg(), 0); +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { + Register Reg; + unsigned Offset; + std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const { + Register Reg; + unsigned Offset; + std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); } + }}; +} + +std::pair<Register, unsigned> +AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const { const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); - if (!RootDef) { - return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } - }}; - } + if (!RootDef) + return std::make_pair(Root.getReg(), 0); int64_t ConstAddr = 0; - if (isBaseWithConstantOffset(Root, *MRI)) { - const MachineOperand &LHS = RootDef->getOperand(1); - const MachineOperand &RHS = RootDef->getOperand(2); - const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg()); - const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg()); - if (LHSDef && RHSDef) { - int64_t PossibleOffset = - RHSDef->getOperand(1).getCImm()->getSExtValue(); - if (isDSOffsetLegal(*MRI, LHS, PossibleOffset, 16)) { - // (add n0, c0) - return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(PossibleOffset); } - }}; - } + + Register PtrBase; + int64_t Offset; + std::tie(PtrBase, Offset) = + getPtrBaseWithConstantOffset(Root.getReg(), *MRI); + + 
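selectDS64Bit4ByteAligned above feeds the two 8-bit offset fields of the 64-bit DS forms, so a constant byte offset is only folded when both resulting dword offsets are encodable. A standalone sketch of that check, approximating isDSOffsetLegal with 8 offset bits (the Southern Islands sign-bit restriction is ignored here):

#include <cstdint>
#include <optional>
#include <utility>

// The byte offset becomes two dword offsets for the ds_read2_b32 /
// ds_write2_b32 style addressing; offset1 = offset0 + 1 must still fit 8 bits.
std::optional<std::pair<unsigned, unsigned>> splitDS64Offset(int64_t ByteOff) {
  int64_t DWord0 = ByteOff / 4;
  int64_t DWord1 = DWord0 + 1;
  if (DWord0 < 0 || DWord1 > 255)
    return std::nullopt; // keep the whole address in the pointer, use 0 and 1
  return std::make_pair(unsigned(DWord0), unsigned(DWord1));
}

// e.g. base+40 -> offset0=10, offset1=11; byte offsets of 1020 or more no
// longer fit and fall back to the plain base register.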
if (Offset) { + int64_t DWordOffset0 = Offset / 4; + int64_t DWordOffset1 = DWordOffset0 + 1; + if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) { + // (add n0, c0) + return std::make_pair(PtrBase, DWordOffset0); } } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { + // TODO + } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { + // TODO + } - } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { + return std::make_pair(Root.getReg(), 0); +} + +/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return +/// the base value with the constant offset. There may be intervening copies +/// between \p Root and the identified constant. Returns \p Root, 0 if this does +/// not match the pattern. +std::pair<Register, int64_t> +AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( + Register Root, const MachineRegisterInfo &MRI) const { + MachineInstr *RootI = MRI.getVRegDef(Root); + if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) + return {Root, 0}; + + MachineOperand &RHS = RootI->getOperand(2); + Optional<ValueAndVReg> MaybeOffset + = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true); + if (!MaybeOffset) + return {Root, 0}; + return {RootI->getOperand(1).getReg(), MaybeOffset->Value}; +} + +static void addZeroImm(MachineInstrBuilder &MIB) { + MIB.addImm(0); +} + +/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p +/// BasePtr is not valid, a null base pointer will be used. +static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, + uint32_t FormatLo, uint32_t FormatHi, + Register BasePtr) { + Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); + + B.buildInstr(AMDGPU::S_MOV_B32) + .addDef(RSrc2) + .addImm(FormatLo); + B.buildInstr(AMDGPU::S_MOV_B32) + .addDef(RSrc3) + .addImm(FormatHi); + + // Build the half of the subregister with the constants before building the + // full 128-bit register. If we are building multiple resource descriptors, + // this will allow CSEing of the 2-component register. + B.buildInstr(AMDGPU::REG_SEQUENCE) + .addDef(RSrcHi) + .addReg(RSrc2) + .addImm(AMDGPU::sub0) + .addReg(RSrc3) + .addImm(AMDGPU::sub1); + + Register RSrcLo = BasePtr; + if (!BasePtr) { + RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + B.buildInstr(AMDGPU::S_MOV_B64) + .addDef(RSrcLo) + .addImm(0); + } + + B.buildInstr(AMDGPU::REG_SEQUENCE) + .addDef(RSrc) + .addReg(RSrcLo) + .addImm(AMDGPU::sub0_sub1) + .addReg(RSrcHi) + .addImm(AMDGPU::sub2_sub3); + + return RSrc; +} + +static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, + const SIInstrInfo &TII, Register BasePtr) { + uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); + + // FIXME: Why are half the "default" bits ignored based on the addressing + // mode? + return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr); +} + +static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, + const SIInstrInfo &TII, Register BasePtr) { + uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); + + // FIXME: Why are half the "default" bits ignored based on the addressing + // mode? 
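buildRSRC above assembles the 128-bit buffer resource from a 64-bit base (or zero) plus two constant dwords. A small sketch of that layout as the REG_SEQUENCE builds it; the meaning of the individual descriptor fields is hardware-defined and assumed here, not taken from this patch:

#include <array>
#include <cstdint>

// Dwords [0..1] hold the base pointer, [2] holds FormatLo, [3] holds FormatHi.
// The addr64 variant passes 0 for FormatLo, the offset variant passes
// 0xffffffff, and FormatHi comes from TII.getDefaultRsrcDataFormat() in both.
std::array<uint32_t, 4> rsrcWords(uint64_t BasePtr, uint32_t FormatLo,
                                  uint32_t FormatHi) {
  return {uint32_t(BasePtr), uint32_t(BasePtr >> 32), FormatLo, FormatHi};
}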
+ return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr); +} + +AMDGPUInstructionSelector::MUBUFAddressData +AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const { + MUBUFAddressData Data; + Data.N0 = Src; + + Register PtrBase; + int64_t Offset; + + std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI); + if (isUInt<32>(Offset)) { + Data.N0 = PtrBase; + Data.Offset = Offset; + } + + if (MachineInstr *InputAdd + = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) { + Data.N2 = InputAdd->getOperand(1).getReg(); + Data.N3 = InputAdd->getOperand(2).getReg(); + + // FIXME: Need to fix extra SGPR->VGPRcopies inserted + // FIXME: Don't know this was defined by operand 0 + // + // TODO: Remove this when we have copy folding optimizations after + // RegBankSelect. + Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg(); + Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg(); + } + + return Data; +} + +/// Return if the addr64 mubuf mode should be used for the given address. +bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const { + // (ptr_add N2, N3) -> addr64, or + // (ptr_add (ptr_add N2, N3), C1) -> addr64 + if (Addr.N2) + return true; + + const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI); + return N0Bank->getID() == AMDGPU::VGPRRegBankID; +} +/// Split an immediate offset \p ImmOffset depending on whether it fits in the +/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable +/// component. +void AMDGPUInstructionSelector::splitIllegalMUBUFOffset( + MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const { + if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset)) + return; + + // Illegal offset, store it in soffset. + SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + B.buildInstr(AMDGPU::S_MOV_B32) + .addDef(SOffset) + .addImm(ImmOffset); + ImmOffset = 0; +} +bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl( + MachineOperand &Root, Register &VAddr, Register &RSrcReg, + Register &SOffset, int64_t &Offset) const { + // FIXME: Predicates should stop this from reaching here. + // addr64 bit was removed for volcanic islands. + if (!STI.hasAddr64() || STI.useFlatForGlobal()) + return false; + + MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); + if (!shouldUseAddr64(AddrData)) + return false; + + Register N0 = AddrData.N0; + Register N2 = AddrData.N2; + Register N3 = AddrData.N3; + Offset = AddrData.Offset; + + // Base pointer for the SRD. + Register SRDPtr; + + if (N2) { + if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { + assert(N3); + if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { + // Both N2 and N3 are divergent. Use N0 (the result of the add) as the + // addr64, and construct the default resource from a 0 address. + VAddr = N0; + } else { + SRDPtr = N3; + VAddr = N2; + } + } else { + // N2 is not divergent. 
+ SRDPtr = N2; + VAddr = N3; + } + } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { + // Use the default null pointer in the resource + VAddr = N0; + } else { + // N0 -> offset, or + // (N0 + C1) -> offset + SRDPtr = N0; } + MachineIRBuilder B(*Root.getParent()); + RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr); + splitIllegalMUBUFOffset(B, SOffset, Offset); + return true; +} + +bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl( + MachineOperand &Root, Register &RSrcReg, Register &SOffset, + int64_t &Offset) const { + MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); + if (shouldUseAddr64(AddrData)) + return false; + + // N0 -> offset, or + // (N0 + C1) -> offset + Register SRDPtr = AddrData.N0; + Offset = AddrData.Offset; + + // TODO: Look through extensions for 32-bit soffset. + MachineIRBuilder B(*Root.getParent()); + + RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr); + splitIllegalMUBUFOffset(B, SOffset, Offset); + return true; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { + Register VAddr; + Register RSrcReg; + Register SOffset; + int64_t Offset = 0; + + if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) + return {}; + + // FIXME: Use defaulted operands for trailing 0s and remove from the complex + // pattern. return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } + [=](MachineInstrBuilder &MIB) { // rsrc + MIB.addReg(RSrcReg); + }, + [=](MachineInstrBuilder &MIB) { // vaddr + MIB.addReg(VAddr); + }, + [=](MachineInstrBuilder &MIB) { // soffset + if (SOffset) + MIB.addReg(SOffset); + else + MIB.addImm(0); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(Offset); + }, + addZeroImm, // glc + addZeroImm, // slc + addZeroImm, // tfe + addZeroImm, // dlc + addZeroImm // swz + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { + Register RSrcReg; + Register SOffset; + int64_t Offset = 0; + + if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) + return {}; + + return {{ + [=](MachineInstrBuilder &MIB) { // rsrc + MIB.addReg(RSrcReg); + }, + [=](MachineInstrBuilder &MIB) { // soffset + if (SOffset) + MIB.addReg(SOffset); + else + MIB.addImm(0); + }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset + addZeroImm, // glc + addZeroImm, // slc + addZeroImm, // tfe + addZeroImm, // dlc + addZeroImm // swz + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const { + Register VAddr; + Register RSrcReg; + Register SOffset; + int64_t Offset = 0; + + if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) + return {}; + + // FIXME: Use defaulted operands for trailing 0s and remove from the complex + // pattern. 
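Both MUBUF address paths above finish with splitIllegalMUBUFOffset, which keeps the constant part of the address encodable. A minimal sketch of that split; the 12-bit limit mirrors SIInstrInfo::isLegalMUBUFImmOffset and is an assumption of the sketch rather than something spelled out in this hunk:

#include <cstdint>
#include <utility>

// An immediate that does not fit the 12-bit unsigned offset field is moved
// into soffset (materialized with S_MOV_B32) and the immediate is cleared.
std::pair<uint32_t /*SOffset*/, uint32_t /*ImmOffset*/>
splitMubufOffset(uint32_t Off) {
  if (Off < 4096)
    return {0, Off}; // encodable: no extra SGPR needed
  return {Off, 0};   // s_mov_b32 soffset, Off ; immediate offset = 0
}

// e.g. a constant offset of 0x2000 is carried in soffset, while 0xfff stays
// in the instruction's immediate field.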
+ return {{ + [=](MachineInstrBuilder &MIB) { // rsrc + MIB.addReg(RSrcReg); + }, + [=](MachineInstrBuilder &MIB) { // vaddr + MIB.addReg(VAddr); + }, + [=](MachineInstrBuilder &MIB) { // soffset + if (SOffset) + MIB.addReg(SOffset); + else + MIB.addImm(0); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(Offset); + }, + addZeroImm // slc }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const { + Register RSrcReg; + Register SOffset; + int64_t Offset = 0; + + if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) + return {}; + + return {{ + [=](MachineInstrBuilder &MIB) { // rsrc + MIB.addReg(RSrcReg); + }, + [=](MachineInstrBuilder &MIB) { // soffset + if (SOffset) + MIB.addReg(SOffset); + else + MIB.addImm(0); + }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset + addZeroImm // slc + }}; +} + +/// Get an immediate that must be 32-bits, and treated as zero extended. +static Optional<uint64_t> getConstantZext32Val(Register Reg, + const MachineRegisterInfo &MRI) { + // getConstantVRegVal sexts any values, so see if that matters. + Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI); + if (!OffsetVal || !isInt<32>(*OffsetVal)) + return None; + return Lo_32(*OffsetVal); +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const { + Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI); + if (!OffsetVal) + return {}; + + Optional<int64_t> EncodedImm = + AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true); + if (!EncodedImm) + return {}; + + return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const { + assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); + + Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI); + if (!OffsetVal) + return {}; + + Optional<int64_t> EncodedImm + = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal); + if (!EncodedImm) + return {}; + + return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; +} + void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && "Expected G_CONSTANT"); - Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), *MRI); - assert(CstVal && "Expected constant value"); - MIB.addImm(CstVal.getValue()); + MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue()); } void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB, @@ -2316,6 +3805,34 @@ void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, MIB.addImm(MI.getOperand(OpIdx).getImm()); } +void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + assert(OpIdx >= 0 && "expected to match an immediate operand"); + MIB.addImm(MI.getOperand(OpIdx).getImm() & 1); +} + +void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + assert(OpIdx >= 0 && "expected to match an immediate operand"); + MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1); +} + +void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + assert(OpIdx >= 0 && "expected to match an 
immediate operand"); + MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1); +} + +void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + assert(OpIdx >= 0 && "expected to match an immediate operand"); + MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); +} + bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 38ca7fd4104bb..1fe80958917d6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -31,6 +31,10 @@ namespace { namespace llvm { +namespace AMDGPU { +struct ImageDimIntrinsicInfo; +} + class AMDGPUInstrInfo; class AMDGPURegisterBankInfo; class GCNSubtarget; @@ -80,28 +84,39 @@ private: MachineOperand getSubOperand64(MachineOperand &MO, const TargetRegisterClass &SubRC, unsigned SubIdx) const; + + bool constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const; bool selectCOPY(MachineInstr &I) const; bool selectPHI(MachineInstr &I) const; bool selectG_TRUNC(MachineInstr &I) const; bool selectG_SZA_EXT(MachineInstr &I) const; bool selectG_CONSTANT(MachineInstr &I) const; + bool selectG_FNEG(MachineInstr &I) const; + bool selectG_FABS(MachineInstr &I) const; bool selectG_AND_OR_XOR(MachineInstr &I) const; bool selectG_ADD_SUB(MachineInstr &I) const; bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const; bool selectG_EXTRACT(MachineInstr &I) const; bool selectG_MERGE_VALUES(MachineInstr &I) const; bool selectG_UNMERGE_VALUES(MachineInstr &I) const; + bool selectG_BUILD_VECTOR_TRUNC(MachineInstr &I) const; bool selectG_PTR_ADD(MachineInstr &I) const; bool selectG_IMPLICIT_DEF(MachineInstr &I) const; bool selectG_INSERT(MachineInstr &I) const; - bool selectG_INTRINSIC(MachineInstr &I) const; - std::tuple<Register, unsigned, unsigned> - splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const; + bool selectInterpP1F16(MachineInstr &MI) const; + bool selectDivScale(MachineInstr &MI) const; + bool selectIntrinsicIcmp(MachineInstr &MI) const; + bool selectBallot(MachineInstr &I) const; + bool selectG_INTRINSIC(MachineInstr &I) const; - bool selectStoreIntrinsic(MachineInstr &MI, bool IsFormat) const; + bool selectEndCfIntrinsic(MachineInstr &MI) const; bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const; + bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const; + bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const; + bool selectImageIntrinsic(MachineInstr &MI, + const AMDGPU::ImageDimIntrinsicInfo *Intr) const; bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I) const; int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const; bool selectG_ICMP(MachineInstr &I) const; @@ -112,15 +127,18 @@ private: void initM0(MachineInstr &I) const; bool selectG_LOAD_ATOMICRMW(MachineInstr &I) const; + bool selectG_AMDGPU_ATOMIC_CMPXCHG(MachineInstr &I) const; bool selectG_STORE(MachineInstr &I) const; bool selectG_SELECT(MachineInstr &I) const; bool selectG_BRCOND(MachineInstr &I) const; - bool selectG_FRAME_INDEX(MachineInstr &I) const; - bool selectG_PTR_MASK(MachineInstr &I) const; + bool selectG_FRAME_INDEX_GLOBAL_VALUE(MachineInstr &I) const; + bool selectG_PTRMASK(MachineInstr &I) const; bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const; + bool 
selectG_INSERT_VECTOR_ELT(MachineInstr &I) const; + bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const; std::pair<Register, unsigned> - selectVOP3ModsImpl(Register Src) const; + selectVOP3ModsImpl(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVCSRC(MachineOperand &Root) const; @@ -134,11 +152,18 @@ private: selectVOP3OMods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVOP3Mods(MachineOperand &Root) const; + + ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3Mods_nnan(MachineOperand &Root) const; + std::pair<Register, unsigned> + selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI) const; + InstructionSelector::ComplexRendererFns - selectVOP3OpSelMods0(MachineOperand &Root) const; + selectVOP3PMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; @@ -163,19 +188,86 @@ private: InstructionSelector::ComplexRendererFns selectMUBUFScratchOffset(MachineOperand &Root) const; - bool isDSOffsetLegal(const MachineRegisterInfo &MRI, - const MachineOperand &Base, - int64_t Offset, unsigned OffsetBits) const; + bool isDSOffsetLegal(Register Base, int64_t Offset, + unsigned OffsetBits) const; + std::pair<Register, unsigned> + selectDS1Addr1OffsetImpl(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectDS1Addr1Offset(MachineOperand &Root) const; + std::pair<Register, unsigned> + selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectDS64Bit4ByteAligned(MachineOperand &Root) const; + + std::pair<Register, int64_t> + getPtrBaseWithConstantOffset(Register Root, + const MachineRegisterInfo &MRI) const; + + // Parse out a chain of up to two g_ptr_add instructions. 
+ // g_ptr_add (n0, _) + // g_ptr_add (n0, (n1 = g_ptr_add n2, n3)) + struct MUBUFAddressData { + Register N0, N2, N3; + int64_t Offset = 0; + }; + + bool shouldUseAddr64(MUBUFAddressData AddrData) const; + + void splitIllegalMUBUFOffset(MachineIRBuilder &B, + Register &SOffset, int64_t &ImmOffset) const; + + MUBUFAddressData parseMUBUFAddress(Register Src) const; + + bool selectMUBUFAddr64Impl(MachineOperand &Root, Register &VAddr, + Register &RSrcReg, Register &SOffset, + int64_t &Offset) const; + + bool selectMUBUFOffsetImpl(MachineOperand &Root, Register &RSrcReg, + Register &SOffset, int64_t &Offset) const; + + InstructionSelector::ComplexRendererFns + selectMUBUFAddr64(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectMUBUFOffset(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectMUBUFOffsetAtomic(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectMUBUFAddr64Atomic(MachineOperand &Root) const; + + ComplexRendererFns selectSMRDBufferImm(MachineOperand &Root) const; + ComplexRendererFns selectSMRDBufferImm32(MachineOperand &Root) const; + void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx = -1) const; void renderTruncTImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + void renderTruncTImm1(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const { + renderTruncTImm(MIB, MI, OpIdx); + } + + void renderTruncTImm8(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const { + renderTruncTImm(MIB, MI, OpIdx); + } + + void renderTruncTImm16(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const { + renderTruncTImm(MIB, MI, OpIdx); + } + + void renderTruncTImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const { + renderTruncTImm(MIB, MI, OpIdx); + } + void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; @@ -184,6 +276,14 @@ private: void renderPopcntImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + void renderExtractGLC(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderExtractSLC(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderExtractDLC(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; bool isInlineImmediate16(int64_t Imm) const; bool isInlineImmediate32(int64_t Imm) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 7e71dbdd12408..5cb7ac320d2fb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -77,6 +77,9 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> def TruePredicate : Predicate<"">; +// FIXME: Tablegen should specially supports this +def FalsePredicate : Predicate<"false">; + // Add a predicate to the list if does not already exist to deduplicate it. 
class PredConcat<list<Predicate> lst, Predicate pred> { list<Predicate> ret = @@ -101,12 +104,12 @@ class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl; let RecomputePerFunction = 1 in { -def FP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">; -def FP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals">; -def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">; -def NoFP16Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">; -def NoFP32Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals">; -def NoFP64Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">; +def FP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">; +def FP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()">; +def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">; +def NoFP16Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">; +def NoFP32Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()">; +def NoFP64Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">; def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; } @@ -408,7 +411,12 @@ def atomic_load_64_#as : PatFrag<(ops node:$ptr), (atomic_load_64 node:$ptr)> { let IsAtomic = 1; let MemoryVT = i64; } +} // End let AddressSpaces +} // End foreach as + +foreach as = [ "global", "flat", "local", "private", "region" ] in { +let AddressSpaces = !cast<AddressSpaceList>("StoreAddress_"#as).AddrSpaces in { def store_#as : PatFrag<(ops node:$val, node:$ptr), (unindexedstore node:$val, node:$ptr)> { let IsStore = 1; @@ -444,8 +452,8 @@ def truncstorei16_hi16_#as : StoreHi16<truncstorei16>; defm atomic_store_#as : binary_atomic_op<atomic_store>; -} // End let AddressSpaces = ... 
-} // End foreach AddrSpace +} // End let AddressSpaces +} // End foreach as multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { @@ -520,7 +528,7 @@ class Constants { int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; -int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding +int FP_4294966784 = 0x4f7ffffe; // 4294966784 = 4294967296 - 512 = 2^32 - 2^9 int FP16_ONE = 0x3C00; int FP16_NEG_ONE = 0xBC00; int FP32_ONE = 0x3f800000; @@ -731,6 +739,12 @@ multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> { >; } +// fshr pattern +class FSHRPattern <Instruction BIT_ALIGN> : AMDGPUPat < + (fshr i32:$src0, i32:$src1, i32:$src2), + (BIT_ALIGN $src0, $src1, $src2) +>; + // rotr pattern class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat < (rotr i32:$src0, i32:$src1), @@ -796,3 +810,13 @@ def fmaxnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1), [(fmaxnum_ieee_oneuse node:$src0, node:$src1), (fmaxnum_oneuse node:$src0, node:$src1)] >; + +def any_fmad : PatFrags<(ops node:$src0, node:$src1, node:$src2), + [(fmad node:$src0, node:$src1, node:$src2), + (AMDGPUfmad_ftz node:$src0, node:$src1, node:$src2)] +>; + +// FIXME: fsqrt should not select directly +def any_amdgcn_sqrt : PatFrags<(ops node:$src0), + [(fsqrt node:$src0), (int_amdgcn_sqrt node:$src0)] +>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 3f99d5cfb7f9a..2976794b49c3b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -11,19 +11,16 @@ /// \todo This should be generated by TableGen. //===----------------------------------------------------------------------===// -#if defined(_MSC_VER) || defined(__MINGW32__) -// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI -// from the Visual C++ cmath / math.h headers: -// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 -#define _USE_MATH_DEFINES -#endif +#include "AMDGPULegalizerInfo.h" #include "AMDGPU.h" -#include "AMDGPULegalizerInfo.h" +#include "AMDGPUGlobalISelUtils.h" #include "AMDGPUTargetMachine.h" #include "SIMachineFunctionInfo.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" @@ -37,21 +34,30 @@ using namespace llvm; using namespace LegalizeActions; using namespace LegalizeMutations; using namespace LegalityPredicates; - - -static LegalityPredicate isMultiple32(unsigned TypeIdx, - unsigned MaxSize = 1024) { - return [=](const LegalityQuery &Query) { - const LLT Ty = Query.Types[TypeIdx]; - const LLT EltTy = Ty.getScalarType(); - return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; - }; +using namespace MIPatternMatch; + +// Hack until load/store selection patterns support any tuple of legal types. 
+static cl::opt<bool> EnableNewLegality( + "amdgpu-global-isel-new-legality", + cl::desc("Use GlobalISel desired legality, rather than try to use" + "rules compatible with selection patterns"), + cl::init(false), + cl::ReallyHidden); + +static constexpr unsigned MaxRegisterSize = 1024; + +// Round the number of elements to the next power of two elements +static LLT getPow2VectorType(LLT Ty) { + unsigned NElts = Ty.getNumElements(); + unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); + return Ty.changeNumElements(Pow2NElts); } -static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { - return [=](const LegalityQuery &Query) { - return Query.Types[TypeIdx].getSizeInBits() == Size; - }; +// Round the number of bits to the next power of two bits +static LLT getPow2ScalarType(LLT Ty) { + unsigned Bits = Ty.getSizeInBits(); + unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); + return LLT::scalar(Pow2Bits); } static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { @@ -109,6 +115,23 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { }; } +static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + unsigned Size = Ty.getSizeInBits(); + + LLT CoercedTy; + if (Size <= 32) { + // <2 x s8> -> s16 + // <4 x s8> -> s32 + CoercedTy = LLT::scalar(Size); + } else + CoercedTy = LLT::scalarOrVector(Size / 32, 32); + + return std::make_pair(TypeIdx, CoercedTy); + }; +} + static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { return [=](const LegalityQuery &Query) { const LLT QueryTy = Query.Types[TypeIdx]; @@ -130,25 +153,47 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { }; } -// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of -// v2s16. +static bool isRegisterSize(unsigned Size) { + return Size % 32 == 0 && Size <= MaxRegisterSize; +} + +static bool isRegisterVectorElementType(LLT EltTy) { + const int EltSize = EltTy.getSizeInBits(); + return EltSize == 16 || EltSize % 32 == 0; +} + +static bool isRegisterVectorType(LLT Ty) { + const int EltSize = Ty.getElementType().getSizeInBits(); + return EltSize == 32 || EltSize == 64 || + (EltSize == 16 && Ty.getNumElements() % 2 == 0) || + EltSize == 128 || EltSize == 256; +} + +static bool isRegisterType(LLT Ty) { + if (!isRegisterSize(Ty.getSizeInBits())) + return false; + + if (Ty.isVector()) + return isRegisterVectorType(Ty); + + return true; +} + +// Any combination of 32 or 64-bit elements up the maximum register size, and +// multiples of v2s16. 
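The legalizer's notion of a "register type" above boils down to a size rule plus element-width restrictions for vectors. A tiny self-contained check of just the size rule, mirroring isRegisterSize as defined in this hunk (the vector element cases are left out):

#include <cassert>

// Total size must be a multiple of 32 bits and at most MaxRegisterSize (1024).
static bool isRegSize(unsigned Bits) { return Bits % 32 == 0 && Bits <= 1024; }

int main() {
  assert(isRegSize(96));     // s96, v3s32
  assert(isRegSize(32));     // s32, v2s16
  assert(!isRegSize(48));    // v3s16 has to be widened first
  assert(!isRegSize(1056));  // over the 1024-bit register limit
  return 0;
}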
static LegalityPredicate isRegisterType(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { - const LLT Ty = Query.Types[TypeIdx]; - if (Ty.isVector()) { - const int EltSize = Ty.getElementType().getSizeInBits(); - return EltSize == 32 || EltSize == 64 || - (EltSize == 16 && Ty.getNumElements() % 2 == 0) || - EltSize == 128 || EltSize == 256; - } - - return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; + return isRegisterType(Query.Types[TypeIdx]); }; } -static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { +static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { - return Query.Types[TypeIdx].getElementType() == Type; + const LLT QueryTy = Query.Types[TypeIdx]; + if (!QueryTy.isVector()) + return false; + const LLT EltTy = QueryTy.getElementType(); + return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; }; } @@ -160,6 +205,120 @@ static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { }; } +// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we +// handle some operations by just promoting the register during +// selection. There are also d16 loads on GFX9+ which preserve the high bits. +static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, + bool IsLoad) { + switch (AS) { + case AMDGPUAS::PRIVATE_ADDRESS: + // FIXME: Private element size. + return 32; + case AMDGPUAS::LOCAL_ADDRESS: + return ST.useDS128() ? 128 : 64; + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: + // Treat constant and global as identical. SMRD loads are sometimes usable for + // global loads (ideally constant address space should be eliminated) + // depending on the context. Legality cannot be context dependent, but + // RegBankSelect can split the load as necessary depending on the pointer + // register bank/uniformity and if the memory is invariant or not written in a + // kernel. + return IsLoad ? 512 : 128; + default: + // Flat addresses may contextually need to be split to 32-bit parts if they + // may alias scratch depending on the subtarget. + return 128; + } +} + +static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, + const LegalityQuery &Query, + unsigned Opcode) { + const LLT Ty = Query.Types[0]; + + // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD + const bool IsLoad = Opcode != AMDGPU::G_STORE; + + unsigned RegSize = Ty.getSizeInBits(); + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + unsigned Align = Query.MMODescrs[0].AlignInBits; + unsigned AS = Query.Types[1].getAddressSpace(); + + // All of these need to be custom lowered to cast the pointer operand. + if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) + return false; + + // TODO: We should be able to widen loads if the alignment is high enough, but + // we also need to modify the memory access size. +#if 0 + // Accept widening loads based on alignment. + if (IsLoad && MemSize < Size) + MemSize = std::max(MemSize, Align); +#endif + + // Only 1-byte and 2-byte to 32-bit extloads are valid. + if (MemSize != RegSize && RegSize != 32) + return false; + + if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) + return false; + + switch (MemSize) { + case 8: + case 16: + case 32: + case 64: + case 128: + break; + case 96: + if (!ST.hasDwordx3LoadStores()) + return false; + break; + case 256: + case 512: + // These may contextually need to be broken down. 
+ break; + default: + return false; + } + + assert(RegSize >= MemSize); + + if (Align < MemSize) { + const SITargetLowering *TLI = ST.getTargetLowering(); + if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8)) + return false; + } + + return true; +} + +// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so +// workaround this. Eventually it should ignore the type for loads and only care +// about the size. Return true in cases where we will workaround this for now by +// bitcasting. +static bool loadStoreBitcastWorkaround(const LLT Ty) { + if (EnableNewLegality) + return false; + + const unsigned Size = Ty.getSizeInBits(); + if (Size <= 64) + return false; + if (!Ty.isVector()) + return true; + unsigned EltSize = Ty.getElementType().getSizeInBits(); + return EltSize != 32 && EltSize != 64; +} + +static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, + unsigned Opcode) { + const LLT Ty = Query.Types[0]; + return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && + !loadStoreBitcastWorkaround(Ty); +} + AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const GCNTargetMachine &TM) : ST(ST_) { @@ -170,14 +329,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }; const LLT S1 = LLT::scalar(1); - const LLT S8 = LLT::scalar(8); const LLT S16 = LLT::scalar(16); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); - const LLT S96 = LLT::scalar(96); const LLT S128 = LLT::scalar(128); const LLT S256 = LLT::scalar(256); - const LLT S1024 = LLT::scalar(1024); + const LLT S512 = LLT::scalar(512); + const LLT MaxScalar = LLT::scalar(MaxRegisterSize); const LLT V2S16 = LLT::vector(2, 16); const LLT V4S16 = LLT::vector(4, 16); @@ -244,6 +402,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, S32, S64, S16, V2S16 }; + const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; + setAction({G_BRCOND, S1}, Legal); // VCC branches setAction({G_BRCOND, S32}, Legal); // SCC branches @@ -261,11 +421,19 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) .legalIf(isPointer(0)); - if (ST.has16BitInsts()) { + if (ST.hasVOP3PInsts()) { + getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + .legalFor({S32, S16, V2S16}) + .clampScalar(0, S16, S32) + .clampMaxNumElements(0, S16, 2) + .scalarize(0) + .widenScalarToNextPow2(0, 32); + } else if (ST.has16BitInsts()) { getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32, S16}) .clampScalar(0, S16, S32) - .scalarize(0); + .scalarize(0) + .widenScalarToNextPow2(0, 32); } else { getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32}) @@ -275,7 +443,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // FIXME: Not really legal. Placeholder for custom lowering. getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) - .legalFor({S32, S64}) + .customFor({S32, S64}) .clampScalar(0, S32, S64) .widenScalarToNextPow2(0, 32) .scalarize(0); @@ -298,35 +466,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) .legalFor({{S32, S1}, {S32, S32}}) - .clampScalar(0, S32, S32) - .scalarize(0); // TODO: Implement. - - getActionDefinitionsBuilder({G_SADDO, G_SSUBO}) + .minScalar(0, S32) + // TODO: .scalarize(0) .lower(); getActionDefinitionsBuilder(G_BITCAST) // Don't worry about the size constraint. 
.legalIf(all(isRegisterType(0), isRegisterType(1))) - // FIXME: Testing hack - .legalForCartesianProduct({S16, LLT::vector(2, 8), }); - - getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({S32, S64, S16}) - .clampScalar(0, S16, S64); - - getActionDefinitionsBuilder(G_IMPLICIT_DEF) - .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, - ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) - .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .clampScalarOrElt(0, S32, S1024) - .legalIf(isMultiple32(0)) - .widenScalarToNextPow2(0, 32) - .clampMaxNumElements(0, S32, 16); + .lower(); - // FIXME: i1 operands to intrinsics should always be legal, but other i1 - // values may not be legal. We need to figure out how to distinguish - // between these two scenarios. getActionDefinitionsBuilder(G_CONSTANT) .legalFor({S1, S32, S64, S16, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) @@ -334,10 +483,31 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0) .legalIf(isPointer(0)); + getActionDefinitionsBuilder(G_FCONSTANT) + .legalFor({S32, S64, S16}) + .clampScalar(0, S16, S64); + + getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) + .legalIf(isRegisterType(0)) + // s1 and s16 are special cases because they have legal operations on + // them, but don't really occupy registers in the normal way. + .legalFor({S1, S16}) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .clampScalarOrElt(0, S32, MaxScalar) + .widenScalarToNextPow2(0, 32) + .clampMaxNumElements(0, S32, 16); + setAction({G_FRAME_INDEX, PrivatePtr}, Legal); - getActionDefinitionsBuilder(G_GLOBAL_VALUE) - .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); + // If the amount is divergent, we have to do a wave reduction to get the + // maximum value, so this is expanded during RegBankSelect. + getActionDefinitionsBuilder(G_DYN_STACKALLOC) + .legalFor({{PrivatePtr, S32}}); + + getActionDefinitionsBuilder(G_GLOBAL_VALUE) + .unsupportedFor({PrivatePtr}) + .custom(); + setAction({G_BLOCK_ADDR, CodePtr}, Legal); auto &FPOpActions = getActionDefinitionsBuilder( { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) @@ -397,33 +567,41 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .clampScalar(0, S16, S64); - // TODO: Implement - getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); - if (ST.has16BitInsts()) { getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) .legalFor({S32, S64, S16}) .scalarize(0) .clampScalar(0, S16, S64); } else { - getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) + getActionDefinitionsBuilder(G_FSQRT) .legalFor({S32, S64}) .scalarize(0) .clampScalar(0, S32, S64); + + if (ST.hasFractBug()) { + getActionDefinitionsBuilder(G_FFLOOR) + .customFor({S64}) + .legalFor({S32, S64}) + .scalarize(0) + .clampScalar(0, S32, S64); + } else { + getActionDefinitionsBuilder(G_FFLOOR) + .legalFor({S32, S64}) + .scalarize(0) + .clampScalar(0, S32, S64); + } } getActionDefinitionsBuilder(G_FPTRUNC) .legalFor({{S32, S64}, {S16, S32}}) - .scalarize(0); + .scalarize(0) + .lower(); getActionDefinitionsBuilder(G_FPEXT) .legalFor({{S64, S32}, {S32, S16}}) .lowerFor({{S64, S16}}) // FIXME: Implement .scalarize(0); - // TODO: Verify V_BFI_B32 is generated from expanded bit ops. 
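// For reference, the bit-level expansion the G_FCOPYSIGN lower() produces and
// that the TODO above expects V_BFI_B32 to cover in one instruction. Host
// sketch for f32 only; the function name is illustrative, not in-tree code:
#include <cstdint>

static uint32_t copySignBitsF32(uint32_t MagBits, uint32_t SignBits) {
  const uint32_t SignMask = 0x80000000u;
  // V_BFI_B32 with 0x7fffffff as the select mask computes exactly this blend.
  return (MagBits & ~SignMask) | (SignBits & SignMask);
}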
- getActionDefinitionsBuilder(G_FCOPYSIGN).lower(); - getActionDefinitionsBuilder(G_FSUB) // Use actual fsub instruction .legalFor({S32}) @@ -434,22 +612,32 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // Whether this is legal depends on the floating point mode for the function. auto &FMad = getActionDefinitionsBuilder(G_FMAD); - if (ST.hasMadF16()) + if (ST.hasMadF16() && ST.hasMadMacF32Insts()) FMad.customFor({S32, S16}); - else + else if (ST.hasMadMacF32Insts()) FMad.customFor({S32}); + else if (ST.hasMadF16()) + FMad.customFor({S16}); FMad.scalarize(0) .lower(); + // TODO: Do we need to clamp maximum bitwidth? + getActionDefinitionsBuilder(G_TRUNC) + .legalIf(isScalar(0)) + .legalFor({{V2S16, V2S32}}) + .clampMaxNumElements(0, S16, 2) + // Avoid scalarizing in cases that should be truly illegal. In unresolvable + // situations (like an invalid implicit use), we don't want to infinite loop + // in the legalizer. + .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) + .alwaysLegal(); + getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, - {S32, S1}, {S64, S1}, {S16, S1}, - {S96, S32}, - // FIXME: Hack - {S64, LLT::scalar(33)}, - {S32, S8}, {S32, LLT::scalar(24)}}) + {S32, S1}, {S64, S1}, {S16, S1}}) .scalarize(0) - .clampScalar(0, S32, S64); + .clampScalar(0, S32, S64) + .widenScalarToNextPow2(1, 32); // TODO: Split s1->s64 during regbankselect for VALU. auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) @@ -460,17 +648,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.has16BitInsts()) IToFP.legalFor({{S16, S16}}); IToFP.clampScalar(1, S32, S64) - .scalarize(0); + .scalarize(0) + .widenScalarToNextPow2(1); auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) - .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}); + .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) + .customFor({{S64, S64}}); if (ST.has16BitInsts()) FPToI.legalFor({{S16, S16}}); else FPToI.minScalar(1, S32); FPToI.minScalar(0, S32) - .scalarize(0); + .scalarize(0) + .lower(); getActionDefinitionsBuilder(G_INTRINSIC_ROUND) .scalarize(0) @@ -494,16 +685,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); } + // FIXME: Clamp offset operand. getActionDefinitionsBuilder(G_PTR_ADD) - .legalForCartesianProduct(AddrSpaces64, {S64}) - .legalForCartesianProduct(AddrSpaces32, {S32}) + .legalIf(isPointer(0)) .scalarize(0); - getActionDefinitionsBuilder(G_PTR_MASK) - .scalarize(0) - .alwaysLegal(); - - setAction({G_BLOCK_ADDR, CodePtr}, Legal); + getActionDefinitionsBuilder(G_PTRMASK) + .legalIf(typeInSet(1, {S64, S32})) + .minScalar(1, S32) + .maxScalarIf(sizeIs(0, 32), 1, S32) + .maxScalarIf(sizeIs(0, 64), 1, S64) + .scalarize(0); auto &CmpBuilder = getActionDefinitionsBuilder(G_ICMP) @@ -537,16 +729,45 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampScalar(1, S32, S64) .scalarize(0); - // FIXME: fexp, flog2, flog10 needs to be custom lowered. - getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2, - G_FLOG, G_FLOG2, G_FLOG10}) - .legalFor({S32}) - .scalarize(0); + // FIXME: fpow has a selection pattern that should move to custom lowering. 
+ auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); + if (ST.has16BitInsts()) + Exp2Ops.legalFor({S32, S16}); + else + Exp2Ops.legalFor({S32}); + Exp2Ops.clampScalar(0, MinScalarFPTy, S32); + Exp2Ops.scalarize(0); + + auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); + if (ST.has16BitInsts()) + ExpOps.customFor({{S32}, {S16}}); + else + ExpOps.customFor({S32}); + ExpOps.clampScalar(0, MinScalarFPTy, S32) + .scalarize(0); + + // The 64-bit versions produce 32-bit results, but only on the SALU. + getActionDefinitionsBuilder(G_CTPOP) + .legalFor({{S32, S32}, {S32, S64}}) + .clampScalar(0, S32, S32) + .clampScalar(1, S32, S64) + .scalarize(0) + .widenScalarToNextPow2(0, 32) + .widenScalarToNextPow2(1, 32); + + // The hardware instructions return a different result on 0 than the generic + // instructions expect. The hardware produces -1, but these produce the + // bitwidth. + getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) + .scalarize(0) + .clampScalar(0, S32, S32) + .clampScalar(1, S32, S64) + .widenScalarToNextPow2(0, 32) + .widenScalarToNextPow2(1, 32) + .lower(); // The 64-bit versions produce 32-bit results, but only on the SALU. - getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, - G_CTTZ, G_CTTZ_ZERO_UNDEF, - G_CTPOP}) + getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) .legalFor({{S32, S32}, {S32, S64}}) .clampScalar(0, S32, S32) .clampScalar(1, S32, S64) @@ -554,50 +775,58 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0, 32) .widenScalarToNextPow2(1, 32); - // TODO: Expand for > s32 - getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) + getActionDefinitionsBuilder(G_BITREVERSE) .legalFor({S32}) .clampScalar(0, S32, S32) .scalarize(0); if (ST.has16BitInsts()) { + getActionDefinitionsBuilder(G_BSWAP) + .legalFor({S16, S32, V2S16}) + .clampMaxNumElements(0, S16, 2) + // FIXME: Fixing non-power-of-2 before clamp is workaround for + // narrowScalar limitation. + .widenScalarToNextPow2(0) + .clampScalar(0, S16, S32) + .scalarize(0); + if (ST.hasVOP3PInsts()) { getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) .legalFor({S32, S16, V2S16}) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) .clampMaxNumElements(0, S16, 2) - .clampScalar(0, S16, S32) + .minScalar(0, S16) .widenScalarToNextPow2(0) - .scalarize(0); + .scalarize(0) + .lower(); } else { getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) .legalFor({S32, S16}) .widenScalarToNextPow2(0) - .clampScalar(0, S16, S32) - .scalarize(0); + .minScalar(0, S16) + .scalarize(0) + .lower(); } } else { + // TODO: Should have same legality without v_perm_b32 + getActionDefinitionsBuilder(G_BSWAP) + .legalFor({S32}) + .lowerIf(scalarNarrowerThan(0, 32)) + // FIXME: Fixing non-power-of-2 before clamp is workaround for + // narrowScalar limitation. 
+ .widenScalarToNextPow2(0) + .maxScalar(0, S32) + .scalarize(0) + .lower(); + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) .legalFor({S32}) - .clampScalar(0, S32, S32) + .minScalar(0, S32) .widenScalarToNextPow2(0) - .scalarize(0); + .scalarize(0) + .lower(); } - auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { - return [=](const LegalityQuery &Query) { - return Query.Types[TypeIdx0].getSizeInBits() < - Query.Types[TypeIdx1].getSizeInBits(); - }; - }; - - auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { - return [=](const LegalityQuery &Query) { - return Query.Types[TypeIdx0].getSizeInBits() > - Query.Types[TypeIdx1].getSizeInBits(); - }; - }; - getActionDefinitionsBuilder(G_INTTOPTR) // List the common cases .legalForCartesianProduct(AddrSpaces64, {S64}) @@ -609,7 +838,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, [](const LegalityQuery &Query) { return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); }) - .narrowScalarIf(greaterThan(1, 0), + .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) { return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); }); @@ -626,7 +855,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); }) .narrowScalarIf( - greaterThan(0, 1), + largerThan(0, 1), [](const LegalityQuery &Query) { return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); }); @@ -635,33 +864,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .custom(); - // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we - // handle some operations by just promoting the register during - // selection. There are also d16 loads on GFX9+ which preserve the high bits. - auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { - switch (AS) { - // FIXME: Private element size. - case AMDGPUAS::PRIVATE_ADDRESS: - return 32; - // FIXME: Check subtarget - case AMDGPUAS::LOCAL_ADDRESS: - return ST.useDS128() ? 128 : 64; - - // Treat constant and global as identical. SMRD loads are sometimes usable - // for global loads (ideally constant address space should be eliminated) - // depending on the context. Legality cannot be context dependent, but - // RegBankSelect can split the load as necessary depending on the pointer - // register bank/uniformity and if the memory is invariant or not written in - // a kernel. - case AMDGPUAS::CONSTANT_ADDRESS: - case AMDGPUAS::GLOBAL_ADDRESS: - return 512; - default: - return 128; - } - }; - - const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { + const auto needToSplitMemOp = [=](const LegalityQuery &Query, + bool IsLoad) -> bool { const LLT DstTy = Query.Types[0]; // Split vector extloads. @@ -676,14 +880,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT PtrTy = Query.Types[1]; unsigned AS = PtrTy.getAddressSpace(); - if (MemSize > maxSizeForAddrSpace(AS)) + if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) return true; // Catch weird sized loads that don't evenly divide into the access sizes // TODO: May be able to widen depending on alignment etc. - unsigned NumRegs = MemSize / 32; - if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) - return true; + unsigned NumRegs = (MemSize + 31) / 32; + if (NumRegs == 3) { + if (!ST.hasDwordx3LoadStores()) + return true; + } else { + // If the alignment allows, these should have been widened. 
+ if (!isPowerOf2_32(NumRegs)) + return true; + } if (Align < MemSize) { const SITargetLowering *TLI = ST.getTargetLowering(); @@ -693,6 +903,24 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return false; }; + const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, + unsigned Opc) -> bool { + unsigned Size = Query.Types[0].getSizeInBits(); + if (isPowerOf2_32(Size)) + return false; + + if (Size == 96 && ST.hasDwordx3LoadStores()) + return false; + + unsigned AddrSpace = Query.Types[1].getAddressSpace(); + if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) + return false; + + unsigned Align = Query.MMODescrs[0].AlignInBits; + unsigned RoundedSize = NextPowerOf2(Size); + return (Align >= RoundedSize); + }; + unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; @@ -705,17 +933,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const bool IsStore = Op == G_STORE; auto &Actions = getActionDefinitionsBuilder(Op); - // Whitelist the common cases. - // TODO: Pointer loads - // TODO: Wide constant loads - // TODO: Only CI+ has 3x loads - // TODO: Loads to s16 on gfx9 + // Explicitly list some common cases. + // TODO: Does this help compile time at all? Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, {V2S32, GlobalPtr, 64, GlobalAlign32}, - {V3S32, GlobalPtr, 96, GlobalAlign32}, - {S96, GlobalPtr, 96, GlobalAlign32}, {V4S32, GlobalPtr, 128, GlobalAlign32}, - {S128, GlobalPtr, 128, GlobalAlign32}, {S64, GlobalPtr, 64, GlobalAlign32}, {V2S64, GlobalPtr, 128, GlobalAlign32}, {V2S16, GlobalPtr, 32, GlobalAlign32}, @@ -734,23 +956,60 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, {S32, PrivatePtr, 16, 16}, {V2S16, PrivatePtr, 32, 32}, - {S32, FlatPtr, 32, GlobalAlign32}, - {S32, FlatPtr, 16, GlobalAlign16}, - {S32, FlatPtr, 8, GlobalAlign8}, - {V2S16, FlatPtr, 32, GlobalAlign32}, - {S32, ConstantPtr, 32, GlobalAlign32}, {V2S32, ConstantPtr, 64, GlobalAlign32}, - {V3S32, ConstantPtr, 96, GlobalAlign32}, {V4S32, ConstantPtr, 128, GlobalAlign32}, {S64, ConstantPtr, 64, GlobalAlign32}, - {S128, ConstantPtr, 128, GlobalAlign32}, {V2S32, ConstantPtr, 32, GlobalAlign32}}); + Actions.legalIf( + [=](const LegalityQuery &Query) -> bool { + return isLoadStoreLegal(ST, Query, Op); + }); + + // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to + // 64-bits. + // + // TODO: Should generalize bitcast action into coerce, which will also cover + // inserting addrspacecasts. + Actions.customIf(typeIs(1, Constant32Ptr)); + + // Turn any illegal element vectors into something easier to deal + // with. These will ultimately produce 32-bit scalar shifts to extract the + // parts anyway. + // + // For odd 16-bit element vectors, prefer to split those into pieces with + // 16-bit vector parts. + Actions.bitcastIf( + [=](const LegalityQuery &Query) -> bool { + const LLT Ty = Query.Types[0]; + const unsigned Size = Ty.getSizeInBits(); + + if (Size != Query.MMODescrs[0].SizeInBits) + return Size <= 32 && Ty.isVector(); + + if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) + return true; + return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && + !isRegisterVectorElementType(Ty.getElementType()); + }, bitcastToRegisterType(0)); + Actions .customIf(typeIs(1, Constant32Ptr)) + // Widen suitably aligned loads by loading extra elements. 
+ .moreElementsIf([=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[0]; + return Op == G_LOAD && Ty.isVector() && + shouldWidenLoadResult(Query, Op); + }, moreElementsToNextPow2(0)) + .widenScalarIf([=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[0]; + return Op == G_LOAD && !Ty.isVector() && + shouldWidenLoadResult(Query, Op); + }, widenScalarOrEltToNextPow2(0)) .narrowScalarIf( [=](const LegalityQuery &Query) -> bool { - return !Query.Types[0].isVector() && needToSplitLoad(Query); + return !Query.Types[0].isVector() && + needToSplitMemOp(Query, Op == G_LOAD); }, [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { const LLT DstTy = Query.Types[0]; @@ -763,13 +1022,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (DstSize > MemSize) return std::make_pair(0, LLT::scalar(MemSize)); + if (!isPowerOf2_32(DstSize)) { + // We're probably decomposing an odd sized store. Try to split + // to the widest type. TODO: Account for alignment. As-is it + // should be OK, since the new parts will be further legalized. + unsigned FloorSize = PowerOf2Floor(DstSize); + return std::make_pair(0, LLT::scalar(FloorSize)); + } + if (DstSize > 32 && (DstSize % 32 != 0)) { // FIXME: Need a way to specify non-extload of larger size if // suitably aligned. return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); } - unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); + unsigned MaxSize = maxSizeForAddrSpace(ST, + PtrTy.getAddressSpace(), + Op == G_LOAD); if (MemSize > MaxSize) return std::make_pair(0, LLT::scalar(MaxSize)); @@ -778,18 +1047,32 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }) .fewerElementsIf( [=](const LegalityQuery &Query) -> bool { - return Query.Types[0].isVector() && needToSplitLoad(Query); + return Query.Types[0].isVector() && + needToSplitMemOp(Query, Op == G_LOAD); }, [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { const LLT DstTy = Query.Types[0]; const LLT PtrTy = Query.Types[1]; LLT EltTy = DstTy.getElementType(); - unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); + unsigned MaxSize = maxSizeForAddrSpace(ST, + PtrTy.getAddressSpace(), + Op == G_LOAD); + + // FIXME: Handle widened to power of 2 results better. This ends + // up scalarizing. + // FIXME: 3 element stores scalarized on SI // Split if it's too large for the address space. if (Query.MMODescrs[0].SizeInBits > MaxSize) { unsigned NumElts = DstTy.getNumElements(); + unsigned EltSize = EltTy.getSizeInBits(); + + if (MaxSize % EltSize == 0) { + return std::make_pair( + 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); + } + unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; // FIXME: Refine when odd breakdowns handled @@ -802,9 +1085,24 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, LLT::vector(NumElts / NumPieces, EltTy)); } + // FIXME: We could probably handle weird extending loads better. + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + if (DstTy.getSizeInBits() > MemSize) + return std::make_pair(0, EltTy); + + unsigned EltSize = EltTy.getSizeInBits(); + unsigned DstSize = DstTy.getSizeInBits(); + if (!isPowerOf2_32(DstSize)) { + // We're probably decomposing an odd sized store. Try to split + // to the widest type. TODO: Account for alignment. As-is it + // should be OK, since the new parts will be further legalized. 
+ unsigned FloorSize = PowerOf2Floor(DstSize); + return std::make_pair( + 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); + } + // Need to split because of alignment. unsigned Align = Query.MMODescrs[0].AlignInBits; - unsigned EltSize = EltTy.getSizeInBits(); if (EltSize > Align && (EltSize / Align < DstTy.getNumElements())) { return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); @@ -820,39 +1118,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // TODO: Need a bitcast lower option? Actions - .legalIf([=](const LegalityQuery &Query) { - const LLT Ty0 = Query.Types[0]; - unsigned Size = Ty0.getSizeInBits(); - unsigned MemSize = Query.MMODescrs[0].SizeInBits; - unsigned Align = Query.MMODescrs[0].AlignInBits; - - // FIXME: Widening store from alignment not valid. - if (MemSize < Size) - MemSize = std::max(MemSize, Align); - - // No extending vector loads. - if (Size > MemSize && Ty0.isVector()) - return false; - - switch (MemSize) { - case 8: - case 16: - return Size == 32; - case 32: - case 64: - case 128: - return true; - case 96: - return ST.hasDwordx3LoadStores(); - case 256: - case 512: - return true; - default: - return false; - } - }) .widenScalarToNextPow2(0) - // TODO: v3s32->v4s32 with alignment .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); } @@ -886,8 +1152,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); } - getActionDefinitionsBuilder(G_ATOMICRMW_FADD) - .legalFor({{S32, LocalPtr}}); + if (ST.hasLDSFPAtomics()) { + getActionDefinitionsBuilder(G_ATOMICRMW_FADD) + .legalFor({{S32, LocalPtr}}); + } // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output // demarshalling @@ -896,10 +1164,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, {S32, FlatPtr}, {S64, FlatPtr}}) .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, {S32, RegionPtr}, {S64, RegionPtr}}); - - getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) - .lower(); - // TODO: Pointer types, any 32-bit or 64-bit vector // Condition should be s32 for scalar, s1 for vector. @@ -908,9 +1172,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) .clampScalar(0, S16, S64) + .scalarize(1) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) .fewerElementsIf(numElementsNotEven(0), scalarize(0)) - .scalarize(1) .clampMaxNumElements(0, S32, 2) .clampMaxNumElements(0, LocalPtr, 2) .clampMaxNumElements(0, PrivatePtr, 2) @@ -924,12 +1188,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor({{S32, S32}, {S64, S32}}); if (ST.has16BitInsts()) { if (ST.hasVOP3PInsts()) { - Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) + Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) .clampMaxNumElements(0, S16, 2); } else - Shifts.legalFor({{S16, S32}, {S16, S16}}); + Shifts.legalFor({{S16, S16}}); - // TODO: Support 16-bit shift amounts + // TODO: Support 16-bit shift amounts for all types + Shifts.widenScalarIf( + [=](const LegalityQuery &Query) { + // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a + // 32-bit amount. 
+ const LLT ValTy = Query.Types[0]; + const LLT AmountTy = Query.Types[1]; + return ValTy.getSizeInBits() <= 16 && + AmountTy.getSizeInBits() < 16; + }, changeTo(1, S16)); + Shifts.maxScalarIf(typeIs(0, S16), 1, S16); Shifts.clampScalar(1, S32, S32); Shifts.clampScalar(0, S16, S64); Shifts.widenScalarToNextPow2(0, 16); @@ -956,7 +1230,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return (EltTy.getSizeInBits() == 16 || EltTy.getSizeInBits() % 32 == 0) && VecTy.getSizeInBits() % 32 == 0 && - VecTy.getSizeInBits() <= 1024 && + VecTy.getSizeInBits() <= MaxRegisterSize && IdxTy.getSizeInBits() == 32; }) .clampScalar(EltTypeIdx, S32, S64) @@ -1008,28 +1282,40 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampNumElements(0, V2S64, V16S64) .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); - if (ST.hasScalarPackInsts()) - BuildVector.legalFor({V2S16, S32}); - - BuildVector - .minScalarSameAs(1, 0) - .legalIf(isRegisterType(0)) - .minScalarOrElt(0, S32); - if (ST.hasScalarPackInsts()) { + BuildVector + // FIXME: Should probably widen s1 vectors straight to s32 + .minScalarOrElt(0, S16) + // Widen source elements and produce a G_BUILD_VECTOR_TRUNC + .minScalar(1, S32); + getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) .legalFor({V2S16, S32}) .lower(); + BuildVector.minScalarOrElt(0, S32); } else { + BuildVector.customFor({V2S16, S16}); + BuildVector.minScalarOrElt(0, S32); + getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) + .customFor({V2S16, S32}) .lower(); } + BuildVector.legalIf(isRegisterType(0)); + + // FIXME: Clamp maximum size getActionDefinitionsBuilder(G_CONCAT_VECTORS) .legalIf(isRegisterType(0)); - // TODO: Don't fully scalarize v2s16 pieces - getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); + // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse + // pre-legalize. + if (ST.hasVOP3PInsts()) { + getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) + .customFor({V2S16, V2S16}) + .lower(); + } else + getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); // Merge/Unmerge for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { @@ -1037,10 +1323,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { - const LLT &Ty = Query.Types[TypeIdx]; + const LLT Ty = Query.Types[TypeIdx]; if (Ty.isVector()) { const LLT &EltTy = Ty.getElementType(); - if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) + if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) return true; if (!isPowerOf2_32(EltTy.getSizeInBits())) return true; @@ -1049,25 +1335,32 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }; auto &Builder = getActionDefinitionsBuilder(Op) + .lowerFor({{S16, V2S16}}) + .lowerIf([=](const LegalityQuery &Query) { + const LLT BigTy = Query.Types[BigTyIdx]; + return BigTy.getSizeInBits() == 32; + }) + // Try to widen to s16 first for small types. + // TODO: Only do this on targets with legal s16 shifts + .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) - // Clamp the little scalar to s8-s256 and make it a power of 2. It's not - // worth considering the multiples of 64 since 2*192 and 2*384 are not - // valid. 
- .clampScalar(LitTyIdx, S16, S256) - .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), elementTypeIs(1, S16)), changeTo(1, V2S16)) + // Clamp the little scalar to s8-s256 and make it a power of 2. It's not + // worth considering the multiples of 64 since 2*192 and 2*384 are not + // valid. + .clampScalar(LitTyIdx, S32, S512) + .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) // Break up vectors with weird elements into scalars .fewerElementsIf( - [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, + [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, scalarize(0)) .fewerElementsIf( - [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, + [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, scalarize(1)) - .clampScalar(BigTyIdx, S32, S1024) - .lowerFor({{S16, V2S16}}); + .clampScalar(BigTyIdx, S32, MaxScalar); if (Op == G_MERGE_VALUES) { Builder.widenScalarIf( @@ -1108,22 +1401,68 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return BigTy.getSizeInBits() % 16 == 0 && LitTy.getSizeInBits() % 16 == 0 && - BigTy.getSizeInBits() <= 1024; + BigTy.getSizeInBits() <= MaxRegisterSize; }) // Any vectors left are the wrong size. Scalarize them. .scalarize(0) .scalarize(1); } - getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + // S64 is only legal on SALU, and needs to be broken into 32-bit elements in + // RegBankSelect. + auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) + .legalFor({{S32}, {S64}}); + + if (ST.hasVOP3PInsts()) { + SextInReg.lowerFor({{V2S16}}) + // Prefer to reduce vector widths for 16-bit vectors before lowering, to + // get more vector shift opportunities, since we'll get those when + // expanded. + .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); + } else if (ST.has16BitInsts()) { + SextInReg.lowerFor({{S32}, {S64}, {S16}}); + } else { + // Prefer to promote to s32 before lowering if we don't have 16-bit + // shifts. This avoid a lot of intermediate truncate and extend operations. + SextInReg.lowerFor({{S32}, {S64}}); + } + + // FIXME: Placeholder rule. Really depends on whether the clamp modifier is + // available, and is selectively legal for s16, s32, v2s16. 
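// When G_SEXT_INREG is not directly legal, the trailing .lower() a few lines
// below expands it as a left shift followed by an arithmetic right shift.
// Host sketch for the 32-bit case (assumes an arithmetic >> on signed int,
// as on the relevant targets; name is illustrative only):
#include <cstdint>

static int32_t sextInReg32(int32_t X, unsigned FromBits) {
  const unsigned Amt = 32 - FromBits; // FromBits in [1, 32]
  return static_cast<int32_t>(static_cast<uint32_t>(X) << Amt) >> Amt;
}
// e.g. sextInReg32(0xFF, 8) == -1 and sextInReg32(0x7F, 8) == 127.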
+ getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT}) + .scalarize(0) + .clampScalar(0, S16, S32); - getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower(); + SextInReg + .scalarize(0) + .clampScalar(0, S32, S64) + .lower(); + + getActionDefinitionsBuilder(G_FSHR) + .legalFor({{S32, S32}}) + .scalarize(0) + .lower(); getActionDefinitionsBuilder(G_READCYCLECOUNTER) .legalFor({S64}); + getActionDefinitionsBuilder({ + // TODO: Verify V_BFI_B32 is generated from expanded bit ops + G_FCOPYSIGN, + + G_ATOMIC_CMPXCHG_WITH_SUCCESS, + G_READ_REGISTER, + G_WRITE_REGISTER, + + G_SADDO, G_SSUBO, + + // TODO: Implement + G_FMINIMUM, G_FMAXIMUM, + G_FSHL + }).lower(); + getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, - G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, + G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) .unsupported(); @@ -1131,10 +1470,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, verify(*ST.getInstrInfo()); } -bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B, - GISelChangeObserver &Observer) const { +bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, + MachineInstr &MI) const { + MachineIRBuilder &B = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *B.getMRI(); + GISelChangeObserver &Observer = Helper.Observer; + switch (MI.getOpcode()) { case TargetOpcode::G_ADDRSPACE_CAST: return legalizeAddrSpaceCast(MI, MRI, B); @@ -1148,15 +1489,21 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, return legalizeITOFP(MI, MRI, B, true); case TargetOpcode::G_UITOFP: return legalizeITOFP(MI, MRI, B, false); + case TargetOpcode::G_FPTOSI: + return legalizeFPTOI(MI, MRI, B, true); + case TargetOpcode::G_FPTOUI: + return legalizeFPTOI(MI, MRI, B, false); case TargetOpcode::G_FMINNUM: case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FMINNUM_IEEE: case TargetOpcode::G_FMAXNUM_IEEE: - return legalizeMinNumMaxNum(MI, MRI, B); + return legalizeMinNumMaxNum(Helper, MI); case TargetOpcode::G_EXTRACT_VECTOR_ELT: return legalizeExtractVectorElt(MI, MRI, B); case TargetOpcode::G_INSERT_VECTOR_ELT: return legalizeInsertVectorElt(MI, MRI, B); + case TargetOpcode::G_SHUFFLE_VECTOR: + return legalizeShuffleVector(MI, MRI, B); case TargetOpcode::G_FSIN: case TargetOpcode::G_FCOS: return legalizeSinCos(MI, MRI, B); @@ -1168,8 +1515,26 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, return legalizeFMad(MI, MRI, B); case TargetOpcode::G_FDIV: return legalizeFDIV(MI, MRI, B); + case TargetOpcode::G_UDIV: + case TargetOpcode::G_UREM: + return legalizeUDIV_UREM(MI, MRI, B); + case TargetOpcode::G_SDIV: + case TargetOpcode::G_SREM: + return legalizeSDIV_SREM(MI, MRI, B); case TargetOpcode::G_ATOMIC_CMPXCHG: return legalizeAtomicCmpXChg(MI, MRI, B); + case TargetOpcode::G_FLOG: + return legalizeFlog(MI, B, numbers::ln2f); + case TargetOpcode::G_FLOG10: + return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); + case TargetOpcode::G_FEXP: + return legalizeFExp(MI, B); + case TargetOpcode::G_FPOW: + return legalizeFPow(MI, B); + case TargetOpcode::G_FFLOOR: + return legalizeFFloor(MI, MRI, B); + case TargetOpcode::G_BUILD_VECTOR: + return legalizeBuildVector(MI, MRI, B); default: return false; } @@ -1201,7 +1566,6 @@ Register AMDGPULegalizerInfo::getSegmentAperture( Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; - Register ApertureReg = 
MRI.createGenericVirtualRegister(S32); Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); B.buildInstr(AMDGPU::S_GETREG_B32) @@ -1210,12 +1574,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture( MRI.setType(GetReg, S32); auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); - B.buildInstr(TargetOpcode::G_SHL) - .addDef(ApertureReg) - .addUse(GetReg) - .addUse(ShiftAmt.getReg(0)); - - return ApertureReg; + return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); } Register QueuePtr = MRI.createGenericVirtualRegister( @@ -1232,19 +1591,15 @@ Register AMDGPULegalizerInfo::getSegmentAperture( // TODO: can we be smarter about machine pointer info? MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, - MachineMemOperand::MOLoad | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant, - 4, - MinAlign(64, StructOffset)); - - Register LoadResult = MRI.createGenericVirtualRegister(S32); + PtrInfo, + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + 4, commonAlignment(Align(64), StructOffset)); + Register LoadAddr; B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); - B.buildLoad(LoadResult, LoadAddr, *MMO); - return LoadResult; + return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); } bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( @@ -1252,8 +1607,6 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( MachineIRBuilder &B) const { MachineFunction &MF = B.getMF(); - B.setInstr(MI); - const LLT S32 = LLT::scalar(32); Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); @@ -1292,7 +1645,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( // extra ptrtoint would be kind of pointless. auto HighAddr = B.buildConstant( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); - B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); + B.buildMerge(Dst, {Src, HighAddr}); MI.eraseFromParent(); return true; } @@ -1305,13 +1658,11 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( auto SegmentNull = B.buildConstant(DstTy, NullVal); auto FlatNull = B.buildConstant(SrcTy, 0); - Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); - // Extract low 32-bits of the pointer. - B.buildExtract(PtrLo32, Src, 0); + auto PtrLo32 = B.buildExtract(DstTy, Src, 0); - Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); - B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); + auto CmpRes = + B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); MI.eraseFromParent(); @@ -1333,21 +1684,16 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( if (!ApertureReg.isValid()) return false; - Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); - B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); - - Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); + auto CmpRes = + B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); // Coerce the type of the low half of the result so we can use merge_values. - Register SrcAsInt = MRI.createGenericVirtualRegister(S32); - B.buildInstr(TargetOpcode::G_PTRTOINT) - .addDef(SrcAsInt) - .addUse(Src); + Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); // TODO: Should we allow mismatched types but matching sizes in merges to // avoid the ptrtoint? 
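// Compact host model of the two non-constant cast directions handled by
// legalizeAddrSpaceCast around this hunk, assuming the usual AMDGPU
// convention that flat null is 0 and local/private null is all-ones, with
// "Aperture" standing in for the 32-bit high half returned by
// getSegmentAperture. Sketch only, not the MIR-building code:
#include <cstdint>

static uint32_t flatToSegment(uint64_t Flat) {
  return Flat == 0 ? ~0u : static_cast<uint32_t>(Flat); // keep the low 32 bits
}

static uint64_t segmentToFlat(uint32_t Seg, uint32_t Aperture) {
  return Seg == ~0u ? 0 : (static_cast<uint64_t>(Aperture) << 32) | Seg;
}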
- B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); - B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); + auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); + B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); MI.eraseFromParent(); return true; @@ -1356,8 +1702,6 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( bool AMDGPULegalizerInfo::legalizeFrint( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); - Register Src = MI.getOperand(1).getReg(); LLT Ty = MRI.getType(Src); assert(Ty.isScalar() && Ty.getSizeInBits() == 64); @@ -1383,7 +1727,6 @@ bool AMDGPULegalizerInfo::legalizeFrint( bool AMDGPULegalizerInfo::legalizeFceil( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); const LLT S1 = LLT::scalar(1); const LLT S64 = LLT::scalar(64); @@ -1395,7 +1738,7 @@ bool AMDGPULegalizerInfo::legalizeFceil( // if (src > 0.0 && src != result) // result += 1.0 - auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); + auto Trunc = B.buildIntrinsicTrunc(S64, Src); const auto Zero = B.buildFConstant(S64, 0.0); const auto One = B.buildFConstant(S64, 1.0); @@ -1428,8 +1771,6 @@ static MachineInstrBuilder extractF64Exponent(unsigned Hi, bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); - const LLT S1 = LLT::scalar(1); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); @@ -1456,7 +1797,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( const auto Zero32 = B.buildConstant(S32, 0); // Extend back to 64-bits. - auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); + auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); auto Shr = B.buildAShr(S64, FractMask, Exp); auto Not = B.buildNot(S64, Shr); @@ -1474,7 +1815,6 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( bool AMDGPULegalizerInfo::legalizeITOFP( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const { - B.setInstr(MI); Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); @@ -1503,10 +1843,44 @@ bool AMDGPULegalizerInfo::legalizeITOFP( return true; } -bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( +// TODO: Copied from DAG implementation. Verify logic and document how this +// actually works. +bool AMDGPULegalizerInfo::legalizeFPTOI( MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B) const { - MachineFunction &MF = B.getMF(); + MachineIRBuilder &B, bool Signed) const { + + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + + const LLT S64 = LLT::scalar(64); + const LLT S32 = LLT::scalar(32); + + assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); + + unsigned Flags = MI.getFlags(); + + auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); + auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); + auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); + + auto Mul = B.buildFMul(S64, Trunc, K0, Flags); + auto FloorMul = B.buildFFloor(S64, Mul, Flags); + auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); + + auto Hi = Signed ? 
+ B.buildFPTOSI(S32, FloorMul) : + B.buildFPTOUI(S32, FloorMul); + auto Lo = B.buildFPTOUI(S32, Fma); + + B.buildMerge(Dst, { Lo, Hi }); + MI.eraseFromParent(); + + return true; +} + +bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, + MachineInstr &MI) const { + MachineFunction &MF = Helper.MIRBuilder.getMF(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || @@ -1520,10 +1894,6 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( if (IsIEEEOp) return true; - MachineIRBuilder HelperBuilder(MI); - GISelObserverWrapper DummyObserver; - LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); - HelperBuilder.setInstr(MI); return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; } @@ -1533,8 +1903,12 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt( // TODO: Should move some of this into LegalizerHelper. // TODO: Promote dynamic indexing of s16 to s32 - // TODO: Dynamic s64 indexing is only legal for SGPR. - Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); + + // FIXME: Artifact combiner probably should have replaced the truncated + // constant before this, so we shouldn't need + // getConstantVRegValWithLookThrough. + Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( + MI.getOperand(2).getReg(), MRI); if (!IdxVal) // Dynamic case will be selected to register indexing. return true; @@ -1545,10 +1919,8 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt( LLT EltTy = VecTy.getElementType(); assert(EltTy == MRI.getType(Dst)); - B.setInstr(MI); - - if (IdxVal.getValue() < VecTy.getNumElements()) - B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); + if (IdxVal->Value < VecTy.getNumElements()) + B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); else B.buildUndef(Dst); @@ -1562,8 +1934,12 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( // TODO: Should move some of this into LegalizerHelper. // TODO: Promote dynamic indexing of s16 to s32 - // TODO: Dynamic s64 indexing is only legal for SGPR. - Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); + + // FIXME: Artifact combiner probably should have replaced the truncated + // constant before this, so we shouldn't need + // getConstantVRegValWithLookThrough. + Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( + MI.getOperand(3).getReg(), MRI); if (!IdxVal) // Dynamic case will be selected to register indexing. 
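// Host-side sketch of the two-constant split used by legalizeFPTOI earlier in
// this hunk (K0 = 2^-32, K1 = -2^32): the value is cut into high and low
// 32-bit halves that each fit an ordinary 32-bit conversion. Unsigned path
// only, assumes a non-negative in-range input; illustrative, not in-tree code:
#include <cmath>
#include <cstdint>

static uint64_t fpToUI64Sketch(double X) {
  double T  = std::trunc(X);
  double Hi = std::floor(T * 0x1p-32);  // upper 32 bits, still as a double
  double Lo = std::fma(Hi, -0x1p32, T); // exact remainder T - Hi * 2^32
  return (static_cast<uint64_t>(static_cast<uint32_t>(Hi)) << 32) |
         static_cast<uint32_t>(Lo);
}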
return true; @@ -1575,10 +1951,8 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( LLT EltTy = VecTy.getElementType(); assert(EltTy == MRI.getType(Ins)); - B.setInstr(MI); - - if (IdxVal.getValue() < VecTy.getNumElements()) - B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); + if (IdxVal->Value < VecTy.getNumElements()) + B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); else B.buildUndef(Dst); @@ -1586,10 +1960,29 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( return true; } +bool AMDGPULegalizerInfo::legalizeShuffleVector( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + const LLT V2S16 = LLT::vector(2, 16); + + Register Dst = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src0); + + if (SrcTy == V2S16 && DstTy == V2S16 && + AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) + return true; + + MachineIRBuilder HelperBuilder(MI); + GISelObserverWrapper DummyObserver; + LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); + return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; +} + bool AMDGPULegalizerInfo::legalizeSinCos( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); @@ -1597,7 +1990,7 @@ bool AMDGPULegalizerInfo::legalizeSinCos( unsigned Flags = MI.getFlags(); Register TrigVal; - auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); + auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); if (ST.hasTrigReducedRange()) { auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) @@ -1615,10 +2008,12 @@ bool AMDGPULegalizerInfo::legalizeSinCos( return true; } -bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( - Register DstReg, LLT PtrTy, - MachineIRBuilder &B, const GlobalValue *GV, - unsigned Offset, unsigned GAFlags) const { +bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, + MachineIRBuilder &B, + const GlobalValue *GV, + int64_t Offset, + unsigned GAFlags) const { + assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered // to the following code sequence: // @@ -1681,19 +2076,37 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( const GlobalValue *GV = MI.getOperand(1).getGlobal(); MachineFunction &MF = B.getMF(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - B.setInstr(MI); if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { if (!MFI->isEntryFunction()) { const Function &Fn = MF.getFunction(); DiagnosticInfoUnsupported BadLDSDecl( - Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); + Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), + DS_Warning); Fn.getContext().diagnose(BadLDSDecl); + + // We currently don't have a way to correctly allocate LDS objects that + // aren't directly associated with a kernel. We do force inlining of + // functions that use local objects. However, if these dead functions are + // not eliminated, we don't want a compile time error. Just emit a warning + // and a trap, since there should be no callable path here. 
+ B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); + B.buildUndef(DstReg); + MI.eraseFromParent(); + return true; } // TODO: We could emit code to handle the initialization somewhere. if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { - B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); + const SITargetLowering *TLI = ST.getTargetLowering(); + if (!TLI->shouldUseLDSConstAddress(GV)) { + MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); + return true; // Leave in place; + } + + B.buildConstant( + DstReg, + MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); MI.eraseFromParent(); return true; } @@ -1723,10 +2136,10 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); MachineMemOperand *GOTMMO = MF.getMachineMemOperand( - MachinePointerInfo::getGOT(MF), - MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant, - 8 /*Size*/, 8 /*Align*/); + MachinePointerInfo::getGOT(MF), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + 8 /*Size*/, Align(8)); buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); @@ -1744,7 +2157,6 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( bool AMDGPULegalizerInfo::legalizeLoad( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, GISelChangeObserver &Observer) const { - B.setInstr(MI); LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); Observer.changingInstr(MI); @@ -1763,16 +2175,15 @@ bool AMDGPULegalizerInfo::legalizeFMad( const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // TODO: Always legal with future ftz flag. - if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals) + // FIXME: Do we need just output? 
+ if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) return true; - if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals) + if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) return true; - MachineIRBuilder HelperBuilder(MI); GISelObserverWrapper DummyObserver; LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); - HelperBuilder.setMBB(*MI.getParent()); return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; } @@ -1790,7 +2201,6 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( LLT ValTy = MRI.getType(CmpVal); LLT VecTy = LLT::vector(2, ValTy); - B.setInstr(MI); Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) @@ -1803,39 +2213,248 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( return true; } +bool AMDGPULegalizerInfo::legalizeFlog( + MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + LLT Ty = B.getMRI()->getType(Dst); + unsigned Flags = MI.getFlags(); + + auto Log2Operand = B.buildFLog2(Ty, Src, Flags); + auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); + + B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, + MachineIRBuilder &B) const { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + unsigned Flags = MI.getFlags(); + LLT Ty = B.getMRI()->getType(Dst); + + auto K = B.buildFConstant(Ty, numbers::log2e); + auto Mul = B.buildFMul(Ty, Src, K, Flags); + B.buildFExp2(Dst, Mul, Flags); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, + MachineIRBuilder &B) const { + Register Dst = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); + unsigned Flags = MI.getFlags(); + LLT Ty = B.getMRI()->getType(Dst); + const LLT S16 = LLT::scalar(16); + const LLT S32 = LLT::scalar(32); + + if (Ty == S32) { + auto Log = B.buildFLog2(S32, Src0, Flags); + auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) + .addUse(Log.getReg(0)) + .addUse(Src1) + .setMIFlags(Flags); + B.buildFExp2(Dst, Mul, Flags); + } else if (Ty == S16) { + // There's no f16 fmul_legacy, so we need to convert for it. + auto Log = B.buildFLog2(S16, Src0, Flags); + auto Ext0 = B.buildFPExt(S32, Log, Flags); + auto Ext1 = B.buildFPExt(S32, Src1, Flags); + auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) + .addUse(Ext0.getReg(0)) + .addUse(Ext1.getReg(0)) + .setMIFlags(Flags); + + B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); + } else + return false; + + MI.eraseFromParent(); + return true; +} + +// Find a source register, ignoring any possible source modifiers. 
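// The legalizeFlog / legalizeFExp / legalizeFPow lowerings above are straight
// base-change identities; a host-side restatement (sketch, valid for ordinary
// positive finite inputs, ignoring the fmul_legacy edge-case behavior):
//   log(x)   = log2(x) * ln(2)
//   log10(x) = log2(x) * (ln(2) / ln(10))
//   exp(x)   = exp2(x * log2(e))
//   pow(x,y) = exp2(y * log2(x))
#include <cmath>

static float powViaExp2Log2(float X, float Y) {
  return std::exp2f(Y * std::log2f(X));
}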
+static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { + Register ModSrc = OrigSrc; + if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { + ModSrc = SrcFNeg->getOperand(1).getReg(); + if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) + ModSrc = SrcFAbs->getOperand(1).getReg(); + } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) + ModSrc = SrcFAbs->getOperand(1).getReg(); + return ModSrc; +} + +bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + + const LLT S1 = LLT::scalar(1); + const LLT S64 = LLT::scalar(64); + Register Dst = MI.getOperand(0).getReg(); + Register OrigSrc = MI.getOperand(1).getReg(); + unsigned Flags = MI.getFlags(); + assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && + "this should not have been custom lowered"); + + // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) + // is used instead. However, SI doesn't have V_FLOOR_F64, so the most + // efficient way to implement it is using V_FRACT_F64. The workaround for the + // V_FRACT bug is: + // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) + // + // Convert floor(x) to (x - fract(x)) + + auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) + .addUse(OrigSrc) + .setMIFlags(Flags); + + // Give source modifier matching some assistance before obscuring a foldable + // pattern. + + // TODO: We can avoid the neg on the fract? The input sign to fract + // shouldn't matter? + Register ModSrc = stripAnySourceMods(OrigSrc, MRI); + + auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); + + Register Min = MRI.createGenericVirtualRegister(S64); + + // We don't need to concern ourselves with the snan handling difference, so + // use the one which will directly select. + const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + if (MFI->getMode().IEEE) + B.buildFMinNumIEEE(Min, Fract, Const, Flags); + else + B.buildFMinNum(Min, Fract, Const, Flags); + + Register CorrectedFract = Min; + if (!MI.getFlag(MachineInstr::FmNoNans)) { + auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); + CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); + } + + auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); + B.buildFAdd(Dst, OrigSrc, NegFract, Flags); + + MI.eraseFromParent(); + return true; +} + +// Turn an illegal packed v2s16 build vector into bit operations. +// TODO: This should probably be a bitcast action in LegalizerHelper. +bool AMDGPULegalizerInfo::legalizeBuildVector( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { + Register Dst = MI.getOperand(0).getReg(); + const LLT S32 = LLT::scalar(32); + assert(MRI.getType(Dst) == LLT::vector(2, 16)); + + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); + assert(MRI.getType(Src0) == LLT::scalar(16)); + + auto Merge = B.buildMerge(S32, {Src0, Src1}); + B.buildBitcast(Dst, Merge); + + MI.eraseFromParent(); + return true; +} + // Return the use branch instruction, otherwise null if the usage is invalid. 
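// Illustrative sketch, not part of the diff above: a scalar model of the f64
// G_FFLOOR expansion in legalizeFFloor. llvm.amdgcn.fract is modeled here as
// x - floor(x), so this is only a check of the identity the lowering relies
// on: floor(x) = x - fract(x), with fract clamped to the largest double below
// 1.0 (the 0x3fefffffffffffff constant) and NaN inputs passed through.
#include <cmath>

static double fractModel(double X) {        // stand-in for llvm.amdgcn.fract
  return X - std::floor(X);
}

static double floorLowered(double X) {
  double Min = std::fmin(fractModel(X), 0x1.fffffffffffffp-1);
  double CorrectedFract = std::isnan(X) ? X : Min;
  return X - CorrectedFract;                // emitted as fneg + fadd above
}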
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineInstr *&Br) { + MachineInstr *&Br, + MachineBasicBlock *&UncondBrTarget) { Register CondDef = MI.getOperand(0).getReg(); if (!MRI.hasOneNonDBGUse(CondDef)) return nullptr; + MachineBasicBlock *Parent = MI.getParent(); MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); - if (UseMI.getParent() != MI.getParent() || + if (UseMI.getParent() != Parent || UseMI.getOpcode() != AMDGPU::G_BRCOND) return nullptr; - // Make sure the cond br is followed by a G_BR + // Make sure the cond br is followed by a G_BR, or is the last instruction. MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); - if (Next != MI.getParent()->end()) { + if (Next == Parent->end()) { + MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); + if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. + return nullptr; + UncondBrTarget = &*NextMBB; + } else { if (Next->getOpcode() != AMDGPU::G_BR) return nullptr; Br = &*Next; + UncondBrTarget = Br->getOperand(0).getMBB(); } return &UseMI; } -Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, - Register Reg, LLT Ty) const { - Register LiveIn = MRI.getLiveInVirtReg(Reg); - if (LiveIn) +Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, + MachineRegisterInfo &MRI, + Register LiveIn, + Register PhyReg) const { + assert(PhyReg.isPhysical() && "Physical register expected"); + + // Insert the live-in copy, if required, by defining destination virtual + // register. + // FIXME: It seems EmitLiveInCopies isn't called anywhere? + if (!MRI.getVRegDef(LiveIn)) { + // FIXME: Should have scoped insert pt + MachineBasicBlock &OrigInsBB = B.getMBB(); + auto OrigInsPt = B.getInsertPt(); + + MachineBasicBlock &EntryMBB = B.getMF().front(); + EntryMBB.addLiveIn(PhyReg); + B.setInsertPt(EntryMBB, EntryMBB.begin()); + B.buildCopy(LiveIn, PhyReg); + + B.setInsertPt(OrigInsBB, OrigInsPt); + } + + return LiveIn; +} + +Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, + MachineRegisterInfo &MRI, + Register PhyReg, LLT Ty, + bool InsertLiveInCopy) const { + assert(PhyReg.isPhysical() && "Physical register expected"); + + // Get or create virtual live-in regester + Register LiveIn = MRI.getLiveInVirtReg(PhyReg); + if (!LiveIn) { + LiveIn = MRI.createGenericVirtualRegister(Ty); + MRI.addLiveIn(PhyReg, LiveIn); + } + + // When the actual true copy required is from virtual register to physical + // register (to be inserted later), live-in copy insertion from physical + // to register virtual register is not required + if (!InsertLiveInCopy) return LiveIn; - Register NewReg = MRI.createGenericVirtualRegister(Ty); - MRI.addLiveIn(Reg, NewReg); - return NewReg; + return insertLiveInCopy(B, MRI, LiveIn, PhyReg); +} + +const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( + MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { + const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + const ArgDescriptor *Arg; + const TargetRegisterClass *RC; + LLT ArgTy; + std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType); + if (!Arg) { + LLVM_DEBUG(dbgs() << "Required arg register missing\n"); + return nullptr; + } + return Arg; } bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, @@ -1843,12 +2462,14 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, if (!Arg->isRegister() || !Arg->getRegister().isValid()) return 
false; // TODO: Handle these - assert(Arg->getRegister().isPhysical()); + Register SrcReg = Arg->getRegister(); + assert(SrcReg.isPhysical() && "Physical register expected"); + assert(DstReg.isVirtual() && "Virtual register expected"); MachineRegisterInfo &MRI = *B.getMRI(); LLT Ty = MRI.getType(DstReg); - Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); + Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); if (Arg->isMasked()) { // TODO: Should we try to emit this once in the entry block? @@ -1864,56 +2485,31 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, } B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); - } else + } else { B.buildCopy(DstReg, LiveIn); - - // Insert the argument copy if it doens't already exist. - // FIXME: It seems EmitLiveInCopies isn't called anywhere? - if (!MRI.getVRegDef(LiveIn)) { - // FIXME: Should have scoped insert pt - MachineBasicBlock &OrigInsBB = B.getMBB(); - auto OrigInsPt = B.getInsertPt(); - - MachineBasicBlock &EntryMBB = B.getMF().front(); - EntryMBB.addLiveIn(Arg->getRegister()); - B.setInsertPt(EntryMBB, EntryMBB.begin()); - B.buildCopy(LiveIn, Arg->getRegister()); - - B.setInsertPt(OrigInsBB, OrigInsPt); } return true; } bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( - MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B, - AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { - B.setInstr(MI); - - const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, + AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { - const ArgDescriptor *Arg; - const TargetRegisterClass *RC; - std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); - if (!Arg) { - LLVM_DEBUG(dbgs() << "Required arg register missing\n"); + const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); + if (!Arg) return false; - } - if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { - MI.eraseFromParent(); - return true; - } + if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) + return false; - return false; + MI.eraseFromParent(); + return true; } bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); LLT S16 = LLT::scalar(16); @@ -1933,6 +2529,284 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, return false; } +void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, + Register DstReg, + Register X, + Register Y, + bool IsDiv) const { + const LLT S1 = LLT::scalar(1); + const LLT S32 = LLT::scalar(32); + + // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the + // algorithm used here. + + // Initial estimate of inv(y). + auto FloatY = B.buildUITOFP(S32, Y); + auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); + auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); + auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); + auto Z = B.buildFPTOUI(S32, ScaledY); + + // One round of UNR. + auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); + auto NegYZ = B.buildMul(S32, NegY, Z); + Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); + + // Quotient/remainder estimate. + auto Q = B.buildUMulH(S32, X, Z); + auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); + + // First quotient/remainder refinement. 
+ auto One = B.buildConstant(S32, 1); + auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); + if (IsDiv) + Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); + R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); + + // Second quotient/remainder refinement. + Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); + if (IsDiv) + B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); + else + B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); +} + +bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; + Register DstReg = MI.getOperand(0).getReg(); + Register Num = MI.getOperand(1).getReg(); + Register Den = MI.getOperand(2).getReg(); + legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); + MI.eraseFromParent(); + return true; +} + +// Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 +// +// Return lo, hi of result +// +// %cvt.lo = G_UITOFP Val.lo +// %cvt.hi = G_UITOFP Val.hi +// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo +// %rcp = G_AMDGPU_RCP_IFLAG %mad +// %mul1 = G_FMUL %rcp, 0x5f7ffffc +// %mul2 = G_FMUL %mul1, 2**(-32) +// %trunc = G_INTRINSIC_TRUNC %mul2 +// %mad2 = G_FMAD %trunc, -(2**32), %mul1 +// return {G_FPTOUI %mad2, G_FPTOUI %trunc} +static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, + Register Val) { + const LLT S32 = LLT::scalar(32); + auto Unmerge = B.buildUnmerge(S32, Val); + + auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); + auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); + + auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 + B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); + + auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); + auto Mul1 = + B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); + + // 2**(-32) + auto Mul2 = + B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); + auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); + + // -(2**32) + auto Mad2 = B.buildFMAD(S32, Trunc, + B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); + + auto ResultLo = B.buildFPTOUI(S32, Mad2); + auto ResultHi = B.buildFPTOUI(S32, Trunc); + + return {ResultLo.getReg(0), ResultHi.getReg(0)}; +} + +void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, + Register DstReg, + Register Numer, + Register Denom, + bool IsDiv) const { + const LLT S32 = LLT::scalar(32); + const LLT S64 = LLT::scalar(64); + const LLT S1 = LLT::scalar(1); + Register RcpLo, RcpHi; + + std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); + + auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); + + auto Zero64 = B.buildConstant(S64, 0); + auto NegDenom = B.buildSub(S64, Zero64, Denom); + + auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); + auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); + + auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); + Register MulHi1_Lo = UnmergeMulHi1.getReg(0); + Register MulHi1_Hi = UnmergeMulHi1.getReg(1); + + auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); + auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); + auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); + auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); + + auto MulLo2 = B.buildMul(S64, NegDenom, Add1); + auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); + auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); + Register MulHi2_Lo = UnmergeMulHi2.getReg(0); + Register MulHi2_Hi = UnmergeMulHi2.getReg(1); + + auto Zero32 = B.buildConstant(S32, 0); + auto Add2_Lo = 
B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); + auto Add2_HiC = + B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); + auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); + auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); + + auto UnmergeNumer = B.buildUnmerge(S32, Numer); + Register NumerLo = UnmergeNumer.getReg(0); + Register NumerHi = UnmergeNumer.getReg(1); + + auto MulHi3 = B.buildUMulH(S64, Numer, Add2); + auto Mul3 = B.buildMul(S64, Denom, MulHi3); + auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); + Register Mul3_Lo = UnmergeMul3.getReg(0); + Register Mul3_Hi = UnmergeMul3.getReg(1); + auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); + auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); + auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); + auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); + + auto UnmergeDenom = B.buildUnmerge(S32, Denom); + Register DenomLo = UnmergeDenom.getReg(0); + Register DenomHi = UnmergeDenom.getReg(1); + + auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); + auto C1 = B.buildSExt(S32, CmpHi); + + auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); + auto C2 = B.buildSExt(S32, CmpLo); + + auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); + auto C3 = B.buildSelect(S32, CmpEq, C2, C1); + + // TODO: Here and below portions of the code can be enclosed into if/endif. + // Currently control flow is unconditional and we have 4 selects after + // potential endif to substitute PHIs. + + // if C3 != 0 ... + auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); + auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); + auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); + auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); + + auto One64 = B.buildConstant(S64, 1); + auto Add3 = B.buildAdd(S64, MulHi3, One64); + + auto C4 = + B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); + auto C5 = + B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); + auto C6 = B.buildSelect( + S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); + + // if (C6 != 0) + auto Add4 = B.buildAdd(S64, Add3, One64); + auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); + + auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); + auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); + auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); + + // endif C6 + // endif C3 + + if (IsDiv) { + auto Sel1 = B.buildSelect( + S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); + B.buildSelect(DstReg, + B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); + } else { + auto Sel2 = B.buildSelect( + S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); + B.buildSelect(DstReg, + B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); + } +} + +bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + const LLT S64 = LLT::scalar(64); + const LLT S32 = LLT::scalar(32); + const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; + Register DstReg = MI.getOperand(0).getReg(); + Register Num = MI.getOperand(1).getReg(); + Register Den = MI.getOperand(2).getReg(); + LLT Ty = MRI.getType(DstReg); + + if (Ty == S32) + legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); + else if (Ty == S64) + legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); + else + return false; + + 
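// Illustrative sketch, not part of the diff above: a standalone scalar model
// of the 32-bit unsigned expansion built in legalizeUDIV_UREM32Impl (float
// reciprocal estimate, one Newton-Raphson step in integer arithmetic, then at
// most two correction steps). Plain float division stands in for
// G_AMDGPU_RCP_IFLAG; this mirrors the structure, it is not a proof of the
// error bounds.
#include <cstdint>

static uint32_t umulh32(uint32_t A, uint32_t B) {
  return static_cast<uint32_t>((static_cast<uint64_t>(A) * B) >> 32);
}

// Returns the quotient and writes the remainder; Y must be nonzero.
static uint32_t udiv32Expanded(uint32_t X, uint32_t Y, uint32_t &Rem) {
  // Initial estimate of 2^32 / Y; 0x4f7ffffe is 2^32 * (1 - 2^-23) as a
  // float, slightly below 2^32 so the estimate errs low.
  float Rcp = 1.0f / static_cast<float>(Y);
  uint32_t Z = static_cast<uint32_t>(Rcp * 0x1.fffffcp+31f);

  // One Newton-Raphson refinement: Z += umulh(Z, (0 - Y) * Z).
  Z += umulh32(Z, (0u - Y) * Z);

  // Quotient/remainder estimate, then the two conditional corrections that
  // the selects above implement.
  uint32_t Q = umulh32(X, Z);
  uint32_t R = X - Q * Y;
  for (int I = 0; I != 2; ++I) {
    if (R >= Y) {
      ++Q;
      R -= Y;
    }
  }
  Rem = R;
  return Q;
}

// The signed variants (legalizeSDIV_SREM below) reduce to the unsigned
// expansion by conditionally negating the operands and then restoring the
// result sign, mirroring the ashr/add/xor ... xor/sub sequence in the patch.
static int32_t sdiv32ViaUdiv(int32_t X, int32_t Y) {
  uint32_t XSign = X < 0 ? ~0u : 0u;        // the G_ASHR by 31
  uint32_t YSign = Y < 0 ? ~0u : 0u;
  uint32_t AbsX = (static_cast<uint32_t>(X) + XSign) ^ XSign;
  uint32_t AbsY = (static_cast<uint32_t>(Y) + YSign) ^ YSign;
  uint32_t Rem;
  uint32_t Q = udiv32Expanded(AbsX, AbsY, Rem);
  uint32_t QSign = XSign ^ YSign;           // a remainder would use XSign
  return static_cast<int32_t>((Q ^ QSign) - QSign);
}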
MI.eraseFromParent(); + return true; + +} + +bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + const LLT S64 = LLT::scalar(64); + const LLT S32 = LLT::scalar(32); + + Register DstReg = MI.getOperand(0).getReg(); + const LLT Ty = MRI.getType(DstReg); + if (Ty != S32 && Ty != S64) + return false; + + const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; + + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + + auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); + auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); + auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); + + LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); + RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); + + LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); + RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); + + Register UDivRem = MRI.createGenericVirtualRegister(Ty); + if (Ty == S32) + legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); + else + legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); + + Register Sign; + if (IsDiv) + Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); + else + Sign = LHSign.getReg(0); // Remainder sign is the same as LHS + + UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); + B.buildSub(DstReg, UDivRem, Sign); + + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -1954,7 +2828,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, return false; if (!Unsafe && ResTy == S32 && - MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals) + MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) return false; if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { @@ -1997,7 +2871,6 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); @@ -2035,15 +2908,13 @@ static void toggleSPDenormMode(bool Enable, AMDGPU::SIModeRegisterDefaults Mode) { // Set SP denorm mode to this value. unsigned SPDenormMode = - Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); if (ST.hasDenormModeInst()) { // Preserve default FP64FP16 denorm mode while updating FP32 mode. - unsigned DPDenormModeDefault = Mode.FP64FP16Denormals - ? 
FP_DENORM_FLUSH_NONE - : FP_DENORM_FLUSH_IN_FLUSH_OUT; + uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); - unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); + uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); B.buildInstr(AMDGPU::S_DENORM_MODE) .addImm(NewDenormModeValue); @@ -2062,7 +2933,6 @@ static void toggleSPDenormMode(bool Enable, bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); @@ -2078,15 +2948,15 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, auto DenominatorScaled = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) - .addUse(RHS) .addUse(LHS) - .addImm(1) + .addUse(RHS) + .addImm(0) .setMIFlags(Flags); auto NumeratorScaled = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) .addUse(LHS) .addUse(RHS) - .addImm(0) + .addImm(1) .setMIFlags(Flags); auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) @@ -2096,7 +2966,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, // FIXME: Doesn't correctly model the FP mode switch, and the FP operations // aren't modeled as reading it. - if (!Mode.FP32Denormals) + if (!Mode.allFP32Denormals()) toggleSPDenormMode(true, B, ST, Mode); auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); @@ -2106,7 +2976,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); - if (!Mode.FP32Denormals) + if (!Mode.allFP32Denormals()) toggleSPDenormMode(false, B, ST, Mode); auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) @@ -2129,7 +2999,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); @@ -2144,7 +3013,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) .addUse(LHS) .addUse(RHS) - .addImm(1) + .addImm(0) .setMIFlags(Flags); auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); @@ -2160,11 +3029,11 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) .addUse(LHS) .addUse(RHS) - .addImm(0) + .addImm(1) .setMIFlags(Flags); auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); - auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags); + auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); Register Scale; @@ -2172,8 +3041,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, // Workaround a hardware bug on SI where the condition output from div_scale // is not usable. 
- Scale = MRI.createGenericVirtualRegister(S1); - LLT S32 = LLT::scalar(32); auto NumUnmerge = B.buildUnmerge(S32, LHS); @@ -2185,7 +3052,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, Scale1Unmerge.getReg(1)); auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), Scale0Unmerge.getReg(1)); - B.buildXor(Scale, CmpNum, CmpDen); + Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); } else { Scale = DivScale1.getReg(1); } @@ -2210,7 +3077,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(2).getReg(); Register RHS = MI.getOperand(3).getReg(); @@ -2252,8 +3118,6 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); } - B.setInstr(MI); - uint64_t Offset = ST.getTargetLowering()->getImplicitParameterOffset( B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); @@ -2263,8 +3127,9 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, const ArgDescriptor *Arg; const TargetRegisterClass *RC; - std::tie(Arg, RC) - = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); + LLT ArgTy; + std::tie(Arg, RC, ArgTy) = + MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); if (!Arg) return false; @@ -2281,7 +3146,6 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const { - B.setInstr(MI); Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); @@ -2289,6 +3153,55 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, return true; } +// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: +// offset (the offset that is included in bounds checking and swizzling, to be +// split between the instruction's voffset and immoffset fields) and soffset +// (the offset that is excluded from bounds checking and swizzling, to go in +// the instruction's soffset field). This function takes the first kind of +// offset and figures out how to split it between voffset and immoffset. +std::tuple<Register, unsigned, unsigned> +AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, + Register OrigOffset) const { + const unsigned MaxImm = 4095; + Register BaseReg; + unsigned TotalConstOffset; + MachineInstr *OffsetDef; + const LLT S32 = LLT::scalar(32); + + std::tie(BaseReg, TotalConstOffset, OffsetDef) + = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); + + unsigned ImmOffset = TotalConstOffset; + + // If the immediate value is too big for the immoffset field, put the value + // and -4096 into the immoffset field so that the value that is copied/added + // for the voffset field is a multiple of 4096, and it stands more chance + // of being CSEd with the copy/add for another similar load/store. + // However, do not do that rounding down to a multiple of 4096 if that is a + // negative number, as it appears to be illegal to have a negative offset + // in the vgpr, even if adding the immediate offset makes it positive. 
+ unsigned Overflow = ImmOffset & ~MaxImm; + ImmOffset -= Overflow; + if ((int32_t)Overflow < 0) { + Overflow += ImmOffset; + ImmOffset = 0; + } + + if (Overflow != 0) { + if (!BaseReg) { + BaseReg = B.buildConstant(S32, Overflow).getReg(0); + } else { + auto OverflowVal = B.buildConstant(S32, Overflow); + BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); + } + } + + if (!BaseReg) + BaseReg = B.buildConstant(S32, 0).getReg(0); + + return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); +} + /// Handle register layout difference for f16 images for some subtargets. Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, @@ -2312,75 +3225,969 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); } -bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B, - bool IsFormat) const { - // TODO: Reject f16 format on targets where unsupported. - Register VData = MI.getOperand(1).getReg(); - LLT Ty = MRI.getType(VData); +Register AMDGPULegalizerInfo::fixStoreSourceType( + MachineIRBuilder &B, Register VData, bool IsFormat) const { + MachineRegisterInfo *MRI = B.getMRI(); + LLT Ty = MRI->getType(VData); - B.setInstr(MI); - - const LLT S32 = LLT::scalar(32); const LLT S16 = LLT::scalar(16); // Fixup illegal register types for i8 stores. if (Ty == LLT::scalar(8) || Ty == S16) { Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); - MI.getOperand(1).setReg(AnyExt); - return true; + return AnyExt; } if (Ty.isVector()) { if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { if (IsFormat) - MI.getOperand(1).setReg(handleD16VData(B, MRI, VData)); + return handleD16VData(B, *MRI, VData); + } + } + + return VData; +} + +bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + bool IsTyped, + bool IsFormat) const { + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(VData); + LLT EltTy = Ty.getScalarType(); + const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); + const LLT S32 = LLT::scalar(32); + + VData = fixStoreSourceType(B, VData, IsFormat); + Register RSrc = MI.getOperand(2).getReg(); + + MachineMemOperand *MMO = *MI.memoperands_begin(); + const int MemSize = MMO->getSize(); + + unsigned ImmOffset; + unsigned TotalOffset; + + // The typed intrinsics add an immediate after the registers. + const unsigned NumVIndexOps = IsTyped ? 8 : 7; + + // The struct intrinsic variants add one additional operand over raw. + const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; + Register VIndex; + int OpOffset = 0; + if (HasVIndex) { + VIndex = MI.getOperand(3).getReg(); + OpOffset = 1; + } + + Register VOffset = MI.getOperand(3 + OpOffset).getReg(); + Register SOffset = MI.getOperand(4 + OpOffset).getReg(); + + unsigned Format = 0; + if (IsTyped) { + Format = MI.getOperand(5 + OpOffset).getImm(); + ++OpOffset; + } + + unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); + + std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); + if (TotalOffset != 0) + MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); + + unsigned Opc; + if (IsTyped) { + Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : + AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; + } else if (IsFormat) { + Opc = IsD16 ? 
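// Illustrative sketch, not part of the diff above: the splitBufferOffsets
// rule just implemented, as a standalone helper (the real function also
// returns the total constant offset and folds in any base register). MaxImm
// is the 12-bit instruction immediate, so for example:
//   offset 5000 -> add 4096 to voffset, immoffset = 904
//   offset 4095 -> voffset unchanged,   immoffset = 4095
//   offset -8   -> add -8 to voffset,   immoffset = 0  (negative voffset
//                  parts are not rounded down to a multiple of 4096)
#include <cstdint>
#include <utility>

// Returns {value to add to voffset, immoffset}.
static std::pair<uint32_t, uint32_t> splitBufferOffsetModel(uint32_t Offset) {
  const uint32_t MaxImm = 4095;
  uint32_t ImmOffset = Offset & MaxImm;
  uint32_t Overflow = Offset & ~MaxImm;
  if (static_cast<int32_t>(Overflow) < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset};
}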
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : + AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; + } else { + switch (MemSize) { + case 1: + Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; + break; + case 2: + Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; + break; + default: + Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; + break; + } + } + + if (!VIndex) + VIndex = B.buildConstant(S32, 0).getReg(0); + + auto MIB = B.buildInstr(Opc) + .addUse(VData) // vdata + .addUse(RSrc) // rsrc + .addUse(VIndex) // vindex + .addUse(VOffset) // voffset + .addUse(SOffset) // soffset + .addImm(ImmOffset); // offset(imm) + + if (IsTyped) + MIB.addImm(Format); + + MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) + .addImm(HasVIndex ? -1 : 0) // idxen(imm) + .addMemOperand(MMO); + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + bool IsFormat, + bool IsTyped) const { + // FIXME: Verifier should enforce 1 MMO for these intrinsics. + MachineMemOperand *MMO = *MI.memoperands_begin(); + const int MemSize = MMO->getSize(); + const LLT S32 = LLT::scalar(32); + + Register Dst = MI.getOperand(0).getReg(); + Register RSrc = MI.getOperand(2).getReg(); + + // The typed intrinsics add an immediate after the registers. + const unsigned NumVIndexOps = IsTyped ? 8 : 7; + + // The struct intrinsic variants add one additional operand over raw. + const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; + Register VIndex; + int OpOffset = 0; + if (HasVIndex) { + VIndex = MI.getOperand(3).getReg(); + OpOffset = 1; + } + + Register VOffset = MI.getOperand(3 + OpOffset).getReg(); + Register SOffset = MI.getOperand(4 + OpOffset).getReg(); + + unsigned Format = 0; + if (IsTyped) { + Format = MI.getOperand(5 + OpOffset).getImm(); + ++OpOffset; + } + + unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); + unsigned ImmOffset; + unsigned TotalOffset; + + LLT Ty = MRI.getType(Dst); + LLT EltTy = Ty.getScalarType(); + const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); + const bool Unpacked = ST.hasUnpackedD16VMem(); + + std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); + if (TotalOffset != 0) + MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); + + unsigned Opc; + + if (IsTyped) { + Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : + AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; + } else if (IsFormat) { + Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : + AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; + } else { + switch (MemSize) { + case 1: + Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; + break; + case 2: + Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; + break; + default: + Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; + break; + } + } + + Register LoadDstReg; + + bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); + LLT UnpackedTy = Ty.changeElementSize(32); + + if (IsExtLoad) + LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); + else if (Unpacked && IsD16 && Ty.isVector()) + LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); + else + LoadDstReg = Dst; + + if (!VIndex) + VIndex = B.buildConstant(S32, 0).getReg(0); + + auto MIB = B.buildInstr(Opc) + .addDef(LoadDstReg) // vdata + .addUse(RSrc) // rsrc + .addUse(VIndex) // vindex + .addUse(VOffset) // voffset + .addUse(SOffset) // soffset + .addImm(ImmOffset); // offset(imm) + + if (IsTyped) + MIB.addImm(Format); + + MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) + .addImm(HasVIndex ? 
-1 : 0) // idxen(imm) + .addMemOperand(MMO); + + if (LoadDstReg != Dst) { + B.setInsertPt(B.getMBB(), ++B.getInsertPt()); + + // Widen result for extending loads was widened. + if (IsExtLoad) + B.buildTrunc(Dst, LoadDstReg); + else { + // Repack to original 16-bit vector result + // FIXME: G_TRUNC should work, but legalization currently fails + auto Unmerge = B.buildUnmerge(S32, LoadDstReg); + SmallVector<Register, 4> Repack; + for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) + Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); + B.buildMerge(Dst, Repack); + } + } + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, + MachineIRBuilder &B, + bool IsInc) const { + unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : + AMDGPU::G_AMDGPU_ATOMIC_DEC; + B.buildInstr(Opc) + .addDef(MI.getOperand(0).getReg()) + .addUse(MI.getOperand(2).getReg()) + .addUse(MI.getOperand(3).getReg()) + .cloneMemRefs(MI); + MI.eraseFromParent(); + return true; +} + +static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { + switch (IntrID) { + case Intrinsic::amdgcn_raw_buffer_atomic_swap: + case Intrinsic::amdgcn_struct_buffer_atomic_swap: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; + case Intrinsic::amdgcn_raw_buffer_atomic_add: + case Intrinsic::amdgcn_struct_buffer_atomic_add: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; + case Intrinsic::amdgcn_raw_buffer_atomic_sub: + case Intrinsic::amdgcn_struct_buffer_atomic_sub: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; + case Intrinsic::amdgcn_raw_buffer_atomic_smin: + case Intrinsic::amdgcn_struct_buffer_atomic_smin: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; + case Intrinsic::amdgcn_raw_buffer_atomic_umin: + case Intrinsic::amdgcn_struct_buffer_atomic_umin: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; + case Intrinsic::amdgcn_raw_buffer_atomic_smax: + case Intrinsic::amdgcn_struct_buffer_atomic_smax: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; + case Intrinsic::amdgcn_raw_buffer_atomic_umax: + case Intrinsic::amdgcn_struct_buffer_atomic_umax: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; + case Intrinsic::amdgcn_raw_buffer_atomic_and: + case Intrinsic::amdgcn_struct_buffer_atomic_and: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; + case Intrinsic::amdgcn_raw_buffer_atomic_or: + case Intrinsic::amdgcn_struct_buffer_atomic_or: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; + case Intrinsic::amdgcn_raw_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_buffer_atomic_xor: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; + case Intrinsic::amdgcn_raw_buffer_atomic_inc: + case Intrinsic::amdgcn_struct_buffer_atomic_inc: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; + case Intrinsic::amdgcn_raw_buffer_atomic_dec: + case Intrinsic::amdgcn_struct_buffer_atomic_dec: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; + case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; + default: + llvm_unreachable("unhandled atomic opcode"); + } +} + +bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, + MachineIRBuilder &B, + Intrinsic::ID IID) const { + const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || + IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; + + Register Dst = MI.getOperand(0).getReg(); + Register VData = MI.getOperand(2).getReg(); + + Register CmpVal; + int OpOffset = 0; + + if (IsCmpSwap) { + CmpVal = MI.getOperand(3 + OpOffset).getReg(); + 
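// Illustrative sketch, not part of the diff above: on subtargets with
// unpacked d16 memory operations, each 16-bit element of the load result
// comes back in its own 32-bit register; the unmerge/trunc/merge sequence in
// legalizeBufferLoad repacks that into the packed <N x s16> value the IR
// expects. A scalar model of that repack:
#include <cstdint>
#include <vector>

static std::vector<uint16_t>
repackUnpackedD16(const std::vector<uint32_t> &Unpacked) {
  std::vector<uint16_t> Packed;
  Packed.reserve(Unpacked.size());
  for (uint32_t Dword : Unpacked)                    // G_UNMERGE_VALUES
    Packed.push_back(static_cast<uint16_t>(Dword));  // G_TRUNC per dword
  return Packed;                                     // rebuilt as one vector
}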
++OpOffset; + } + + Register RSrc = MI.getOperand(3 + OpOffset).getReg(); + const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; + + // The struct intrinsic variants add one additional operand over raw. + const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; + Register VIndex; + if (HasVIndex) { + VIndex = MI.getOperand(4 + OpOffset).getReg(); + ++OpOffset; + } + + Register VOffset = MI.getOperand(4 + OpOffset).getReg(); + Register SOffset = MI.getOperand(5 + OpOffset).getReg(); + unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); + + MachineMemOperand *MMO = *MI.memoperands_begin(); + + unsigned ImmOffset; + unsigned TotalOffset; + std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); + if (TotalOffset != 0) + MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); + + if (!VIndex) + VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); + + auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) + .addDef(Dst) + .addUse(VData); // vdata + + if (IsCmpSwap) + MIB.addReg(CmpVal); + + MIB.addUse(RSrc) // rsrc + .addUse(VIndex) // vindex + .addUse(VOffset) // voffset + .addUse(SOffset) // soffset + .addImm(ImmOffset) // offset(imm) + .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) + .addImm(HasVIndex ? -1 : 0) // idxen(imm) + .addMemOperand(MMO); + + MI.eraseFromParent(); + return true; +} + +/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized +/// vector with s16 typed elements. +static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, + SmallVectorImpl<Register> &PackedAddrs, + int AddrIdx, int DimIdx, int EndIdx, + int NumGradients) { + const LLT S16 = LLT::scalar(16); + const LLT V2S16 = LLT::vector(2, 16); + + for (int I = AddrIdx; I < EndIdx; ++I) { + MachineOperand &SrcOp = MI.getOperand(I); + if (!SrcOp.isReg()) + continue; // _L to _LZ may have eliminated this. + + Register AddrReg = SrcOp.getReg(); + + if (I < DimIdx) { + AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); + PackedAddrs.push_back(AddrReg); + } else { + // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, + // derivatives dx/dh and dx/dv are packed with undef. + if (((I + 1) >= EndIdx) || + ((NumGradients / 2) % 2 == 1 && + (I == DimIdx + (NumGradients / 2) - 1 || + I == DimIdx + NumGradients - 1)) || + // Check for _L to _LZ optimization + !MI.getOperand(I + 1).isReg()) { + PackedAddrs.push_back( + B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) + .getReg(0)); + } else { + PackedAddrs.push_back( + B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) + .getReg(0)); + ++I; + } + } + } +} + +/// Convert from separate vaddr components to a single vector address register, +/// and replace the remaining operands with $noreg. +static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, + int DimIdx, int NumVAddrs) { + const LLT S32 = LLT::scalar(32); + + SmallVector<Register, 8> AddrRegs; + for (int I = 0; I != NumVAddrs; ++I) { + MachineOperand &SrcOp = MI.getOperand(DimIdx + I); + if (SrcOp.isReg()) { + AddrRegs.push_back(SrcOp.getReg()); + assert(B.getMRI()->getType(SrcOp.getReg()) == S32); + } + } + + int NumAddrRegs = AddrRegs.size(); + if (NumAddrRegs != 1) { + // Round up to 8 elements for v5-v7 + // FIXME: Missing intermediate sized register classes and instructions. 
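// Illustrative sketch, not part of the diff above: the basic pairwise packing
// packImageA16AddressToDwords performs on 16-bit address components. A
// trailing odd component is paired with an undefined high half (zero stands
// in for G_IMPLICIT_DEF here); the gradient-specific padding rules of the
// real function are not modeled.
#include <cstddef>
#include <cstdint>
#include <vector>

static std::vector<uint32_t>
packA16Coords(const std::vector<uint16_t> &Coords) {
  std::vector<uint32_t> Dwords;
  for (std::size_t I = 0; I < Coords.size(); I += 2) {
    uint32_t Lo = Coords[I];
    uint32_t Hi = I + 1 < Coords.size() ? Coords[I + 1] : 0; // undef pad
    Dwords.push_back(Lo | (Hi << 16));                       // one <2 x s16>
  }
  return Dwords;
}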
+ if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { + const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); + auto Undef = B.buildUndef(S32); + AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); + NumAddrRegs = RoundedNumRegs; + } + + auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); + MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); + } + + for (int I = 1; I != NumVAddrs; ++I) { + MachineOperand &SrcOp = MI.getOperand(DimIdx + I); + if (SrcOp.isReg()) + MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); + } +} + +/// Rewrite image intrinsics to use register layouts expected by the subtarget. +/// +/// Depending on the subtarget, load/store with 16-bit element data need to be +/// rewritten to use the low half of 32-bit registers, or directly use a packed +/// layout. 16-bit addresses should also sometimes be packed into 32-bit +/// registers. +/// +/// We don't want to directly select image instructions just yet, but also want +/// to exposes all register repacking to the legalizer/combiners. We also don't +/// want a selected instrution entering RegBankSelect. In order to avoid +/// defining a multitude of intermediate image instructions, directly hack on +/// the intrinsic's arguments. In cases like a16 addreses, this requires padding +/// now unnecessary arguments with $noreg. +bool AMDGPULegalizerInfo::legalizeImageIntrinsic( + MachineInstr &MI, MachineIRBuilder &B, + GISelChangeObserver &Observer, + const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { + + const int NumDefs = MI.getNumExplicitDefs(); + bool IsTFE = NumDefs == 2; + // We are only processing the operands of d16 image operations on subtargets + // that use the unpacked register layout, or need to repack the TFE result. + + // TODO: Do we need to guard against already legalized intrinsics? + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); + + MachineRegisterInfo *MRI = B.getMRI(); + const LLT S32 = LLT::scalar(32); + const LLT S16 = LLT::scalar(16); + const LLT V2S16 = LLT::vector(2, 16); + + // Index of first address argument + const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); + + int NumVAddrs, NumGradients; + std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); + const int DMaskIdx = BaseOpcode->Atomic ? -1 : + getDMaskIdx(BaseOpcode, NumDefs); + unsigned DMask = 0; + + // Check for 16 bit addresses and pack if true. + int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; + LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); + LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); + const bool IsG16 = GradTy == S16; + const bool IsA16 = AddrTy == S16; + + int DMaskLanes = 0; + if (!BaseOpcode->Atomic) { + DMask = MI.getOperand(DMaskIdx).getImm(); + if (BaseOpcode->Gather4) { + DMaskLanes = 4; + } else if (DMask != 0) { + DMaskLanes = countPopulation(DMask); + } else if (!IsTFE && !BaseOpcode->Store) { + // If dmask is 0, this is a no-op load. This can be eliminated. + B.buildUndef(MI.getOperand(0)); + MI.eraseFromParent(); return true; } + } + + Observer.changingInstr(MI); + auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); + + unsigned NewOpcode = NumDefs == 0 ? 
+ AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; + + // Track that we legalized this + MI.setDesc(B.getTII().get(NewOpcode)); + + // Expecting to get an error flag since TFC is on - and dmask is 0 Force + // dmask to be at least 1 otherwise the instruction will fail + if (IsTFE && DMask == 0) { + DMask = 0x1; + DMaskLanes = 1; + MI.getOperand(DMaskIdx).setImm(DMask); + } + + if (BaseOpcode->Atomic) { + Register VData0 = MI.getOperand(2).getReg(); + LLT Ty = MRI->getType(VData0); + + // TODO: Allow atomic swap and bit ops for v2s16/v4s16 + if (Ty.isVector()) + return false; + + if (BaseOpcode->AtomicX2) { + Register VData1 = MI.getOperand(3).getReg(); + // The two values are packed in one register. + LLT PackedTy = LLT::vector(2, Ty); + auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); + MI.getOperand(2).setReg(Concat.getReg(0)); + MI.getOperand(3).setReg(AMDGPU::NoRegister); + } + } - return Ty.getElementType() == S32 && Ty.getNumElements() <= 4; + int CorrectedNumVAddrs = NumVAddrs; + + // Optimize _L to _LZ when _L is zero + if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = + AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { + const ConstantFP *ConstantLod; + const int LodIdx = AddrIdx + NumVAddrs - 1; + + if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { + if (ConstantLod->isZero() || ConstantLod->isNegative()) { + // Set new opcode to _lz variant of _l, and change the intrinsic ID. + ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( + LZMappingInfo->LZ, ImageDimIntr->Dim); + + // The starting indexes should remain in the same place. + --NumVAddrs; + --CorrectedNumVAddrs; + + MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( + static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); + MI.RemoveOperand(LodIdx); + } + } } - return Ty == S32; + // Optimize _mip away, when 'lod' is zero + if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { + int64_t ConstantLod; + const int LodIdx = AddrIdx + NumVAddrs - 1; + + if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { + if (ConstantLod == 0) { + // TODO: Change intrinsic opcode and remove operand instead or replacing + // it with 0, as the _L to _LZ handling is done above. + MI.getOperand(LodIdx).ChangeToImmediate(0); + --CorrectedNumVAddrs; + } + } + } + + // Rewrite the addressing register layout before doing anything else. + if (IsA16 || IsG16) { + if (IsA16) { + // Target must support the feature and gradients need to be 16 bit too + if (!ST.hasA16() || !IsG16) + return false; + } else if (!ST.hasG16()) + return false; + + if (NumVAddrs > 1) { + SmallVector<Register, 4> PackedRegs; + // Don't compress addresses for G16 + const int PackEndIdx = + IsA16 ? 
(AddrIdx + NumVAddrs) : (DimIdx + NumGradients); + packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, + PackEndIdx, NumGradients); + + if (!IsA16) { + // Add uncompressed address + for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { + int AddrReg = MI.getOperand(I).getReg(); + assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); + PackedRegs.push_back(AddrReg); + } + } + + // See also below in the non-a16 branch + const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); + + if (!UseNSA && PackedRegs.size() > 1) { + LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); + auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); + PackedRegs[0] = Concat.getReg(0); + PackedRegs.resize(1); + } + + const int NumPacked = PackedRegs.size(); + for (int I = 0; I != NumVAddrs; ++I) { + MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); + if (!SrcOp.isReg()) { + assert(SrcOp.isImm() && SrcOp.getImm() == 0); + continue; + } + + assert(SrcOp.getReg() != AMDGPU::NoRegister); + + if (I < NumPacked) + SrcOp.setReg(PackedRegs[I]); + else + SrcOp.setReg(AMDGPU::NoRegister); + } + } + } else { + // If the register allocator cannot place the address registers contiguously + // without introducing moves, then using the non-sequential address encoding + // is always preferable, since it saves VALU instructions and is usually a + // wash in terms of code size or even better. + // + // However, we currently have no way of hinting to the register allocator + // that MIMG addresses should be placed contiguously when it is possible to + // do so, so force non-NSA for the common 2-address case as a heuristic. + // + // SIShrinkInstructions will convert NSA encodings to non-NSA after register + // allocation when possible. + const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); + + if (!UseNSA && NumVAddrs > 1) + convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); + } + + int Flags = 0; + if (IsA16) + Flags |= 1; + if (IsG16) + Flags |= 2; + MI.addOperand(MachineOperand::CreateImm(Flags)); + + if (BaseOpcode->Store) { // No TFE for stores? + // TODO: Handle dmask trim + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI->getType(VData); + if (!Ty.isVector() || Ty.getElementType() != S16) + return true; + + Register RepackedReg = handleD16VData(B, *MRI, VData); + if (RepackedReg != VData) { + MI.getOperand(1).setReg(RepackedReg); + } + + return true; + } + + Register DstReg = MI.getOperand(0).getReg(); + LLT Ty = MRI->getType(DstReg); + const LLT EltTy = Ty.getScalarType(); + const bool IsD16 = Ty.getScalarType() == S16; + const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; + + // Confirm that the return type is large enough for the dmask specified + if (NumElts < DMaskLanes) + return false; + + if (NumElts > 4 || DMaskLanes > 4) + return false; + + const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; + const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); + + // The raw dword aligned data component of the load. The only legal cases + // where this matters should be when using the packed D16 format, for + // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, + LLT RoundedTy; + + // S32 vector to to cover all data, plus TFE result element. + LLT TFETy; + + // Register type to use for each loaded component. Will be S32 or V2S16. 
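// Illustrative sketch, not part of the diff above: a worked instance of the
// type computation that follows, assuming a packed-d16 image load whose dmask
// selects three 16-bit components:
//   AdjustedTy = <3 x s16> (48 bits), RoundedElts = (48 + 31) / 32 = 2
//   RoundedTy  = <4 x s16> (64 bits of dword-aligned data)
//   without TFE: RegTy = <2 x s16>;  with TFE: TFETy = <3 x s32>, RegTy = s32
#include <cassert>

static unsigned roundedDwords(unsigned NumElts, unsigned EltSizeBits) {
  return (NumElts * EltSizeBits + 31) / 32;
}

int main() {
  assert(roundedDwords(3, 16) == 2); // <3 x s16> d16 data occupies 2 dwords
  assert(roundedDwords(4, 32) == 4); // <4 x s32> needs no rounding
  return 0;
}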
+ LLT RegTy; + + if (IsD16 && ST.hasUnpackedD16VMem()) { + RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); + TFETy = LLT::vector(AdjustedNumElts + 1, 32); + RegTy = S32; + } else { + unsigned EltSize = EltTy.getSizeInBits(); + unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; + unsigned RoundedSize = 32 * RoundedElts; + RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); + TFETy = LLT::vector(RoundedSize / 32 + 1, S32); + RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; + } + + // The return type does not need adjustment. + // TODO: Should we change s16 case to s32 or <2 x s16>? + if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) + return true; + + Register Dst1Reg; + + // Insert after the instruction. + B.setInsertPt(*MI.getParent(), ++MI.getIterator()); + + // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x + // s16> instead of s32, we would only need 1 bitcast instead of multiple. + const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; + const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; + + Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); + + MI.getOperand(0).setReg(NewResultReg); + + // In the IR, TFE is supposed to be used with a 2 element struct return + // type. The intruction really returns these two values in one contiguous + // register, with one additional dword beyond the loaded data. Rewrite the + // return type to use a single register result. + + if (IsTFE) { + Dst1Reg = MI.getOperand(1).getReg(); + if (MRI->getType(Dst1Reg) != S32) + return false; + + // TODO: Make sure the TFE operand bit is set. + MI.RemoveOperand(1); + + // Handle the easy case that requires no repack instructions. + if (Ty == S32) { + B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); + return true; + } + } + + // Now figure out how to copy the new result register back into the old + // result. + SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); + + const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; + + if (ResultNumRegs == 1) { + assert(!IsTFE); + ResultRegs[0] = NewResultReg; + } else { + // We have to repack into a new vector of some kind. + for (int I = 0; I != NumDataRegs; ++I) + ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); + B.buildUnmerge(ResultRegs, NewResultReg); + + // Drop the final TFE element to get the data part. The TFE result is + // directly written to the right place already. + if (IsTFE) + ResultRegs.resize(NumDataRegs); + } + + // For an s16 scalar result, we form an s32 result with a truncate regardless + // of packed vs. unpacked. + if (IsD16 && !Ty.isVector()) { + B.buildTrunc(DstReg, ResultRegs[0]); + return true; + } + + // Avoid a build/concat_vector of 1 entry. + if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { + B.buildBitcast(DstReg, ResultRegs[0]); + return true; + } + + assert(Ty.isVector()); + + if (IsD16) { + // For packed D16 results with TFE enabled, all the data components are + // S32. Cast back to the expected type. + // + // TODO: We don't really need to use load s32 elements. We would only need one + // cast for the TFE result if a multiple of v2s16 was used. 
+ if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { + for (Register &Reg : ResultRegs) + Reg = B.buildBitcast(V2S16, Reg).getReg(0); + } else if (ST.hasUnpackedD16VMem()) { + for (Register &Reg : ResultRegs) + Reg = B.buildTrunc(S16, Reg).getReg(0); + } + } + + auto padWithUndef = [&](LLT Ty, int NumElts) { + if (NumElts == 0) + return; + Register Undef = B.buildUndef(Ty).getReg(0); + for (int I = 0; I != NumElts; ++I) + ResultRegs.push_back(Undef); + }; + + // Pad out any elements eliminated due to the dmask. + LLT ResTy = MRI->getType(ResultRegs[0]); + if (!ResTy.isVector()) { + padWithUndef(ResTy, NumElts - ResultRegs.size()); + B.buildBuildVector(DstReg, ResultRegs); + return true; + } + + assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); + const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; + + // Deal with the one annoying legal case. + const LLT V3S16 = LLT::vector(3, 16); + if (Ty == V3S16) { + padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); + auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); + B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); + return true; + } + + padWithUndef(ResTy, RegsToCover - ResultRegs.size()); + B.buildConcatVectors(DstReg, ResultRegs); + return true; } -bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B) const { +bool AMDGPULegalizerInfo::legalizeSBufferLoad( + MachineInstr &MI, MachineIRBuilder &B, + GISelChangeObserver &Observer) const { + Register Dst = MI.getOperand(0).getReg(); + LLT Ty = B.getMRI()->getType(Dst); + unsigned Size = Ty.getSizeInBits(); + MachineFunction &MF = B.getMF(); + + Observer.changingInstr(MI); + + // FIXME: We don't really need this intermediate instruction. The intrinsic + // should be fixed to have a memory operand. Since it's readnone, we're not + // allowed to add one. + MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); + MI.RemoveOperand(1); // Remove intrinsic ID + + // FIXME: When intrinsic definition is fixed, this should have an MMO already. + // TODO: Should this use datalayout alignment? + const unsigned MemSize = (Size + 7) / 8; + const Align MemAlign(4); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + MemSize, MemAlign); + MI.addMemOperand(MF, MMO); + + // There are no 96-bit result scalar loads, but widening to 128-bit should + // always be legal. We may need to restore this to a 96-bit result if it turns + // out this needs to be converted to a vector load during RegBankSelect. + if (!isPowerOf2_32(Size)) { + LegalizerHelper Helper(MF, *this, Observer, B); + + if (Ty.isVector()) + Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); + else + Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); + } + + Observer.changedInstr(MI); + return true; +} + +bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + // Is non-HSA path or trap-handler disabled? 
then, insert s_endpgm instruction + if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || + !ST.isTrapHandlerEnabled()) { + B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); + } else { + // Pass queue pointer to trap handler as input, and insert trap instruction + // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi + const ArgDescriptor *Arg = + getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); + if (!Arg) + return false; + MachineRegisterInfo &MRI = *B.getMRI(); + Register SGPR01(AMDGPU::SGPR0_SGPR1); + Register LiveIn = getLiveInRegister( + B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), + /*InsertLiveInCopy=*/false); + if (!loadInputValue(LiveIn, B, Arg)) + return false; + B.buildCopy(SGPR01, LiveIn); + B.buildInstr(AMDGPU::S_TRAP) + .addImm(GCNSubtarget::TrapIDLLVMTrap) + .addReg(SGPR01, RegState::Implicit); + } + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { + // Is non-HSA path or trap-handler disabled? then, report a warning + // accordingly + if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || + !ST.isTrapHandlerEnabled()) { + DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), + "debugtrap handler not supported", + MI.getDebugLoc(), DS_Warning); + LLVMContext &Ctx = B.getMF().getFunction().getContext(); + Ctx.diagnose(NoTrap); + } else { + // Insert debug-trap instruction + B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); + } + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, + MachineInstr &MI) const { + MachineIRBuilder &B = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *B.getMRI(); + // Replace the use G_BRCOND with the exec manipulate and branch pseudos. auto IntrID = MI.getIntrinsicID(); switch (IntrID) { case Intrinsic::amdgcn_if: case Intrinsic::amdgcn_else: { MachineInstr *Br = nullptr; - if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { + MachineBasicBlock *UncondBrTarget = nullptr; + if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); - B.setInstr(*BrCond); Register Def = MI.getOperand(1).getReg(); Register Use = MI.getOperand(3).getReg(); - MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); - if (Br) - BrTarget = Br->getOperand(0).getMBB(); - + MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); + B.setInsertPt(B.getMBB(), BrCond->getIterator()); if (IntrID == Intrinsic::amdgcn_if) { B.buildInstr(AMDGPU::SI_IF) .addDef(Def) .addUse(Use) - .addMBB(BrTarget); + .addMBB(UncondBrTarget); } else { B.buildInstr(AMDGPU::SI_ELSE) .addDef(Def) .addUse(Use) - .addMBB(BrTarget) + .addMBB(UncondBrTarget) .addImm(0); } - if (Br) - Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); + if (Br) { + Br->getOperand(0).setMBB(CondBrTarget); + } else { + // The IRTranslator skips inserting the G_BR for fallthrough cases, but + // since we're swapping branch targets it needs to be reinserted. 
+ // FIXME: IRTranslator should probably not do this + B.buildBr(*CondBrTarget); + } MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); @@ -2393,17 +4200,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, } case Intrinsic::amdgcn_loop: { MachineInstr *Br = nullptr; - if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { + MachineBasicBlock *UncondBrTarget = nullptr; + if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); - B.setInstr(*BrCond); - - // FIXME: Need to adjust branch targets based on unconditional branch. + MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); Register Reg = MI.getOperand(2).getReg(); + + B.setInsertPt(B.getMBB(), BrCond->getIterator()); B.buildInstr(AMDGPU::SI_LOOP) .addUse(Reg) - .addMBB(BrCond->getOperand(1).getMBB()); + .addMBB(UncondBrTarget); + + if (Br) + Br->getOperand(0).setMBB(CondBrTarget); + else + B.buildBr(*CondBrTarget); + MI.eraseFromParent(); BrCond->eraseFromParent(); MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); @@ -2413,6 +4227,13 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, return false; } case Intrinsic::amdgcn_kernarg_segment_ptr: + if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { + // This only makes sense to call in a kernel, so just lower to null. + B.buildConstant(MI.getOperand(0).getReg(), 0); + MI.eraseFromParent(); + return true; + } + return legalizePreloadedArgIntrin( MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); case Intrinsic::amdgcn_implicitarg_ptr: @@ -2454,18 +4275,72 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, case Intrinsic::amdgcn_is_private: return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); case Intrinsic::amdgcn_wavefrontsize: { - B.setInstr(MI); B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); MI.eraseFromParent(); return true; } + case Intrinsic::amdgcn_s_buffer_load: + return legalizeSBufferLoad(MI, B, Helper.Observer); case Intrinsic::amdgcn_raw_buffer_store: - return legalizeRawBufferStore(MI, MRI, B, false); + case Intrinsic::amdgcn_struct_buffer_store: + return legalizeBufferStore(MI, MRI, B, false, false); case Intrinsic::amdgcn_raw_buffer_store_format: - return legalizeRawBufferStore(MI, MRI, B, true); - default: + case Intrinsic::amdgcn_struct_buffer_store_format: + return legalizeBufferStore(MI, MRI, B, false, true); + case Intrinsic::amdgcn_raw_tbuffer_store: + case Intrinsic::amdgcn_struct_tbuffer_store: + return legalizeBufferStore(MI, MRI, B, true, true); + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_struct_buffer_load: + return legalizeBufferLoad(MI, MRI, B, false, false); + case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_struct_buffer_load_format: + return legalizeBufferLoad(MI, MRI, B, true, false); + case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_struct_tbuffer_load: + return legalizeBufferLoad(MI, MRI, B, true, true); + case Intrinsic::amdgcn_raw_buffer_atomic_swap: + case Intrinsic::amdgcn_struct_buffer_atomic_swap: + case Intrinsic::amdgcn_raw_buffer_atomic_add: + case Intrinsic::amdgcn_struct_buffer_atomic_add: + case Intrinsic::amdgcn_raw_buffer_atomic_sub: + case Intrinsic::amdgcn_struct_buffer_atomic_sub: + case Intrinsic::amdgcn_raw_buffer_atomic_smin: + case Intrinsic::amdgcn_struct_buffer_atomic_smin: + case 
Intrinsic::amdgcn_raw_buffer_atomic_umin: + case Intrinsic::amdgcn_struct_buffer_atomic_umin: + case Intrinsic::amdgcn_raw_buffer_atomic_smax: + case Intrinsic::amdgcn_struct_buffer_atomic_smax: + case Intrinsic::amdgcn_raw_buffer_atomic_umax: + case Intrinsic::amdgcn_struct_buffer_atomic_umax: + case Intrinsic::amdgcn_raw_buffer_atomic_and: + case Intrinsic::amdgcn_struct_buffer_atomic_and: + case Intrinsic::amdgcn_raw_buffer_atomic_or: + case Intrinsic::amdgcn_struct_buffer_atomic_or: + case Intrinsic::amdgcn_raw_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_buffer_atomic_xor: + case Intrinsic::amdgcn_raw_buffer_atomic_inc: + case Intrinsic::amdgcn_struct_buffer_atomic_inc: + case Intrinsic::amdgcn_raw_buffer_atomic_dec: + case Intrinsic::amdgcn_struct_buffer_atomic_dec: + case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: + return legalizeBufferAtomic(MI, B, IntrID); + case Intrinsic::amdgcn_atomic_inc: + return legalizeAtomicIncDec(MI, B, true); + case Intrinsic::amdgcn_atomic_dec: + return legalizeAtomicIncDec(MI, B, false); + case Intrinsic::trap: + return legalizeTrapIntrinsic(MI, MRI, B); + case Intrinsic::debugtrap: + return legalizeDebugTrapIntrinsic(MI, MRI, B); + default: { + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = + AMDGPU::getImageDimIntrinsicInfo(IntrID)) + return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); return true; } + } return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 4b1405a92787a..ce32bbf76b34f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -32,9 +32,7 @@ public: AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM); - bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B, - GISelChangeObserver &Observer) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, @@ -50,18 +48,22 @@ public: MachineIRBuilder &B) const; bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const; - bool legalizeMinNumMaxNum(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B) const; + bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool Signed) const; + bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const; bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeShuffleVector(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; - bool buildPCRelGlobalAddress( - Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, - unsigned Offset, unsigned GAFlags = SIInstrInfo::MO_NONE) const; + bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, + const GlobalValue *GV, int64_t Offset, + unsigned GAFlags = SIInstrInfo::MO_NONE) const; bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; @@ -74,16 +76,50 @@ public: bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool 
legalizeFlog(MachineInstr &MI, MachineIRBuilder &B, + double Log2BaseInverted) const; + bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; - Register getLiveInRegister(MachineRegisterInfo &MRI, - Register Reg, LLT Ty) const; + bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + Register getLiveInRegister(MachineIRBuilder &B, MachineRegisterInfo &MRI, + Register PhyReg, LLT Ty, + bool InsertLiveInCopy = true) const; + Register insertLiveInCopy(MachineIRBuilder &B, MachineRegisterInfo &MRI, + Register LiveIn, Register PhyReg) const; + const ArgDescriptor * + getArgDescriptor(MachineIRBuilder &B, + AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg) const; bool legalizePreloadedArgIntrin( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + bool legalizeUDIV_UREM(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + + void legalizeUDIV_UREM32Impl(MachineIRBuilder &B, + Register DstReg, Register Num, Register Den, + bool IsRem) const; + bool legalizeUDIV_UREM32(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeSDIV_SREM32(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + + void legalizeUDIV_UREM64Impl(MachineIRBuilder &B, + Register DstReg, Register Numer, Register Denom, + bool IsDiv) const; + + bool legalizeUDIV_UREM64(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeSDIV_SREM(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, @@ -102,13 +138,46 @@ public: bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const; + std::tuple<Register, unsigned, unsigned> + splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const; + Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg) const; bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsFormat) const; - bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B) const override; + bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsFormat) const; + Register fixStoreSourceType(MachineIRBuilder &B, Register VData, + bool IsFormat) const; + + bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsTyped, + bool IsFormat) const; + bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsTyped, + bool IsFormat) const; + bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, + Intrinsic::ID IID) const; + + bool legalizeImageIntrinsic( + MachineInstr &MI, MachineIRBuilder &B, + GISelChangeObserver &Observer, + const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const; + + bool legalizeSBufferLoad( + MachineInstr &MI, MachineIRBuilder &B, + GISelChangeObserver &Observer) const; + + bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B, + bool IsInc) const; + + bool 
legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeDebugTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeIntrinsic(LegalizerHelper &Helper, + MachineInstr &MI) const override; }; } // End llvm namespace. #endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 0c56927dea02b..4a14259f1bdb1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -32,7 +32,6 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" #include <cmath> #include <vector> @@ -170,16 +169,13 @@ namespace { class AMDGPUSimplifyLibCalls : public FunctionPass { - const TargetOptions Options; - AMDGPULibCalls Simplifier; public: static char ID; // Pass identification - AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions(), - const TargetMachine *TM = nullptr) - : FunctionPass(ID), Options(Opt), Simplifier(TM) { + AMDGPUSimplifyLibCalls(const TargetMachine *TM = nullptr) + : FunctionPass(ID), Simplifier(TM) { initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); } @@ -585,7 +581,7 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, assert(Callee->hasName() && "Invalid read_pipe/write_pipe function"); auto *M = Callee->getParent(); auto &Ctx = M->getContext(); - std::string Name = Callee->getName(); + std::string Name = std::string(Callee->getName()); auto NumArg = CI->getNumArgOperands(); if (NumArg != 4 && NumArg != 6) return false; @@ -594,15 +590,15 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, if (!isa<ConstantInt>(PacketSize) || !isa<ConstantInt>(PacketAlign)) return false; unsigned Size = cast<ConstantInt>(PacketSize)->getZExtValue(); - unsigned Align = cast<ConstantInt>(PacketAlign)->getZExtValue(); - if (Size != Align || !isPowerOf2_32(Size)) + Align Alignment = cast<ConstantInt>(PacketAlign)->getAlignValue(); + if (Alignment != Size) return false; Type *PtrElemTy; if (Size <= 8) PtrElemTy = Type::getIntNTy(Ctx, Size * 8); else - PtrElemTy = VectorType::get(Type::getInt64Ty(Ctx), Size / 8); + PtrElemTy = FixedVectorType::get(Type::getInt64Ty(Ctx), Size / 8); unsigned PtrArgLoc = CI->getNumArgOperands() - 3; auto PtrArg = CI->getArgOperand(PtrArgLoc); unsigned PtrArgAS = PtrArg->getType()->getPointerAddressSpace(); @@ -1130,8 +1126,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, Type* rTy = opr0->getType(); Type* nTyS = eltType->isDoubleTy() ? 
B.getInt64Ty() : B.getInt32Ty(); Type *nTy = nTyS; - if (const VectorType *vTy = dyn_cast<VectorType>(rTy)) - nTy = VectorType::get(nTyS, vTy->getNumElements()); + if (const auto *vTy = dyn_cast<FixedVectorType>(rTy)) + nTy = FixedVectorType::get(nTyS, vTy); unsigned size = nTy->getScalarSizeInBits(); opr_n = CI->getArgOperand(1); if (opr_n->getType()->isIntegerTy()) @@ -1420,8 +1416,8 @@ AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B, B.SetInsertPoint(&*ItNew); AllocaInst *Alloc = B.CreateAlloca(RetType, 0, std::string(prefix) + UI->getName()); - Alloc->setAlignment(MaybeAlign( - UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType))); + Alloc->setAlignment( + Align(UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType))); return Alloc; } @@ -1711,35 +1707,14 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) { } // Public interface to the Simplify LibCalls pass. -FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt, - const TargetMachine *TM) { - return new AMDGPUSimplifyLibCalls(Opt, TM); +FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetMachine *TM) { + return new AMDGPUSimplifyLibCalls(TM); } FunctionPass *llvm::createAMDGPUUseNativeCallsPass() { return new AMDGPUUseNativeCalls(); } -static bool setFastFlags(Function &F, const TargetOptions &Options) { - AttrBuilder B; - - if (Options.UnsafeFPMath || Options.NoInfsFPMath) - B.addAttribute("no-infs-fp-math", "true"); - if (Options.UnsafeFPMath || Options.NoNaNsFPMath) - B.addAttribute("no-nans-fp-math", "true"); - if (Options.UnsafeFPMath) { - B.addAttribute("less-precise-fpmad", "true"); - B.addAttribute("unsafe-fp-math", "true"); - } - - if (!B.hasAttributes()) - return false; - - F.addAttributes(AttributeList::FunctionIndex, B); - - return true; -} - bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) { if (skipFunction(F)) return false; @@ -1750,15 +1725,14 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) { LLVM_DEBUG(dbgs() << "AMDIC: process function "; F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';); - if (!EnablePreLink) - Changed |= setFastFlags(F, Options); - for (auto &BB : F) { for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) { // Ignore non-calls. CallInst *CI = dyn_cast<CallInst>(I); ++I; - if (!CI) continue; + // Ignore intrinsics that do not become real instructions. + if (!CI || isa<DbgInfoIntrinsic>(CI) || CI->isLifetimeStartOrEnd()) + continue; // Ignore indirect calls. 
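The runOnFunction change just above now skips call-like instructions that never become real machine code before handing anything to the libcall simplifier. A small helper in the same spirit, written against the LLVM headers this file already uses -- a sketch for illustration, not part of the commit:

#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

// Return the call if it is a direct, non-debug, non-lifetime call that is
// worth considering for folding; otherwise return nullptr.
static CallInst *getFoldableCall(Instruction &I) {
  auto *CI = dyn_cast<CallInst>(&I);
  if (!CI)
    return nullptr;
  // Debug-info intrinsics and lifetime markers never lower to instructions.
  if (isa<DbgInfoIntrinsic>(CI) || CI->isLifetimeStartOrEnd())
    return nullptr;
  // Indirect calls have no callee name to match against the library tables.
  if (!CI->getCalledFunction())
    return nullptr;
  return CI;
}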
Function *Callee = CI->getCalledFunction(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp index e1ae496d9cbca..2b5143ba7506c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -10,17 +10,18 @@ // //===----------------------------------------------------------------------===// -#include "AMDGPU.h" #include "AMDGPULibFunc.h" -#include <llvm/ADT/SmallString.h> -#include <llvm/ADT/SmallVector.h> -#include <llvm/ADT/StringSwitch.h> +#include "AMDGPU.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/IR/ValueSymbolTable.h" -#include <llvm/Support/raw_ostream.h> +#include "llvm/Support/raw_ostream.h" #include <string> using namespace llvm; @@ -479,8 +480,6 @@ static bool eatTerm(StringRef& mangledName, const char (&str)[N]) { return false; } -static inline bool isDigit(char c) { return c >= '0' && c <= '9'; } - static int eatNumber(StringRef& s) { size_t const savedSize = s.size(); int n = 0; @@ -605,7 +604,7 @@ bool ItaniumParamParser::parseItaniumParam(StringRef& param, // parse type char const TC = param.front(); - if (::isDigit(TC)) { + if (isDigit(TC)) { res.ArgType = StringSwitch<AMDGPULibFunc::EType> (eatLengthPrefixedName(param)) .Case("ocl_image1darray" , AMDGPULibFunc::IMG1DA) @@ -863,7 +862,7 @@ std::string AMDGPUMangledLibFunc::mangleNameItanium() const { Param P; while ((P = I.getNextParam()).ArgType != 0) Mangler(S, P); - return S.str(); + return std::string(S.str()); } /////////////////////////////////////////////////////////////////////////////// @@ -903,7 +902,7 @@ static Type* getIntrinsicParamType( return nullptr; } if (P.VectorSize > 1) - T = VectorType::get(T, P.VectorSize); + T = FixedVectorType::get(T, P.VectorSize); if (P.PtrKind != AMDGPULibFunc::BYVALUE) T = useAddrSpace ? T->getPointerTo((P.PtrKind & AMDGPULibFunc::ADDR_SPACE) - 1) @@ -936,7 +935,7 @@ std::string AMDGPUMangledLibFunc::getName() const { SmallString<128> Buf; raw_svector_ostream OS(Buf); writeName(OS); - return OS.str(); + return std::string(OS.str()); } Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc &fInfo) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h index 2354ed7df2059..c97223b047e88 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h @@ -13,6 +13,7 @@ namespace llvm { +class FunctionCallee; class FunctionType; class Function; class Module; @@ -341,7 +342,7 @@ public: /// and unmangled function name for unmangled library functions. 
virtual std::string mangle() const = 0; - void setName(StringRef N) { Name = N; } + void setName(StringRef N) { Name = std::string(N); } void setPrefix(ENamePrefix pfx) { FKind = pfx; } virtual FunctionType *getFunctionType(Module &M) const = 0; @@ -438,7 +439,7 @@ class AMDGPUUnmangledLibFunc : public AMDGPULibFuncImpl { public: explicit AMDGPUUnmangledLibFunc(); explicit AMDGPUUnmangledLibFunc(StringRef FName, FunctionType *FT) { - Name = FName; + Name = std::string(FName); FuncTy = FT; } std::string getName() const override { return Name; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp index 15032969890e2..54c15e4e4d397 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -22,7 +22,15 @@ using namespace llvm; namespace { -const unsigned MaxStaticSize = 1024; +static int MaxStaticSize; + +static cl::opt<int, true> MemIntrinsicExpandSizeThresholdOpt( + "amdgpu-mem-intrinsic-expand-size", + cl::desc("Set minimum mem intrinsic size to expand in IR"), + cl::location(MaxStaticSize), + cl::init(1024), + cl::Hidden); + class AMDGPULowerIntrinsics : public ModulePass { private: @@ -57,7 +65,7 @@ INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false, // require splitting based on alignment) static bool shouldExpandOperationWithSize(Value *Size) { ConstantInt *CI = dyn_cast<ConstantInt>(Size); - return !CI || (CI->getZExtValue() > MaxStaticSize); + return !CI || (CI->getSExtValue() > MaxStaticSize); } bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index e64542a395f0e..62ab5bb55a16a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -58,6 +58,21 @@ public: } // end anonymous namespace +// skip allocas +static BasicBlock::iterator getInsertPt(BasicBlock &BB) { + BasicBlock::iterator InsPt = BB.getFirstInsertionPt(); + for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) { + AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt); + + // If this is a dynamic alloca, the value may depend on the loaded kernargs, + // so loads will need to be inserted before it. 
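The MaxStaticSize rework above turns a hard-coded constant into a command-line option that writes through to an external variable via cl::location; the size comparison also switches to getSExtValue() because the threshold is now a signed int. A self-contained sketch of the same cl::opt pattern -- the option name and variable here are illustrative, not the ones added by the commit:

#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static int ExpandThreshold; // consumers read this plain int directly

// The 'true' template argument means the option stores into the external
// location instead of owning the value itself.
static cl::opt<int, true> ExpandThresholdOpt(
    "example-expand-size",
    cl::desc("Minimum constant size to expand in IR"),
    cl::location(ExpandThreshold), cl::init(1024), cl::Hidden);

int main(int argc, char **argv) {
  cl::ParseCommandLineOptions(argc, argv);
  // e.g. -example-expand-size=2048 overrides the 1024 default.
  outs() << "threshold = " << ExpandThreshold << "\n";
  return 0;
}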
+ if (!AI || !AI->isStaticAlloca()) + break; + } + + return InsPt; +} + bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty()) @@ -70,7 +85,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { LLVMContext &Ctx = F.getParent()->getContext(); const DataLayout &DL = F.getParent()->getDataLayout(); BasicBlock &EntryBlock = *F.begin(); - IRBuilder<> Builder(&*EntryBlock.begin()); + IRBuilder<> Builder(&*getInsertPt(EntryBlock)); const Align KernArgBaseAlign(16); // FIXME: Increase if necessary const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F); @@ -94,7 +109,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { for (Argument &Arg : F.args()) { Type *ArgTy = Arg.getType(); - unsigned ABITypeAlign = DL.getABITypeAlignment(ArgTy); + Align ABITypeAlign = DL.getABITypeAlign(ArgTy); unsigned Size = DL.getTypeSizeInBits(ArgTy); unsigned AllocSize = DL.getTypeAllocSize(ArgTy); @@ -120,7 +135,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { continue; } - VectorType *VT = dyn_cast<VectorType>(ArgTy); + auto *VT = dyn_cast<FixedVectorType>(ArgTy); bool IsV3 = VT && VT->getNumElements() == 3; bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType(); @@ -152,7 +167,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { } if (IsV3 && Size >= 32) { - V4Ty = VectorType::get(VT->getVectorElementType(), 4); + V4Ty = FixedVectorType::get(VT->getElementType(), 4); // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads AdjustedArgTy = V4Ty; } @@ -160,7 +175,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS), ArgPtr->getName() + ".cast"); LoadInst *Load = - Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign.value()); + Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign); Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {})); MDBuilder MDB(Ctx); @@ -210,7 +225,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { Arg.replaceAllUsesWith(NewVal); } else if (IsV3) { Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty), - {0, 1, 2}, + ArrayRef<int>{0, 1, 2}, Arg.getName() + ".load"); Arg.replaceAllUsesWith(Shuf); } else { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index ce7286dabcc8a..99d229c9b74ee 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -254,7 +254,7 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) { return AsmPrinter::lowerConstant(CV); } -void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { +void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; @@ -272,7 +272,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { const MachineBasicBlock *MBB = MI->getParent(); MachineBasicBlock::const_instr_iterator I = ++MI->getIterator(); while (I != MBB->instr_end() && I->isInsideBundle()) { - EmitInstruction(&*I); + emitInstruction(&*I); ++I; } } else { @@ -381,7 +381,7 @@ void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { } } -void R600AsmPrinter::EmitInstruction(const MachineInstr *MI) { +void R600AsmPrinter::emitInstruction(const MachineInstr *MI) { const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>(); 
R600MCInstLower MCInstLowering(OutContext, STI, *this); @@ -396,7 +396,7 @@ void R600AsmPrinter::EmitInstruction(const MachineInstr *MI) { const MachineBasicBlock *MBB = MI->getParent(); MachineBasicBlock::const_instr_iterator I = ++MI->getIterator(); while (I != MBB->instr_end() && I->isInsideBundle()) { - EmitInstruction(&*I); + emitInstruction(&*I); ++I; } } else { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 940ddff85d73f..64acd6efe0280 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -15,14 +15,9 @@ using namespace llvm; AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), - LocalMemoryObjects(), - ExplicitKernArgSize(0), - LDSSize(0), - Mode(MF.getFunction(), MF.getSubtarget<GCNSubtarget>()), + Mode(MF.getFunction()), IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())), - NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath), - MemoryBound(false), - WaveLimiter(false) { + NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) { const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset, @@ -43,19 +38,18 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : } unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, - const GlobalValue &GV) { + const GlobalVariable &GV) { auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0)); if (!Entry.second) return Entry.first->second; - unsigned Align = GV.getAlignment(); - if (Align == 0) - Align = DL.getABITypeAlignment(GV.getValueType()); + Align Alignment = + DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType()); /// TODO: We should sort these to minimize wasted space due to alignment /// padding. Currently the padding is decided by the first encountered use /// during lowering. - unsigned Offset = LDSSize = alignTo(LDSSize, Align); + unsigned Offset = LDSSize = alignTo(LDSSize, Alignment); Entry.first->second = Offset; LDSSize += DL.getTypeAllocSize(GV.getValueType()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 1933e41c66f36..c504dd76bc658 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -23,26 +23,26 @@ class AMDGPUMachineFunction : public MachineFunctionInfo { SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects; protected: - uint64_t ExplicitKernArgSize; // Cache for this. + uint64_t ExplicitKernArgSize = 0; // Cache for this. Align MaxKernArgAlign; // Cache for this. /// Number of bytes in the LDS that are being used. - unsigned LDSSize; + unsigned LDSSize = 0; // State of MODE register, assumed FP mode. AMDGPU::SIModeRegisterDefaults Mode; // Kernels + shaders. i.e. functions called by the driver and not called // by other functions. - bool IsEntryFunction; + bool IsEntryFunction = false; - bool NoSignedZerosFPMath; + bool NoSignedZerosFPMath = false; // Function may be memory bound. - bool MemoryBound; + bool MemoryBound = false; // Kernel may need limited waves per EU for better performance. 
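The allocateLDSGlobal change above assigns each LDS global an offset by rounding the running LDS size up to the global's alignment -- the explicit alignment if one is set, otherwise the ABI type alignment via getValueOrABITypeAlignment -- and then advancing the running size by the allocation size; as the in-tree TODO notes, the resulting padding depends on the order of first use. The bookkeeping itself is only this (a standalone arithmetic sketch, not LLVM code):

#include <cassert>
#include <cstdint>

// Round Size up to the next multiple of Alignment (a power of two).
static uint64_t alignTo(uint64_t Size, uint64_t Alignment) {
  return (Size + Alignment - 1) & ~(Alignment - 1);
}

struct LDSAllocator {
  uint64_t LDSSize = 0; // bytes of LDS handed out so far

  // Returns the offset assigned to an object of the given size/alignment.
  uint64_t allocate(uint64_t AllocSize, uint64_t Alignment) {
    uint64_t Offset = LDSSize = alignTo(LDSSize, Alignment);
    LDSSize += AllocSize;
    return Offset;
  }
};

int main() {
  LDSAllocator A;
  assert(A.allocate(4, 4) == 0);  // i32 at offset 0
  assert(A.allocate(1, 1) == 4);  // i8 packed right behind it
  assert(A.allocate(8, 8) == 8);  // i64 padded up to an 8-byte boundary
  assert(A.LDSSize == 16);
  return 0;
}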
- bool WaveLimiter; + bool WaveLimiter = false; public: AMDGPUMachineFunction(const MachineFunction &MF); @@ -77,7 +77,7 @@ public: return WaveLimiter; } - unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV); + unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV); }; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp index 8c11230f411a9..b05855d1afc64 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp @@ -34,6 +34,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_, switch (SecondMI.getOpcode()) { case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_SUBB_U32_e64: + case AMDGPU::V_SUBBREV_U32_e64: case AMDGPU::V_CNDMASK_B32_e64: { // Try to cluster defs of condition registers to their uses. This improves // the chance VCC will be available which will allow shrinking to VOP2 diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index f7231471c1077..4f9ffa11bc73b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -33,6 +33,7 @@ #include "AMDGPU.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 9613d5a843b38..93079738ef990 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/ValueMap.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -220,9 +221,8 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { ++FI.InstCount; continue; } - CallSite CS(const_cast<Instruction *>(&I)); - if (CS) { - Function *Callee = CS.getCalledFunction(); + if (auto *CB = dyn_cast<CallBase>(&I)) { + Function *Callee = CB->getCalledFunction(); if (!Callee || Callee->isDeclaration()) { ++FI.InstCount; continue; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp new file mode 100644 index 0000000000000..098b0e9938861 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -0,0 +1,359 @@ +//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass does combining of machine instructions at the generic MI level, +// after the legalizer. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "AMDGPULegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/Debug.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" + +#define DEBUG_TYPE "amdgpu-postlegalizer-combiner" + +using namespace llvm; +using namespace MIPatternMatch; + +struct FMinFMaxLegacyInfo { + Register LHS; + Register RHS; + Register True; + Register False; + CmpInst::Predicate Pred; +}; + +// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize +static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineFunction &MF, FMinFMaxLegacyInfo &Info) { + // FIXME: Combines should have subtarget predicates, and we shouldn't need + // this here. + if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy()) + return false; + + // FIXME: Type predicate on pattern + if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32)) + return false; + + Register Cond = MI.getOperand(1).getReg(); + if (!MRI.hasOneNonDBGUse(Cond) || + !mi_match(Cond, MRI, + m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS)))) + return false; + + Info.True = MI.getOperand(2).getReg(); + Info.False = MI.getOperand(3).getReg(); + + if (!(Info.LHS == Info.True && Info.RHS == Info.False) && + !(Info.LHS == Info.False && Info.RHS == Info.True)) + return false; + + switch (Info.Pred) { + case CmpInst::FCMP_FALSE: + case CmpInst::FCMP_OEQ: + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_ORD: + case CmpInst::FCMP_UNO: + case CmpInst::FCMP_UEQ: + case CmpInst::FCMP_UNE: + case CmpInst::FCMP_TRUE: + return false; + default: + return true; + } +} + +static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI, + const FMinFMaxLegacyInfo &Info) { + + auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) { + MachineIRBuilder MIB(MI); + MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags()); + }; + + switch (Info.Pred) { + case CmpInst::FCMP_ULT: + case CmpInst::FCMP_ULE: + if (Info.LHS == Info.True) + buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS); + else + buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS); + break; + case CmpInst::FCMP_OLE: + case CmpInst::FCMP_OLT: { + // We need to permute the operands to get the correct NaN behavior. The + // selected operand is the second one based on the failing compare with NaN, + // so permute it based on the compare type the hardware uses. 
+ if (Info.LHS == Info.True) + buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS); + else + buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS); + break; + } + case CmpInst::FCMP_UGE: + case CmpInst::FCMP_UGT: { + if (Info.LHS == Info.True) + buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS); + else + buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS); + break; + } + case CmpInst::FCMP_OGT: + case CmpInst::FCMP_OGE: { + if (Info.LHS == Info.True) + buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS); + else + buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS); + break; + } + default: + llvm_unreachable("predicate should not have matched"); + } + + MI.eraseFromParent(); +} + +static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineFunction &MF, CombinerHelper &Helper) { + Register DstReg = MI.getOperand(0).getReg(); + + // TODO: We could try to match extracting the higher bytes, which would be + // easier if i8 vectors weren't promoted to i32 vectors, particularly after + // types are legalized. v4i8 -> v4f32 is probably the only case to worry + // about in practice. + LLT Ty = MRI.getType(DstReg); + if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) { + Register SrcReg = MI.getOperand(1).getReg(); + unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); + assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64); + const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8); + return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask); + } + + return false; +} + +static void applyUCharToFloat(MachineInstr &MI) { + MachineIRBuilder B(MI); + + const LLT S32 = LLT::scalar(32); + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LLT Ty = B.getMRI()->getType(DstReg); + LLT SrcTy = B.getMRI()->getType(SrcReg); + if (SrcTy != S32) + SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0); + + if (Ty == S32) { + B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, + {SrcReg}, MI.getFlags()); + } else { + auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, + {SrcReg}, MI.getFlags()); + B.buildFPTrunc(DstReg, Cvt0, MI.getFlags()); + } + + MI.eraseFromParent(); +} + +// FIXME: Should be able to have 2 separate matchdatas rather than custom struct +// boilerplate. +struct CvtF32UByteMatchInfo { + Register CvtVal; + unsigned ShiftOffset; +}; + +static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineFunction &MF, + CvtF32UByteMatchInfo &MatchInfo) { + Register SrcReg = MI.getOperand(1).getReg(); + + // Look through G_ZEXT. + mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg))); + + Register Src0; + int64_t ShiftAmt; + bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt))); + if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) { + const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0; + + unsigned ShiftOffset = 8 * Offset; + if (IsShr) + ShiftOffset += ShiftAmt; + else + ShiftOffset -= ShiftAmt; + + MatchInfo.CvtVal = Src0; + MatchInfo.ShiftOffset = ShiftOffset; + return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0; + } + + // TODO: Simplify demanded bits. 
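matchCvtF32UByteN above folds a byte-aligned shift of the source into the byte index of the conversion: a logical right shift moves the wanted byte towards bit 0, so the conversion must select a higher byte, and a left shift does the opposite; the fold is accepted only when the resulting index is still a whole byte inside the low 32 bits and at least 8, mirroring the ShiftOffset check in the hunk. A standalone sketch of that index arithmetic (illustrative only, not LLVM code):

#include <cassert>

// Byte selected by G_AMDGPU_CVT_F32_UBYTE<N> once a shift of the source by
// ShiftAmt bits has been folded away.
static int foldedByteIndex(int N, int ShiftAmt, bool IsShr) {
  int ShiftOffset = 8 * N;
  return IsShr ? ShiftOffset + ShiftAmt : ShiftOffset - ShiftAmt;
}

// Validity test corresponding to the check in matchCvtF32UByteN.
static bool foldIsValid(int Idx) {
  return Idx >= 8 && Idx < 32 && Idx % 8 == 0;
}

int main() {
  // cvt_f32_ubyte0(x >> 16)  ==>  cvt_f32_ubyte2(x)
  assert(foldedByteIndex(0, 16, /*IsShr=*/true) == 16 && foldIsValid(16));
  // cvt_f32_ubyte3(x << 8)   ==>  cvt_f32_ubyte2(x)
  assert(foldedByteIndex(3, 8, /*IsShr=*/false) == 16 && foldIsValid(16));
  // A shift that is not a multiple of 8 cannot be folded this way.
  assert(!foldIsValid(foldedByteIndex(0, 4, /*IsShr=*/true)));
  return 0;
}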
+ return false; +} + +static void applyCvtF32UByteN(MachineInstr &MI, + const CvtF32UByteMatchInfo &MatchInfo) { + MachineIRBuilder B(MI); + unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8; + + const LLT S32 = LLT::scalar(32); + Register CvtSrc = MatchInfo.CvtVal; + LLT SrcTy = B.getMRI()->getType(MatchInfo.CvtVal); + if (SrcTy != S32) { + assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8); + CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0); + } + + assert(MI.getOpcode() != NewOpc); + B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags()); + MI.eraseFromParent(); +} + +#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AMDGPUGenPostLegalizeGICombiner.inc" +#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + +namespace { +#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AMDGPUGenPostLegalizeGICombiner.inc" +#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + +class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo { + GISelKnownBits *KB; + MachineDominatorTree *MDT; + +public: + AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg; + + AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, + const AMDGPULegalizerInfo *LI, + GISelKnownBits *KB, MachineDominatorTree *MDT) + : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, + /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize), + KB(KB), MDT(MDT) { + if (!GeneratedRuleCfg.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + } + + bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; +}; + +bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, + MachineInstr &MI, + MachineIRBuilder &B) const { + CombinerHelper Helper(Observer, B, KB, MDT); + AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg); + + if (Generated.tryCombineAll(Observer, MI, B, Helper)) + return true; + + switch (MI.getOpcode()) { + case TargetOpcode::G_SHL: + case TargetOpcode::G_LSHR: + case TargetOpcode::G_ASHR: + // On some subtargets, 64-bit shift is a quarter rate instruction. In the + // common case, splitting this into a move and a 32-bit shift is faster and + // the same code size. 
+ return Helper.tryCombineShiftToUnmerge(MI, 32); + } + + return false; +} + +#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "AMDGPUGenPostLegalizeGICombiner.inc" +#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + +// Pass boilerplate +// ================ + +class AMDGPUPostLegalizerCombiner : public MachineFunctionPass { +public: + static char ID; + + AMDGPUPostLegalizerCombiner(bool IsOptNone = false); + + StringRef getPassName() const override { + return "AMDGPUPostLegalizerCombiner"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; +private: + bool IsOptNone; +}; +} // end anonymous namespace + +void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + getSelectionDAGFallbackAnalysisUsage(AU); + AU.addRequired<GISelKnownBitsAnalysis>(); + AU.addPreserved<GISelKnownBitsAnalysis>(); + if (!IsOptNone) { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + } + MachineFunctionPass::getAnalysisUsage(AU); +} + +AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone) + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { + initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry()); +} + +bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + auto *TPC = &getAnalysis<TargetPassConfig>(); + const Function &F = MF.getFunction(); + bool EnableOpt = + MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const AMDGPULegalizerInfo *LI + = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo()); + + GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); + MachineDominatorTree *MDT = + IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize(), LI, KB, MDT); + Combiner C(PCInfo, TPC); + return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); +} + +char AMDGPUPostLegalizerCombiner::ID = 0; +INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE, + "Combine AMDGPU machine instrs after legalization", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) +INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE, + "Combine AMDGPU machine instrs after legalization", false, + false) + +namespace llvm { +FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) { + return new AMDGPUPostLegalizerCombiner(IsOptNone); +} +} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp new file mode 100644 index 0000000000000..800ad2039f0e9 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -0,0 +1,153 @@ +//=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass does combining of machine instructions at the generic MI level, +// before the legalizer. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/Debug.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" + +#define DEBUG_TYPE "amdgpu-prelegalizer-combiner" + +using namespace llvm; +using namespace MIPatternMatch; + +#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AMDGPUGenPreLegalizeGICombiner.inc" +#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + +namespace { +#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AMDGPUGenPreLegalizeGICombiner.inc" +#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + +class AMDGPUPreLegalizerCombinerInfo : public CombinerInfo { + GISelKnownBits *KB; + MachineDominatorTree *MDT; + +public: + AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg; + + AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, + GISelKnownBits *KB, MachineDominatorTree *MDT) + : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, + /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), + KB(KB), MDT(MDT) { + if (!GeneratedRuleCfg.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + } + + virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; +}; + +bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, + MachineInstr &MI, + MachineIRBuilder &B) const { + CombinerHelper Helper(Observer, B, KB, MDT); + AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg); + + if (Generated.tryCombineAll(Observer, MI, B, Helper)) + return true; + + switch (MI.getOpcode()) { + case TargetOpcode::G_CONCAT_VECTORS: + return Helper.tryCombineConcatVectors(MI); + case TargetOpcode::G_SHUFFLE_VECTOR: + return Helper.tryCombineShuffleVector(MI); + } + + return false; +} + +#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "AMDGPUGenPreLegalizeGICombiner.inc" +#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + +// Pass boilerplate +// ================ + +class AMDGPUPreLegalizerCombiner : public MachineFunctionPass { +public: + static char ID; + + AMDGPUPreLegalizerCombiner(bool IsOptNone = false); + + StringRef getPassName() const override { + return "AMDGPUPreLegalizerCombiner"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; +private: + bool IsOptNone; +}; +} // end anonymous namespace + +void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + getSelectionDAGFallbackAnalysisUsage(AU); + AU.addRequired<GISelKnownBitsAnalysis>(); + AU.addPreserved<GISelKnownBitsAnalysis>(); + if (!IsOptNone) { + AU.addRequired<MachineDominatorTree>(); + 
AU.addPreserved<MachineDominatorTree>(); + } + MachineFunctionPass::getAnalysisUsage(AU); +} + +AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone) + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { + initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); +} + +bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + auto *TPC = &getAnalysis<TargetPassConfig>(); + const Function &F = MF.getFunction(); + bool EnableOpt = + MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); + MachineDominatorTree *MDT = + IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize(), KB, MDT); + Combiner C(PCInfo, TPC); + return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); +} + +char AMDGPUPreLegalizerCombiner::ID = 0; +INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, + "Combine AMDGPU machine instrs before legalization", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) +INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, + "Combine AMDGPU machine instrs before legalization", false, + false) + +namespace llvm { +FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) { + return new AMDGPUPreLegalizerCombiner(IsOptNone); +} +} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index 511de96b5f7cb..524a34be876ff 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -218,10 +218,10 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( // if (ArgSize % DWORD_ALIGN != 0) { llvm::Type *ResType = llvm::Type::getInt32Ty(Ctx); - VectorType *LLVMVecType = llvm::dyn_cast<llvm::VectorType>(ArgType); + auto *LLVMVecType = llvm::dyn_cast<llvm::FixedVectorType>(ArgType); int NumElem = LLVMVecType ? LLVMVecType->getNumElements() : 1; if (LLVMVecType && NumElem > 1) - ResType = llvm::VectorType::get(ResType, NumElem); + ResType = llvm::FixedVectorType::get(ResType, NumElem); Builder.SetInsertPoint(CI); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); if (OpConvSpecifiers[ArgCount - 1] == 'x' || @@ -387,9 +387,7 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( Value *id_gep_cast = new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", Brnch); - StoreInst *stbuff = - new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast); - stbuff->insertBefore(Brnch); // to Remove unused variable warning + new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast, Brnch); SmallVector<Value *, 2> FourthIdxList; ConstantInt *fourInt = @@ -408,8 +406,7 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( Value *Arg = CI->getArgOperand(ArgCount); Type *ArgType = Arg->getType(); SmallVector<Value *, 32> WhatToStore; - if (ArgType->isFPOrFPVectorTy() && - (ArgType->getTypeID() != Type::VectorTyID)) { + if (ArgType->isFPOrFPVectorTy() && !isa<VectorType>(ArgType)) { Type *IType = (ArgType->isFloatTy()) ? 
Int32Ty : Int64Ty; if (OpConvSpecifiers[ArgCount - 1] == 'f') { ConstantFP *fpCons = dyn_cast<ConstantFP>(Arg); @@ -478,18 +475,14 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( Arg = new PtrToIntInst(Arg, DstType, "PrintArgPtr", Brnch); WhatToStore.push_back(Arg); } - } else if (ArgType->getTypeID() == Type::VectorTyID) { + } else if (isa<FixedVectorType>(ArgType)) { Type *IType = NULL; - uint32_t EleCount = cast<VectorType>(ArgType)->getNumElements(); + uint32_t EleCount = cast<FixedVectorType>(ArgType)->getNumElements(); uint32_t EleSize = ArgType->getScalarSizeInBits(); uint32_t TotalSize = EleCount * EleSize; if (EleCount == 3) { - IntegerType *Int32Ty = Type::getInt32Ty(ArgType->getContext()); - Constant *Indices[4] = { - ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 1), - ConstantInt::get(Int32Ty, 2), ConstantInt::get(Int32Ty, 2)}; - Constant *Mask = ConstantVector::get(Indices); - ShuffleVectorInst *Shuffle = new ShuffleVectorInst(Arg, Arg, Mask); + ShuffleVectorInst *Shuffle = + new ShuffleVectorInst(Arg, Arg, ArrayRef<int>{0, 1, 2, 2}); Shuffle->insertBefore(Brnch); Arg = Shuffle; ArgType = Arg->getType(); @@ -523,7 +516,7 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( break; } if (EleCount > 1) { - IType = dyn_cast<Type>(VectorType::get(IType, EleCount)); + IType = FixedVectorType::get(IType, EleCount); } Arg = new BitCastInst(Arg, IType, "PrintArgVect", Brnch); WhatToStore.push_back(Arg); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 14958a180ce3f..727f71b350490 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -76,6 +76,11 @@ static cl::opt<bool> DisablePromoteAllocaToLDS( cl::desc("Disable promote alloca to LDS"), cl::init(false)); +static cl::opt<unsigned> PromoteAllocaToVectorLimit( + "amdgpu-promote-alloca-to-vector-limit", + cl::desc("Maximum byte size to consider promote alloca to vector"), + cl::init(0)); + // FIXME: This can create globals so should be a module pass. class AMDGPUPromoteAlloca : public FunctionPass { private: @@ -86,6 +91,7 @@ private: // FIXME: This should be per-kernel. 
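A short illustration of the {0, 1, 2, 2} shuffle mask used above when lowering a 3-element printf vector argument: the value is widened to 4 lanes by repeating element 2, presumably so the later bitcast and store operate on a power-of-two-sized value. A trivial standalone C++ equivalent of that lane selection (illustrative only, not code from the commit):

#include <array>
#include <cassert>

// Lane permutation performed by shuffling V with itself under mask {0,1,2,2}.
static std::array<float, 4> widenV3(const std::array<float, 3> &V) {
  return {V[0], V[1], V[2], V[2]};
}

int main() {
  std::array<float, 3> In = {1.0f, 2.0f, 3.0f};
  std::array<float, 4> Out = widenV3(In);
  assert(Out[2] == 3.0f && Out[3] == 3.0f); // mask index 2 used twice
  return 0;
}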
uint32_t LocalMemLimit = 0; uint32_t CurrentLocalMemUsage = 0; + unsigned MaxVGPRs; bool IsAMDGCN = false; bool IsAMDHSA = false; @@ -128,14 +134,42 @@ public: } }; +class AMDGPUPromoteAllocaToVector : public FunctionPass { +private: + unsigned MaxVGPRs; + +public: + static char ID; + + AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override; + + StringRef getPassName() const override { + return "AMDGPU Promote Alloca to vector"; + } + + bool handleAlloca(AllocaInst &I); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } +}; + } // end anonymous namespace char AMDGPUPromoteAlloca::ID = 0; +char AMDGPUPromoteAllocaToVector::ID = 0; INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE, "AMDGPU promote alloca to vector or LDS", false, false) +INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector", + "AMDGPU promote alloca to vector", false, false) + char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID; +char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID; bool AMDGPUPromoteAlloca::doInitialization(Module &M) { Mod = &M; @@ -161,6 +195,13 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { if (!ST.isPromoteAllocaEnabled()) return false; + if (IsAMDGCN) { + const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); + MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); + } else { + MaxVGPRs = 128; + } + bool SufficientLDS = hasSufficientLocalMem(F); bool Changed = false; BasicBlock &EntryBB = *F.begin(); @@ -251,10 +292,10 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { // 32-bit and extract sequence is already present, and it is probably easier // to CSE this. The loads should be mergable later anyway. Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1); - LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, 4); + LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4)); Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2); - LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, 4); + LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4)); MDNode *MD = MDNode::get(Mod->getContext(), None); LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD); @@ -297,15 +338,26 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { return CI; } -static VectorType *arrayTypeToVecType(ArrayType *ArrayTy) { - return VectorType::get(ArrayTy->getElementType(), - ArrayTy->getNumElements()); +static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) { + return FixedVectorType::get(ArrayTy->getElementType(), + ArrayTy->getNumElements()); +} + +static Value *stripBitcasts(Value *V) { + while (Instruction *I = dyn_cast<Instruction>(V)) { + if (I->getOpcode() != Instruction::BitCast) + break; + V = I->getOperand(0); + } + return V; } static Value * calculateVectorIndex(Value *Ptr, const std::map<GetElementPtrInst *, Value *> &GEPIdx) { - GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr); + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr)); + if (!GEP) + return nullptr; auto I = GEPIdx.find(GEP); return I == GEPIdx.end() ? nullptr : I->second; @@ -327,7 +379,8 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { // // TODO: Check isTriviallyVectorizable for calls and handle other // instructions. 
-static bool canVectorizeInst(Instruction *Inst, User *User) { +static bool canVectorizeInst(Instruction *Inst, User *User, + const DataLayout &DL) { switch (Inst->getOpcode()) { case Instruction::Load: { // Currently only handle the case where the Pointer Operand is a GEP. @@ -337,7 +390,14 @@ static bool canVectorizeInst(Instruction *Inst, User *User) { LI->getPointerOperandType() == User->getType() && isa<VectorType>(LI->getType())) return true; - return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple(); + + Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand()); + if (!PtrInst) + return false; + + return (PtrInst->getOpcode() == Instruction::GetElementPtr || + PtrInst->getOpcode() == Instruction::BitCast) && + LI->isSimple(); } case Instruction::BitCast: return true; @@ -350,22 +410,46 @@ static bool canVectorizeInst(Instruction *Inst, User *User) { SI->getPointerOperandType() == User->getType() && isa<VectorType>(SI->getValueOperand()->getType())) return true; - return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple(); + + Instruction *UserInst = dyn_cast<Instruction>(User); + if (!UserInst) + return false; + + return (SI->getPointerOperand() == User) && + (UserInst->getOpcode() == Instruction::GetElementPtr || + UserInst->getOpcode() == Instruction::BitCast) && + SI->isSimple(); } default: return false; } } -static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { +static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, + unsigned MaxVGPRs) { if (DisablePromoteAllocaToVector) { LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n"); return false; } - Type *AT = Alloca->getAllocatedType(); - SequentialType *AllocaTy = dyn_cast<SequentialType>(AT); + Type *AllocaTy = Alloca->getAllocatedType(); + auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy); + if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) { + if (VectorType::isValidElementType(ArrayTy->getElementType()) && + ArrayTy->getNumElements() > 0) + VectorTy = arrayTypeToVecType(ArrayTy); + } + + // Use up to 1/4 of available register budget for vectorization. + unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8 + : (MaxVGPRs * 32); + + if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) { + LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with " + << MaxVGPRs << " registers available\n"); + return false; + } LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n"); @@ -373,22 +457,44 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { // are just being conservative for now. // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. 
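The new size gate in tryPromoteAllocaToVector above lets an alloca claim at most a quarter of the register budget: with 32 bits per VGPR lane the default limit is MaxVGPRs * 32 bits, and the amdgpu-promote-alloca-to-vector-limit option added earlier in this file overrides it with a byte count. A standalone sketch of the check (not LLVM code; the helper name is invented):

#include <cassert>
#include <cstdint>

// Mirrors the budget test: the alloca is rejected when four times its size in
// bits exceeds the limit, i.e. it may occupy at most 1/4 of the registers.
static bool fitsVectorBudget(uint64_t AllocaSizeBits, unsigned MaxVGPRs,
                             unsigned LimitBytesOverride /*0 = use VGPRs*/) {
  uint64_t LimitBits = LimitBytesOverride
                           ? uint64_t(LimitBytesOverride) * 8
                           : uint64_t(MaxVGPRs) * 32;
  return AllocaSizeBits * 4 <= LimitBits;
}

int main() {
  // [16 x i32] is 512 bits; with 64 VGPRs (2048 bits) it uses exactly 1/4.
  assert(fitsVectorBudget(512, 64, 0));
  // [32 x i32] (1024 bits) would claim half the budget and is rejected.
  assert(!fitsVectorBudget(1024, 64, 0));
  // An explicit 256-byte override admits anything up to 512 bits.
  assert(fitsVectorBudget(512, 0, 256));
  return 0;
}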
Potentially these // could also be promoted but we don't currently handle this case - if (!AllocaTy || - AllocaTy->getNumElements() > 16 || - AllocaTy->getNumElements() < 2 || - !VectorType::isValidElementType(AllocaTy->getElementType())) { + if (!VectorTy || VectorTy->getNumElements() > 16 || + VectorTy->getNumElements() < 2) { LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); return false; } std::map<GetElementPtrInst*, Value*> GEPVectorIdx; - std::vector<Value*> WorkList; - for (User *AllocaUser : Alloca->users()) { + std::vector<Value *> WorkList; + SmallVector<User *, 8> Users(Alloca->users()); + SmallVector<User *, 8> UseUsers(Users.size(), Alloca); + Type *VecEltTy = VectorTy->getElementType(); + while (!Users.empty()) { + User *AllocaUser = Users.pop_back_val(); + User *UseUser = UseUsers.pop_back_val(); + Instruction *Inst = dyn_cast<Instruction>(AllocaUser); + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser); if (!GEP) { - if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca)) + if (!canVectorizeInst(Inst, UseUser, DL)) return false; + if (Inst->getOpcode() == Instruction::BitCast) { + Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType(); + Type *ToTy = Inst->getType()->getPointerElementType(); + if (FromTy->isAggregateType() || ToTy->isAggregateType() || + DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy)) + continue; + + for (User *CastUser : Inst->users()) { + if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser))) + continue; + Users.push_back(CastUser); + UseUsers.push_back(Inst); + } + + continue; + } + WorkList.push_back(AllocaUser); continue; } @@ -404,18 +510,10 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { } GEPVectorIdx[GEP] = Index; - for (User *GEPUser : AllocaUser->users()) { - if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser)) - return false; - - WorkList.push_back(GEPUser); - } + Users.append(GEP->user_begin(), GEP->user_end()); + UseUsers.append(GEP->getNumUses(), GEP); } - VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy); - if (!VectorTy) - VectorTy = arrayTypeToVecType(cast<ArrayType>(AllocaTy)); - LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " << *VectorTy << '\n'); @@ -424,40 +522,46 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { IRBuilder<> Builder(Inst); switch (Inst->getOpcode()) { case Instruction::Load: { - if (Inst->getType() == AT) + if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy()) break; - Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand(); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + if (!Index) + break; + Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(VectorTy, BitCast); Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); + if (Inst->getType() != VecEltTy) + ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType()); Inst->replaceAllUsesWith(ExtractElement); Inst->eraseFromParent(); break; } case Instruction::Store: { StoreInst *SI = cast<StoreInst>(Inst); - if (SI->getValueOperand()->getType() == AT) + if (SI->getValueOperand()->getType() == AllocaTy || + SI->getValueOperand()->getType()->isVectorTy()) break; - Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); Value *Ptr = SI->getPointerOperand(); Value *Index = calculateVectorIndex(Ptr, 
GEPVectorIdx); + if (!Index) + break; + + Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(VectorTy, BitCast); - Value *NewVecValue = Builder.CreateInsertElement(VecValue, - SI->getValueOperand(), - Index); + Value *Elt = SI->getValueOperand(); + if (Elt->getType() != VecEltTy) + Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy); + Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index); Builder.CreateStore(NewVecValue, BitCast); Inst->eraseFromParent(); break; } - case Instruction::BitCast: - case Instruction::AddrSpaceCast: - break; default: llvm_unreachable("Inconsistency in instructions promotable to vector"); @@ -659,16 +763,15 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { continue; if (Use->getParent()->getParent() == &F) { - unsigned Align = GV.getAlignment(); - if (Align == 0) - Align = DL.getABITypeAlignment(GV.getValueType()); + Align Alignment = + DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType()); // FIXME: Try to account for padding here. The padding is currently // determined from the inverse order of uses in the function. I'm not // sure if the use list order is in any way connected to this, so the // total reported size is likely incorrect. uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); - CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align); + CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alignment); CurrentLocalMemUsage += AllocSize; break; } @@ -722,6 +825,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { if (!I.isStaticAlloca() || I.isArrayAllocation()) return false; + const DataLayout &DL = Mod->getDataLayout(); IRBuilder<> Builder(&I); // First try to replace the alloca with a vector @@ -729,7 +833,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); - if (tryPromoteAllocaToVector(&I)) + if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs)) return true; // Promoted to vector. if (DisablePromoteAllocaToLDS) @@ -759,11 +863,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; - const DataLayout &DL = Mod->getDataLayout(); - - unsigned Align = I.getAlignment(); - if (Align == 0) - Align = DL.getABITypeAlignment(I.getAllocatedType()); + Align Alignment = + DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType()); // FIXME: This computed padding is likely wrong since it depends on inverse // usage order. @@ -771,7 +872,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { // FIXME: It is also possible that if we're allowed to use all of the memory // could could end up using more than the maximum due to alignment padding. 
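The register-budget gate added in tryPromoteAllocaToVector above (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit, with Limit defaulting to MaxVGPRs * 32) amounts to letting an alloca occupy at most a quarter of the VGPR bit budget. A small standalone check of that arithmetic, using an assumed MaxVGPRs of 256 rather than a value taken from the patch:

#include <cstdint>
// Mirrors the check above: Limit = MaxVGPRs * 32 bits, and the alloca is
// rejected when its size times 4 exceeds it, i.e. when it would need more
// than MaxVGPRs / 4 32-bit registers per lane.
constexpr bool rejectedForVectorization(uint64_t AllocaBits,
                                        unsigned MaxVGPRs = 256) {
  const uint64_t LimitBits = uint64_t(MaxVGPRs) * 32; // 8192 for 256 VGPRs
  return AllocaBits * 4 > LimitBits;                  // budget is 64 VGPRs here
}
static_assert(!rejectedForVectorization(16 * 32),  // [16 x i32] = 512 bits
              "fits within a quarter of the budget");
static_assert(rejectedForVectorization(128 * 32),  // [128 x i32] = 4096 bits
              "would need 128 VGPRs, more than 256 / 4");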
- uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align); + uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment); uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy); NewSize += AllocSize; @@ -938,6 +1039,60 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { return true; } +bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) { + if (skipFunction(F) || DisablePromoteAllocaToVector) + return false; + + const TargetMachine *TM; + if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) + TM = &TPC->getTM<TargetMachine>(); + else + return false; + + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); + if (!ST.isPromoteAllocaEnabled()) + return false; + + if (TM->getTargetTriple().getArch() == Triple::amdgcn) { + const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); + MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); + } else { + MaxVGPRs = 128; + } + + bool Changed = false; + BasicBlock &EntryBB = *F.begin(); + + SmallVector<AllocaInst *, 16> Allocas; + for (Instruction &I : EntryBB) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) + Allocas.push_back(AI); + } + + for (AllocaInst *AI : Allocas) { + if (handleAlloca(*AI)) + Changed = true; + } + + return Changed; +} + +bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) { + // Array allocations are probably not worth handling, since an allocation of + // the array type is the canonical form. + if (!I.isStaticAlloca() || I.isArrayAllocation()) + return false; + + LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); + + Module *Mod = I.getParent()->getParent()->getParent(); + return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs); +} + FunctionPass *llvm::createAMDGPUPromoteAlloca() { return new AMDGPUPromoteAlloca(); } + +FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() { + return new AMDGPUPromoteAllocaToVector(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp index 7a7addd0f5cfe..982aae3748849 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp @@ -48,19 +48,62 @@ extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1]; namespace { +// Target features to propagate. +static constexpr const FeatureBitset TargetFeatures = { + AMDGPU::FeatureWavefrontSize16, + AMDGPU::FeatureWavefrontSize32, + AMDGPU::FeatureWavefrontSize64 +}; + +// Attributes to propagate. 
+static constexpr const char* AttributeNames[] = { + "amdgpu-waves-per-eu" +}; + +static constexpr unsigned NumAttr = + sizeof(AttributeNames) / sizeof(AttributeNames[0]); + class AMDGPUPropagateAttributes { - const FeatureBitset TargetFeatures = { - AMDGPU::FeatureWavefrontSize16, - AMDGPU::FeatureWavefrontSize32, - AMDGPU::FeatureWavefrontSize64 + + class FnProperties { + private: + explicit FnProperties(const FeatureBitset &&FB) : Features(FB) {} + + public: + explicit FnProperties(const TargetMachine &TM, const Function &F) { + Features = TM.getSubtargetImpl(F)->getFeatureBits(); + + for (unsigned I = 0; I < NumAttr; ++I) + if (F.hasFnAttribute(AttributeNames[I])) + Attributes[I] = F.getFnAttribute(AttributeNames[I]); + } + + bool operator == (const FnProperties &Other) const { + if ((Features & TargetFeatures) != (Other.Features & TargetFeatures)) + return false; + for (unsigned I = 0; I < NumAttr; ++I) + if (Attributes[I] != Other.Attributes[I]) + return false; + return true; + } + + FnProperties adjustToCaller(const FnProperties &CallerProps) const { + FnProperties New((Features & ~TargetFeatures) | CallerProps.Features); + for (unsigned I = 0; I < NumAttr; ++I) + New.Attributes[I] = CallerProps.Attributes[I]; + return New; + } + + FeatureBitset Features; + Optional<Attribute> Attributes[NumAttr]; }; - class Clone{ + class Clone { public: - Clone(FeatureBitset FeatureMask, Function *OrigF, Function *NewF) : - FeatureMask(FeatureMask), OrigF(OrigF), NewF(NewF) {} + Clone(const FnProperties &Props, Function *OrigF, Function *NewF) : + Properties(Props), OrigF(OrigF), NewF(NewF) {} - FeatureBitset FeatureMask; + FnProperties Properties; Function *OrigF; Function *NewF; }; @@ -77,17 +120,19 @@ class AMDGPUPropagateAttributes { SmallVector<Clone, 32> Clones; // Find a clone with required features. - Function *findFunction(const FeatureBitset &FeaturesNeeded, + Function *findFunction(const FnProperties &PropsNeeded, Function *OrigF); - // Clone function F and set NewFeatures on the clone. + // Clone function \p F and set \p NewProps on the clone. // Cole takes the name of original function. - Function *cloneWithFeatures(Function &F, - const FeatureBitset &NewFeatures); + Function *cloneWithProperties(Function &F, const FnProperties &NewProps); // Set new function's features in place. void setFeatures(Function &F, const FeatureBitset &NewFeatures); + // Set new function's attributes in place. + void setAttributes(Function &F, const ArrayRef<Optional<Attribute>> NewAttrs); + std::string getFeatureString(const FeatureBitset &Features) const; // Propagate attributes from Roots. @@ -155,11 +200,11 @@ INITIALIZE_PASS(AMDGPUPropagateAttributesLate, false, false) Function * -AMDGPUPropagateAttributes::findFunction(const FeatureBitset &FeaturesNeeded, +AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded, Function *OrigF) { // TODO: search for clone's clones. 
for (Clone &C : Clones) - if (C.OrigF == OrigF && FeaturesNeeded == C.FeatureMask) + if (C.OrigF == OrigF && PropsNeeded == C.Properties) return C.NewF; return nullptr; @@ -192,12 +237,12 @@ bool AMDGPUPropagateAttributes::process() { NewRoots.clear(); for (auto &F : M.functions()) { - if (F.isDeclaration() || Roots.count(&F) || Roots.count(&F)) + if (F.isDeclaration()) continue; - const FeatureBitset &CalleeBits = - TM->getSubtargetImpl(F)->getFeatureBits(); + const FnProperties CalleeProps(*TM, F); SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace; + SmallSet<CallBase *, 32> Visited; for (User *U : F.users()) { Instruction *I = dyn_cast<Instruction>(U); @@ -207,36 +252,36 @@ bool AMDGPUPropagateAttributes::process() { if (!CI) continue; Function *Caller = CI->getCaller(); - if (!Caller) + if (!Caller || !Visited.insert(CI).second) continue; - if (!Roots.count(Caller)) + if (!Roots.count(Caller) && !NewRoots.count(Caller)) continue; - const FeatureBitset &CallerBits = - TM->getSubtargetImpl(*Caller)->getFeatureBits() & TargetFeatures; + const FnProperties CallerProps(*TM, *Caller); - if (CallerBits == (CalleeBits & TargetFeatures)) { - NewRoots.insert(&F); + if (CalleeProps == CallerProps) { + if (!Roots.count(&F)) + NewRoots.insert(&F); continue; } - Function *NewF = findFunction(CallerBits, &F); + Function *NewF = findFunction(CallerProps, &F); if (!NewF) { - FeatureBitset NewFeatures((CalleeBits & ~TargetFeatures) | - CallerBits); + const FnProperties NewProps = CalleeProps.adjustToCaller(CallerProps); if (!AllowClone) { // This may set different features on different iteartions if // there is a contradiction in callers' attributes. In this case // we rely on a second pass running on Module, which is allowed // to clone. - setFeatures(F, NewFeatures); + setFeatures(F, NewProps.Features); + setAttributes(F, NewProps.Attributes); NewRoots.insert(&F); Changed = true; break; } - NewF = cloneWithFeatures(F, NewFeatures); - Clones.push_back(Clone(CallerBits, &F, NewF)); + NewF = cloneWithProperties(F, NewProps); + Clones.push_back(Clone(CallerProps, &F, NewF)); NewRoots.insert(NewF); } @@ -258,28 +303,30 @@ bool AMDGPUPropagateAttributes::process() { F->eraseFromParent(); } + Roots.clear(); + Clones.clear(); + return Changed; } Function * -AMDGPUPropagateAttributes::cloneWithFeatures(Function &F, - const FeatureBitset &NewFeatures) { +AMDGPUPropagateAttributes::cloneWithProperties(Function &F, + const FnProperties &NewProps) { LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n'); ValueToValueMapTy dummy; Function *NewF = CloneFunction(&F, dummy); - setFeatures(*NewF, NewFeatures); + setFeatures(*NewF, NewProps.Features); + setAttributes(*NewF, NewProps.Attributes); + NewF->setVisibility(GlobalValue::DefaultVisibility); + NewF->setLinkage(GlobalValue::InternalLinkage); // Swap names. If that is the only clone it will retain the name of now - // dead value. - if (F.hasName()) { - std::string NewName = NewF->getName(); + // dead value. Preserve original name for externally visible functions. + if (F.hasName() && F.hasLocalLinkage()) { + std::string NewName = std::string(NewF->getName()); NewF->takeName(&F); F.setName(NewName); - - // Name has changed, it does not need an external symbol. 
- F.setVisibility(GlobalValue::DefaultVisibility); - F.setLinkage(GlobalValue::InternalLinkage); } return NewF; @@ -297,6 +344,18 @@ void AMDGPUPropagateAttributes::setFeatures(Function &F, F.addFnAttr("target-features", NewFeatureStr); } +void AMDGPUPropagateAttributes::setAttributes(Function &F, + const ArrayRef<Optional<Attribute>> NewAttrs) { + LLVM_DEBUG(dbgs() << "Set attributes on " << F.getName() << ":\n"); + for (unsigned I = 0; I < NumAttr; ++I) { + F.removeFnAttr(AttributeNames[I]); + if (NewAttrs[I]) { + LLVM_DEBUG(dbgs() << '\t' << NewAttrs[I]->getAsString() << '\n'); + F.addFnAttr(*NewAttrs[I]); + } + } +} + std::string AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp new file mode 100644 index 0000000000000..71d82679b3ff1 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -0,0 +1,154 @@ +//=== lib/CodeGen/GlobalISel/AMDGPURegBankCombiner.cpp ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass does combining of machine instructions at the generic MI level, +// after register banks are known. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "AMDGPULegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/Debug.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" + +#define DEBUG_TYPE "amdgpu-regbank-combiner" + +using namespace llvm; +using namespace MIPatternMatch; + + +#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AMDGPUGenRegBankGICombiner.inc" +#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS + +namespace { +#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AMDGPUGenRegBankGICombiner.inc" +#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H + +class AMDGPURegBankCombinerInfo : public CombinerInfo { + GISelKnownBits *KB; + MachineDominatorTree *MDT; + +public: + AMDGPUGenRegBankCombinerHelperRuleConfig GeneratedRuleCfg; + + AMDGPURegBankCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, + const AMDGPULegalizerInfo *LI, + GISelKnownBits *KB, MachineDominatorTree *MDT) + : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, + /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize), + KB(KB), MDT(MDT) { + if (!GeneratedRuleCfg.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + } + + bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; +}; + +bool AMDGPURegBankCombinerInfo::combine(GISelChangeObserver &Observer, + MachineInstr &MI, + MachineIRBuilder &B) const { + CombinerHelper Helper(Observer, B, KB, MDT); + AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg); + + if (Generated.tryCombineAll(Observer, MI, B, Helper)) + return true; + + 
return false; +} + +#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "AMDGPUGenRegBankGICombiner.inc" +#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP + +// Pass boilerplate +// ================ + +class AMDGPURegBankCombiner : public MachineFunctionPass { +public: + static char ID; + + AMDGPURegBankCombiner(bool IsOptNone = false); + + StringRef getPassName() const override { + return "AMDGPURegBankCombiner"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; +private: + bool IsOptNone; +}; +} // end anonymous namespace + +void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + getSelectionDAGFallbackAnalysisUsage(AU); + AU.addRequired<GISelKnownBitsAnalysis>(); + AU.addPreserved<GISelKnownBitsAnalysis>(); + if (!IsOptNone) { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + } + MachineFunctionPass::getAnalysisUsage(AU); +} + +AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone) + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { + initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry()); +} + +bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + auto *TPC = &getAnalysis<TargetPassConfig>(); + const Function &F = MF.getFunction(); + bool EnableOpt = + MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const AMDGPULegalizerInfo *LI + = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo()); + + GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); + MachineDominatorTree *MDT = + IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + AMDGPURegBankCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize(), LI, KB, MDT); + Combiner C(PCInfo, TPC); + return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); +} + +char AMDGPURegBankCombiner::ID = 0; +INITIALIZE_PASS_BEGIN(AMDGPURegBankCombiner, DEBUG_TYPE, + "Combine AMDGPU machine instrs after regbankselect", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) +INITIALIZE_PASS_END(AMDGPURegBankCombiner, DEBUG_TYPE, + "Combine AMDGPU machine instrs after regbankselect", false, + false) + +namespace llvm { +FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone) { + return new AMDGPURegBankCombiner(IsOptNone); +} +} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 1bb01dc8fa112..dfaf97bfb08e7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -8,10 +8,69 @@ /// \file /// This file implements the targeting of the RegisterBankInfo class for /// AMDGPU. -/// \todo This should be generated by TableGen. +/// +/// \par +/// +/// AMDGPU has unique register bank constraints that require special high level +/// strategies to deal with. There are two main true physical register banks +/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a +/// sort of pseudo-register bank needed to represent SGPRs used in a vector +/// boolean context. 
There is also the AGPR bank, which is a special purpose +/// physical register bank present on some subtargets. +/// +/// Copying from VGPR to SGPR is generally illegal, unless the value is known to +/// be uniform. It is generally not valid to legalize operands by inserting +/// copies as on other targets. Operations which require uniform, SGPR operands +/// generally require scalarization by repeatedly executing the instruction, +/// activating each set of lanes using a unique set of input values. This is +/// referred to as a waterfall loop. +/// +/// \par Booleans +/// +/// Booleans (s1 values) requires special consideration. A vector compare result +/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit +/// register. These are represented with the VCC bank. During selection, we need +/// to be able to unambiguously go back from a register class to a register +/// bank. To distinguish whether an SGPR should use the SGPR or VCC register +/// bank, we need to know the use context type. An SGPR s1 value always means a +/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets +/// SCC, which is a 1-bit unaddressable register. This will need to be copied to +/// a 32-bit virtual register. Taken together, this means we need to adjust the +/// type of boolean operations to be regbank legal. All SALU booleans need to be +/// widened to 32-bits, and all VALU booleans need to be s1 values. +/// +/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact +/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc +/// bank. A non-boolean source (such as a truncate from a 1-bit load from +/// memory) will require a copy to the VCC bank which will require clearing the +/// high bits and inserting a compare. +/// +/// \par Constant bus restriction +/// +/// VALU instructions have a limitation known as the constant bus +/// restriction. Most VALU instructions can use SGPR operands, but may read at +/// most 1 SGPR or constant literal value (this to 2 in gfx10 for most +/// instructions). This is one unique SGPR, so the same SGPR may be used for +/// multiple operands. From a register bank perspective, any combination of +/// operands should be legal as an SGPR, but this is contextually dependent on +/// the SGPR operands all being the same register. There is therefore optimal to +/// choose the SGPR with the most uses to minimize the number of copies. +/// +/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_* +/// operation should have its source operands all mapped to VGPRs (except for +/// VCC), inserting copies from any SGPR operands. This the most trival legal +/// mapping. Anything beyond the simplest 1:1 instruction selection would be too +/// complicated to solve here. Every optimization pattern or instruction +/// selected to multiple outputs would have to enforce this rule, and there +/// would be additional complexity in tracking this rule for every G_* +/// operation. By forcing all inputs to VGPRs, it also simplifies the task of +/// picking the optimal operand combination from a post-isel optimization pass. 
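To make the closing point of this comment concrete, here is a minimal sketch of the "copy every SGPR input to a VGPR" mapping strategy it describes. It is illustrative only, not code from this patch: the function, its parameters, and the assumption that a MachineIRBuilder and MachineRegisterInfo are supplied by the caller are all invented for the example.

#include "AMDGPURegisterBankInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

// Map an s32 G_ADD with two SGPR inputs the "trivial legal" way: one fresh
// VGPR copy per SGPR source, so the VALU operation never has to reason about
// how many distinct SGPRs (constant bus reads) it would otherwise consume.
static void mapAddToVALU(llvm::MachineIRBuilder &B,
                         llvm::MachineRegisterInfo &MRI, llvm::Register Dst,
                         llvm::Register SgprLHS, llvm::Register SgprRHS) {
  using namespace llvm;
  const LLT S32 = LLT::scalar(32);
  Register VgprLHS = MRI.createGenericVirtualRegister(S32);
  Register VgprRHS = MRI.createGenericVirtualRegister(S32);
  MRI.setRegBank(VgprLHS, AMDGPU::VGPRRegBank);
  MRI.setRegBank(VgprRHS, AMDGPU::VGPRRegBank);
  B.buildCopy(VgprLHS, SgprLHS);
  B.buildCopy(VgprRHS, SgprRHS);
  B.buildAdd(Dst, VgprLHS, VgprRHS); // Dst is expected to be VGPR-banked.
}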
+/// //===----------------------------------------------------------------------===// #include "AMDGPURegisterBankInfo.h" + +#include "AMDGPUGlobalISelUtils.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -19,8 +78,8 @@ #include "SIRegisterInfo.h" #include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -101,8 +160,9 @@ public: if (!Op.isReg()) continue; + // We may see physical registers if building a real MI Register Reg = Op.getReg(); - if (MRI.getRegClassOrRegBank(Reg)) + if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg)) continue; const RegisterBank *RB = NewBank; @@ -138,15 +198,16 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) TII(Subtarget.getInstrInfo()) { // HACK: Until this is fully tablegen'd. - static bool AlreadyInit = false; - if (AlreadyInit) - return; + static llvm::once_flag InitializeRegisterBankFlag; - AlreadyInit = true; + static auto InitializeRegisterBankOnce = [this]() { + assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank && + &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank && + &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank); + (void)this; + }; - assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank && - &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank && - &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank); + llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce); } static bool isVectorRegisterBank(const RegisterBank &Bank) { @@ -159,7 +220,7 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, unsigned Size) const { // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? if (Dst.getID() == AMDGPU::SGPRRegBankID && - isVectorRegisterBank(Src)) { + (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) { return std::numeric_limits<unsigned>::max(); } @@ -177,9 +238,6 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, Src.getID() == AMDGPU::VCCRegBankID)) return std::numeric_limits<unsigned>::max(); - if (Src.getID() == AMDGPU::VCCRegBankID) - return std::numeric_limits<unsigned>::max(); - // There is no direct copy between AGPRs. if (Dst.getID() == AMDGPU::AGPRRegBankID && Src.getID() == AMDGPU::AGPRRegBankID) @@ -317,22 +375,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( const MachineInstr &MI, const MachineRegisterInfo &MRI) const { switch (MI.getIntrinsicID()) { - case Intrinsic::amdgcn_buffer_load: { - static const OpRegBankEntry<3> Table[4] = { - // Perfectly legal. - { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, - { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, - - // Waterfall loop needed for rsrc. In the worst case this will execute - // approximately an extra 10 * wavesize + 2 instructions. 
- { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, - { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 } - }; - - // rsrc, voffset, offset - const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } }; - return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); - } case Intrinsic::amdgcn_s_buffer_load: { static const OpRegBankEntry<2> Table[4] = { // Perfectly legal. @@ -402,15 +444,15 @@ static bool isScalarLoadLegal(const MachineInstr &MI) { AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; // There are no extending SMRD/SMEM loads, and they require 4-byte alignment. - return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 && - // Can't do a scalar atomic load. - !MMO->isAtomic() && - // Don't use scalar loads for volatile accesses to non-constant address - // spaces. - (IsConst || !MMO->isVolatile()) && - // Memory must be known constant, or not written before this load. - (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) && - AMDGPUInstrInfo::isUniformMMO(MMO); + return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) && + // Can't do a scalar atomic load. + !MMO->isAtomic() && + // Don't use scalar loads for volatile accesses to non-constant address + // spaces. + (IsConst || !MMO->isVolatile()) && + // Memory must be known constant, or not written before this load. + (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) && + AMDGPUInstrInfo::isUniformMMO(MMO); } RegisterBankInfo::InstructionMappings @@ -490,24 +532,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), 3); // Num Operands AltMappings.push_back(&VVMapping); - - const InstructionMapping &SVMapping = getInstructionMapping( - 3, 3, getOperandsMapping( - {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), - AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size), - AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), - 3); // Num Operands - AltMappings.push_back(&SVMapping); - - // SGPR in LHS is slightly preferrable, so make it VS more expensive than - // SV. 
- const InstructionMapping &VSMapping = getInstructionMapping( - 3, 4, getOperandsMapping( - {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), - AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), - AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}), - 3); // Num Operands - AltMappings.push_back(&VSMapping); break; } case TargetOpcode::G_LOAD: @@ -517,7 +541,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); unsigned PtrSize = PtrTy.getSizeInBits(); unsigned AS = PtrTy.getAddressSpace(); - LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && AS != AMDGPUAS::PRIVATE_ADDRESS) && @@ -531,9 +554,10 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( } const InstructionMapping &VVMapping = getInstructionMapping( - 2, 1, getOperandsMapping( - {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy), - AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}), + 2, 1, + getOperandsMapping( + {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}), 2); // Num Operands AltMappings.push_back(&VVMapping); @@ -546,43 +570,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( return AltMappings; } - case TargetOpcode::G_ICMP: { - // TODO: Should report 32-bit for scalar output type. - unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); - const InstructionMapping &SSMapping = getInstructionMapping(1, 1, - getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), - nullptr, // Predicate operand. - AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), - AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), - 4); // Num Operands - AltMappings.push_back(&SSMapping); - - const InstructionMapping &SVMapping = getInstructionMapping(2, 1, - getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), - nullptr, // Predicate operand. - AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), - AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}), - 4); // Num Operands - AltMappings.push_back(&SVMapping); - - const InstructionMapping &VSMapping = getInstructionMapping(3, 1, - getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), - nullptr, // Predicate operand. - AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), - AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), - 4); // Num Operands - AltMappings.push_back(&VSMapping); - - const InstructionMapping &VVMapping = getInstructionMapping(4, 1, - getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), - nullptr, // Predicate operand. 
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), - AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}), - 4); // Num Operands - AltMappings.push_back(&VVMapping); - - return AltMappings; - } case TargetOpcode::G_SELECT: { unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); const InstructionMapping &SSMapping = getInstructionMapping(1, 1, @@ -607,10 +594,8 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( case TargetOpcode::G_SMAX: case TargetOpcode::G_UMIN: case TargetOpcode::G_UMAX: { - static const OpRegBankEntry<3> Table[4] = { + static const OpRegBankEntry<3> Table[2] = { { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, - { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, - { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, // Scalar requires cmp+select, and extends if 16-bit. // FIXME: Should there be separate costs for 32 and 16-bit @@ -740,6 +725,10 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( SmallVector<Register, 4> InitResultRegs; SmallVector<Register, 4> PhiRegs; + // Track use registers which have already been expanded with a readfirstlane + // sequence. This may have multiple uses if moving a sequence. + DenseMap<Register, Register> WaterfalledRegMap; + MachineBasicBlock &MBB = B.getMBB(); MachineFunction *MF = &B.getMF(); @@ -755,6 +744,10 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( const unsigned ExecReg = Subtarget.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; +#ifndef NDEBUG + const int OrigRangeSize = std::distance(Range.begin(), Range.end()); +#endif + for (MachineInstr &MI : Range) { for (MachineOperand &Def : MI.defs()) { LLT ResTy = MRI.getType(Def.getReg()); @@ -820,13 +813,14 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( const DebugLoc &DL = B.getDL(); - // Figure out the iterator range after splicing the instructions. - auto NewBegin = std::prev(LoopBB->end()); + MachineInstr &FirstInst = *Range.begin(); // Move the instruction into the loop. Note we moved everything after // Range.end() already into a new block, so Range.end() is no longer valid. LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end()); + // Figure out the iterator range after splicing the instructions. + MachineBasicBlock::iterator NewBegin = FirstInst.getIterator(); auto NewEnd = LoopBB->end(); MachineBasicBlock::iterator I = Range.begin(); @@ -834,39 +828,145 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( Register CondReg; + assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); + for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { for (MachineOperand &Op : MI.uses()) { if (!Op.isReg() || Op.isDef()) continue; - if (SGPROperandRegs.count(Op.getReg())) { - LLT OpTy = MRI.getType(Op.getReg()); - unsigned OpSize = OpTy.getSizeInBits(); + Register OldReg = Op.getReg(); + if (!SGPROperandRegs.count(OldReg)) + continue; + + // See if we already processed this register in another instruction in the + // sequence. + auto OldVal = WaterfalledRegMap.find(OldReg); + if (OldVal != WaterfalledRegMap.end()) { + Op.setReg(OldVal->second); + continue; + } + + LLT OpTy = MRI.getType(Op.getReg()); + unsigned OpSize = OpTy.getSizeInBits(); + + // Can only do a readlane of 32-bit pieces. + if (OpSize == 32) { + // Avoid extra copies in the simple case of one 32-bit register. 
+ Register CurrentLaneOpReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + MRI.setType(CurrentLaneOpReg, OpTy); + + constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(Op.getReg()); + + Register NewCondReg = MRI.createVirtualRegister(WaveRC); + bool First = CondReg == AMDGPU::NoRegister; + if (First) + CondReg = NewCondReg; + + // Compare the just read M0 value to all possible Idx values. + B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) + .addDef(NewCondReg) + .addReg(CurrentLaneOpReg) + .addReg(Op.getReg()); + Op.setReg(CurrentLaneOpReg); + + if (!First) { + Register AndReg = MRI.createVirtualRegister(WaveRC); + + // If there are multiple operands to consider, and the conditions. + B.buildInstr(WaveAndOpc) + .addDef(AndReg) + .addReg(NewCondReg) + .addReg(CondReg); + CondReg = AndReg; + } + } else { + LLT S32 = LLT::scalar(32); + SmallVector<Register, 8> ReadlanePieces; + + // The compares can be done as 64-bit, but the extract needs to be done + // in 32-bit pieces. + + bool Is64 = OpSize % 64 == 0; + + LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); + unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 + : AMDGPU::V_CMP_EQ_U32_e64; + + // The compares can be done as 64-bit, but the extract needs to be done + // in 32-bit pieces. + + // Insert the unmerge before the loop. + + B.setMBB(MBB); + auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); + B.setInstr(*I); + + unsigned NumPieces = Unmerge->getNumOperands() - 1; + for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { + Register UnmergePiece = Unmerge.getReg(PieceIdx); + + Register CurrentLaneOpReg; + if (Is64) { + Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); + Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); + + MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); + MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); + MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpRegLo) + .addReg(UnmergePiece, 0, AMDGPU::sub0); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpRegHi) + .addReg(UnmergePiece, 0, AMDGPU::sub1); + + CurrentLaneOpReg = + B.buildMerge(LLT::scalar(64), + {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) + .getReg(0); - // Can only do a readlane of 32-bit pieces. - if (OpSize == 32) { - // Avoid extra copies in the simple case of one 32-bit register. - Register CurrentLaneOpReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - MRI.setType(CurrentLaneOpReg, OpTy); + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); - constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpReg) - .addReg(Op.getReg()); + if (OpTy.getScalarSizeInBits() == 64) { + // If we need to produce a 64-bit element vector, so use the + // merged pieces + ReadlanePieces.push_back(CurrentLaneOpReg); + } else { + // 32-bit element type. 
+ ReadlanePieces.push_back(CurrentLaneOpRegLo); + ReadlanePieces.push_back(CurrentLaneOpRegHi); + } + } else { + CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32); + MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(UnmergePiece); + ReadlanePieces.push_back(CurrentLaneOpReg); + } Register NewCondReg = MRI.createVirtualRegister(WaveRC); bool First = CondReg == AMDGPU::NoRegister; if (First) CondReg = NewCondReg; - // Compare the just read M0 value to all possible Idx values. - B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) + B.buildInstr(CmpOp) .addDef(NewCondReg) .addReg(CurrentLaneOpReg) - .addReg(Op.getReg()); - Op.setReg(CurrentLaneOpReg); + .addReg(UnmergePiece); if (!First) { Register AndReg = MRI.createVirtualRegister(WaveRC); @@ -878,114 +978,23 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( .addReg(CondReg); CondReg = AndReg; } - } else { - LLT S32 = LLT::scalar(32); - SmallVector<Register, 8> ReadlanePieces; - - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. - - bool Is64 = OpSize % 64 == 0; - - LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); - unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 - : AMDGPU::V_CMP_EQ_U32_e64; - - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. - - // Insert the unmerge before the loop. - - B.setMBB(MBB); - auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); - B.setInstr(*I); - - unsigned NumPieces = Unmerge->getNumOperands() - 1; - for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { - Register UnmergePiece = Unmerge.getReg(PieceIdx); - - Register CurrentLaneOpReg; - if (Is64) { - Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); - Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); - - MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); - MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); - MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); - - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegLo) - .addReg(UnmergePiece, 0, AMDGPU::sub0); - - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegHi) - .addReg(UnmergePiece, 0, AMDGPU::sub1); - - CurrentLaneOpReg = - B.buildMerge(LLT::scalar(64), - {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) - .getReg(0); - - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); - - if (OpTy.getScalarSizeInBits() == 64) { - // If we need to produce a 64-bit element vector, so use the - // merged pieces - ReadlanePieces.push_back(CurrentLaneOpReg); - } else { - // 32-bit element type. - ReadlanePieces.push_back(CurrentLaneOpRegLo); - ReadlanePieces.push_back(CurrentLaneOpRegHi); - } - } else { - CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32); - MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); - - // Read the next variant <- also loop target. 
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpReg) - .addReg(UnmergePiece); - ReadlanePieces.push_back(CurrentLaneOpReg); - } - - Register NewCondReg = MRI.createVirtualRegister(WaveRC); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; - - B.buildInstr(CmpOp) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(UnmergePiece); - - if (!First) { - Register AndReg = MRI.createVirtualRegister(WaveRC); - - // If there are multiple operands to consider, and the conditions. - B.buildInstr(WaveAndOpc) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; - } - } - - // FIXME: Build merge seems to switch to CONCAT_VECTORS but not - // BUILD_VECTOR - if (OpTy.isVector()) { - auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); - } else { - auto Merge = B.buildMerge(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); - } + } - MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank); + // FIXME: Build merge seems to switch to CONCAT_VECTORS but not + // BUILD_VECTOR + if (OpTy.isVector()) { + auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); + Op.setReg(Merge.getReg(0)); + } else { + auto Merge = B.buildMerge(OpTy, ReadlanePieces); + Op.setReg(Merge.getReg(0)); } + + MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank); } + + // Make sure we don't re-process this register again. + WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg())); } } @@ -1093,53 +1102,89 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( MI.getOperand(OpIdx).setReg(SGPR); } -// When regbankselect repairs registers, it will insert a repair instruction -// which defines the repaired register. Then it calls applyMapping and expects -// that the targets will either delete or rewrite the originally wrote to the -// repaired registers. Beccause of this, we end up in a situation where -// we have 2 instructions defining the same registers. -static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI, - Register Reg, - const MachineInstr &MI) { - // Is there some way we can assert that there are exactly 2 def instructions? - for (MachineInstr &Other : MRI.def_instructions(Reg)) { - if (&Other != &MI) - return &Other; - } - - return nullptr; +/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the +/// rest will be in the remainder. 
+static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) { + unsigned TotalSize = Ty.getSizeInBits(); + if (!Ty.isVector()) + return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)}; + + LLT EltTy = Ty.getElementType(); + unsigned EltSize = EltTy.getSizeInBits(); + assert(FirstSize % EltSize == 0); + + unsigned FirstPartNumElts = FirstSize / EltSize; + unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; + + return {LLT::scalarOrVector(FirstPartNumElts, EltTy), + LLT::scalarOrVector(RemainderElts, EltTy)}; +} + +static LLT widen96To128(LLT Ty) { + if (!Ty.isVector()) + return LLT::scalar(128); + + LLT EltTy = Ty.getElementType(); + assert(128 % EltTy.getSizeInBits() == 0); + return LLT::vector(128 / EltTy.getSizeInBits(), EltTy); } -bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, +bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, MachineRegisterInfo &MRI) const { Register DstReg = MI.getOperand(0).getReg(); - const LLT LoadTy = MRI.getType(DstReg); + const LLT LoadTy = MRI.getType(DstReg); unsigned LoadSize = LoadTy.getSizeInBits(); const unsigned MaxNonSmrdLoadSize = 128; + + const RegisterBank *PtrBank = + OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; + if (PtrBank == &AMDGPU::SGPRRegBank) { + // If the pointer is an SGPR, we ordinarily have nothing to do. + if (LoadSize != 96) + return false; + + MachineMemOperand *MMO = *MI.memoperands_begin(); + Register PtrReg = MI.getOperand(1).getReg(); + // 96-bit loads are only available for vector loads. We need to split this + // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). + + MachineIRBuilder B(MI); + ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank); + GISelObserverWrapper Observer(&O); + B.setChangeObserver(Observer); + + if (MMO->getAlign() < Align(16)) { + LLT Part64, Part32; + std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); + auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0); + auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8); + + auto Undef = B.buildUndef(LoadTy); + auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0); + B.buildInsert(MI.getOperand(0), Ins0, Load1, 64); + } else { + LLT WiderTy = widen96To128(LoadTy); + auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); + B.buildExtract(MI.getOperand(0), WideLoad, 0); + } + + MI.eraseFromParent(); + return true; + } + // 128-bit loads are supported for all instruction types. if (LoadSize <= MaxNonSmrdLoadSize) return false; - SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0)); - SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1)); + SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0)); + SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1)); - // If the pointer is an SGPR, we have nothing to do. - if (SrcRegs.empty()) { - const RegisterBank *PtrBank = - OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; - if (PtrBank == &AMDGPU::SGPRRegBank) - return false; + if (SrcRegs.empty()) SrcRegs.push_back(MI.getOperand(1).getReg()); - } assert(LoadSize % MaxNonSmrdLoadSize == 0); - // We want to get the repair instruction now, because it will help us - // determine which instruction the legalizer inserts that will also - // write to DstReg. - MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI); - // RegBankSelect only emits scalar types, so we need to reset the pointer // operand to a pointer type. 
Register BasePtrReg = SrcRegs[0]; @@ -1148,38 +1193,72 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, MachineIRBuilder B(MI); - unsigned SplitElts = - MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits(); - const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType()); + unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize; + const LLT LoadSplitTy = LoadTy.divide(NumSplitParts); ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank); GISelObserverWrapper Observer(&O); B.setChangeObserver(Observer); LegalizerHelper Helper(B.getMF(), Observer, B); - if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) + + if (LoadTy.isVector()) { + if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) + return false; + } else { + if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) + return false; + } + + MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); + return true; +} + +bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( + MachineInstr &MI, + const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineRegisterInfo &MRI) const { + const MachineFunction &MF = *MI.getMF(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const auto &TFI = *ST.getFrameLowering(); + + // Guard in case the stack growth direction ever changes with scratch + // instructions. + if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown) return false; - // At this point, the legalizer has split the original load into smaller - // loads. At the end of lowering, it inserts an instruction (LegalizedInst) - // that combines the outputs of the lower loads and writes it to DstReg. - // The register bank selector has also added the RepairInst which writes to - // DstReg as well. + Register Dst = MI.getOperand(0).getReg(); + Register AllocSize = MI.getOperand(1).getReg(); + Align Alignment = assumeAligned(MI.getOperand(2).getImm()); - MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst); + const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI); - // Replace the output of the LegalizedInst with a temporary register, since - // RepairInst already defines DstReg. - Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg)); - LegalizedInst->getOperand(0).setReg(TmpReg); - B.setInsertPt(*RepairInst->getParent(), RepairInst); + // TODO: Need to emit a wave reduction to get the maximum size. 
+ if (SizeBank != &AMDGPU::SGPRRegBank) + return false; - for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) { - Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); - B.buildConstant(IdxReg, DefIdx); - MRI.setRegBank(IdxReg, AMDGPU::VGPRRegBank); - B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg); + LLT PtrTy = MRI.getType(Dst); + LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); + + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + Register SPReg = Info->getStackPtrOffsetReg(); + ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank); + GISelObserverWrapper Observer(&ApplyBank); + + MachineIRBuilder B(MI); + B.setChangeObserver(Observer); + + auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2()); + auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize); + + auto SPCopy = B.buildCopy(PtrTy, SPReg); + if (Alignment > TFI.getStackAlign()) { + auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize); + B.buildMaskLowPtrBits(Dst, PtrAdd, + Log2(Alignment) + ST.getWavefrontSizeLog2()); + } else { + B.buildPtrAdd(Dst, SPCopy, ScaledSize); } - MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); + MI.eraseFromParent(); return true; } @@ -1210,6 +1289,281 @@ bool AMDGPURegisterBankInfo::applyMappingImage( return true; } +static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI, + Register Reg) { + MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); + if (!Def) + return Reg; + + // TODO: Guard against this being an implicit def + return Def->getOperand(0).getReg(); +} + +// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store +// the three offsets (voffset, soffset and instoffset) +static unsigned setBufferOffsets(MachineIRBuilder &B, + const AMDGPURegisterBankInfo &RBI, + Register CombinedOffset, Register &VOffsetReg, + Register &SOffsetReg, int64_t &InstOffsetVal, + Align Alignment) { + const LLT S32 = LLT::scalar(32); + MachineRegisterInfo *MRI = B.getMRI(); + + if (Optional<int64_t> Imm = getConstantVRegVal(CombinedOffset, *MRI)) { + uint32_t SOffset, ImmOffset; + if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget, + Alignment)) { + VOffsetReg = B.buildConstant(S32, 0).getReg(0); + SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); + InstOffsetVal = ImmOffset; + + B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); + B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); + return SOffset + ImmOffset; + } + } + + Register Base; + unsigned Offset; + MachineInstr *Unused; + + std::tie(Base, Offset, Unused) + = AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset); + + uint32_t SOffset, ImmOffset; + if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, + &RBI.Subtarget, Alignment)) { + if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) { + VOffsetReg = Base; + SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); + B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); + InstOffsetVal = ImmOffset; + return 0; // XXX - Why is this 0? + } + + // If we have SGPR base, we can use it for soffset. + if (SOffset == 0) { + VOffsetReg = B.buildConstant(S32, 0).getReg(0); + B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); + SOffsetReg = Base; + InstOffsetVal = ImmOffset; + return 0; // XXX - Why is this 0? + } + } + + // Handle the variable sgpr + vgpr case. 
+ if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) { + Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg()); + Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg()); + + const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI); + const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI); + + if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) { + VOffsetReg = Src0; + SOffsetReg = Src1; + return 0; + } + + if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) { + VOffsetReg = Src1; + SOffsetReg = Src0; + return 0; + } + } + + // Ensure we have a VGPR for the combined offset. This could be an issue if we + // have an SGPR offset and a VGPR resource. + if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) { + VOffsetReg = CombinedOffset; + } else { + VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0); + B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); + } + + SOffsetReg = B.buildConstant(S32, 0).getReg(0); + B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); + return 0; +} + +bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( + const OperandsMapper &OpdMapper) const { + MachineInstr &MI = OpdMapper.getMI(); + MachineRegisterInfo &MRI = OpdMapper.getMRI(); + + const LLT S32 = LLT::scalar(32); + Register Dst = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Dst); + + const RegisterBank *RSrcBank = + OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; + const RegisterBank *OffsetBank = + OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; + if (RSrcBank == &AMDGPU::SGPRRegBank && + OffsetBank == &AMDGPU::SGPRRegBank) + return true; // Legal mapping + + // FIXME: 96-bit case was widened during legalize. We neeed to narrow it back + // here but don't have an MMO. + + unsigned LoadSize = Ty.getSizeInBits(); + int NumLoads = 1; + if (LoadSize == 256 || LoadSize == 512) { + NumLoads = LoadSize / 128; + Ty = Ty.divide(NumLoads); + } + + // Use the alignment to ensure that the required offsets will fit into the + // immediate offsets. + const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1); + + MachineIRBuilder B(MI); + MachineFunction &MF = B.getMF(); + + Register SOffset; + Register VOffset; + int64_t ImmOffset = 0; + + unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(), + VOffset, SOffset, ImmOffset, Alignment); + + // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we + // can, but we neeed to track an MMO for that. + const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8; + const Align MemAlign(4); // FIXME: ABI type alignment? + MachineMemOperand *BaseMMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + MemSize, MemAlign); + if (MMOOffset != 0) + BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize); + + // If only the offset is divergent, emit a MUBUF buffer load instead. We can + // assume that the buffer is unswizzled. 
+ + Register RSrc = MI.getOperand(1).getReg(); + Register VIndex = B.buildConstant(S32, 0).getReg(0); + B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank); + + SmallVector<Register, 4> LoadParts(NumLoads); + + MachineBasicBlock::iterator MII = MI.getIterator(); + MachineInstrSpan Span(MII, &B.getMBB()); + + for (int i = 0; i < NumLoads; ++i) { + if (NumLoads == 1) { + LoadParts[i] = Dst; + } else { + LoadParts[i] = MRI.createGenericVirtualRegister(Ty); + MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank); + } + + MachineMemOperand *MMO = BaseMMO; + if (i != 0) + BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize); + + B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD) + .addDef(LoadParts[i]) // vdata + .addUse(RSrc) // rsrc + .addUse(VIndex) // vindex + .addUse(VOffset) // voffset + .addUse(SOffset) // soffset + .addImm(ImmOffset + 16 * i) // offset(imm) + .addImm(0) // cachepolicy, swizzled buffer(imm) + .addImm(0) // idxen(imm) + .addMemOperand(MMO); + } + + // TODO: If only the resource is a VGPR, it may be better to execute the + // scalar load in the waterfall loop if the resource is expected to frequently + // be dynamically uniform. + if (RSrcBank != &AMDGPU::SGPRRegBank) { + // Remove the original instruction to avoid potentially confusing the + // waterfall loop logic. + B.setInstr(*Span.begin()); + MI.eraseFromParent(); + + SmallSet<Register, 4> OpsToWaterfall; + + OpsToWaterfall.insert(RSrc); + executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), + OpsToWaterfall, MRI); + } + + if (NumLoads != 1) { + if (Ty.isVector()) + B.buildConcatVectors(Dst, LoadParts); + else + B.buildMerge(Dst, LoadParts); + } + + // We removed the instruction earlier with a waterfall loop. + if (RSrcBank == &AMDGPU::SGPRRegBank) + MI.eraseFromParent(); + + return true; +} + +bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic( + const OperandsMapper &OpdMapper, bool Signed) const { + MachineInstr &MI = OpdMapper.getMI(); + MachineRegisterInfo &MRI = OpdMapper.getMRI(); + + // Insert basic copies + applyDefaultMapping(OpdMapper); + + Register DstReg = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(DstReg); + + const LLT S32 = LLT::scalar(32); + + const RegisterBank *DstBank = + OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; + if (DstBank == &AMDGPU::VGPRRegBank) { + if (Ty == S32) + return true; + + // TODO: 64-bit version is scalar only, so we need to expand this. + return false; + } + + Register SrcReg = MI.getOperand(2).getReg(); + Register OffsetReg = MI.getOperand(3).getReg(); + Register WidthReg = MI.getOperand(4).getReg(); + + // The scalar form packs the offset and width in a single operand. + + ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank); + GISelObserverWrapper Observer(&ApplyBank); + MachineIRBuilder B(MI); + B.setChangeObserver(Observer); + + // Ensure the high bits are clear to insert the offset. + auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6)); + auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); + + // Zeros out the low bits, so don't bother clamping the input value. + auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16)); + + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. 
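+  // For example, offset = 3 and width = 8 would pack as
+  // (3 & 0x3f) | (8 << 16) == 0x00080003.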
+ auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); + + // TODO: It might be worth using a pseudo here to avoid scc clobber and + // register class constraints. + unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : + (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); + + auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); + if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) + llvm_unreachable("failed to constrain BFE"); + + MI.eraseFromParent(); + return true; +} + // FIXME: Duplicated from LegalizerHelper static CmpInst::Predicate minMaxToCompare(unsigned Opc) { switch (Opc) { @@ -1226,6 +1580,51 @@ static CmpInst::Predicate minMaxToCompare(unsigned Opc) { } } +static unsigned minMaxToExtend(unsigned Opc) { + switch (Opc) { + case TargetOpcode::G_SMIN: + case TargetOpcode::G_SMAX: + return TargetOpcode::G_SEXT; + case TargetOpcode::G_UMIN: + case TargetOpcode::G_UMAX: + return TargetOpcode::G_ZEXT; + default: + llvm_unreachable("not in integer min/max"); + } +} + +// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding +// any illegal vector extend or unmerge operations. +static std::pair<Register, Register> +unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { + const LLT S32 = LLT::scalar(32); + auto Bitcast = B.buildBitcast(S32, Src); + + if (ExtOpcode == TargetOpcode::G_SEXT) { + auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); + auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); + return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); + } + + auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); + if (ExtOpcode == TargetOpcode::G_ZEXT) { + auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); + return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); + } + + assert(ExtOpcode == TargetOpcode::G_ANYEXT); + return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0)); +} + +static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B, + CmpInst::Predicate Pred, + Register Dst, Register Src0, + Register Src1) { + const LLT CmpType = LLT::scalar(32); + auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1); + return B.buildSelect(Dst, Cmp, Src0, Src1); +} + // FIXME: Duplicated from LegalizerHelper, except changing the boolean type. void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B, MachineInstr &MI) const { @@ -1234,24 +1633,25 @@ void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B, Register Src1 = MI.getOperand(2).getReg(); const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode()); - LLT CmpType = LLT::scalar(32); - - auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1); - B.buildSelect(Dst, Cmp, Src0, Src1); + MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1); - B.getMRI()->setRegBank(Cmp.getReg(0), AMDGPU::SGPRRegBank); + Register CmpReg = Sel->getOperand(1).getReg(); + B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank); MI.eraseFromParent(); } // For cases where only a single copy is inserted for matching register banks. 
// Replace the register in the instruction operand -static void substituteSimpleCopyRegs( +static bool substituteSimpleCopyRegs( const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); if (!SrcReg.empty()) { assert(SrcReg.size() == 1); OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); + return true; } + + return false; } /// Handle register layout difference for f16 images for some subtargets. @@ -1465,6 +1865,223 @@ bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI); } +/// Utility function for pushing dynamic vector indexes with a constant offset +/// into waterwall loops. +static void reinsertVectorIndexAdd(MachineIRBuilder &B, + MachineInstr &IdxUseInstr, + unsigned OpIdx, + unsigned ConstOffset) { + MachineRegisterInfo &MRI = *B.getMRI(); + const LLT S32 = LLT::scalar(32); + Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); + B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); + + auto MaterializedOffset = B.buildConstant(S32, ConstOffset); + + auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); + MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); + IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); +} + +/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the +/// original 32-bit source value (to be inserted in the low part of the combined +/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit +/// value. +static void extendLow32IntoHigh32(MachineIRBuilder &B, + Register Hi32Reg, Register Lo32Reg, + unsigned ExtOpc, + const RegisterBank &RegBank, + bool IsBooleanSrc = false) { + if (ExtOpc == AMDGPU::G_ZEXT) { + B.buildConstant(Hi32Reg, 0); + } else if (ExtOpc == AMDGPU::G_SEXT) { + if (IsBooleanSrc) { + // If we know the original source was an s1, the high half is the same as + // the low. + B.buildCopy(Hi32Reg, Lo32Reg); + } else { + // Replicate sign bit from 32-bit extended part. + auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); + B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); + B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); + } + } else { + assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); + B.buildUndef(Hi32Reg); + } +} + +bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( + MachineInstr &MI, MachineRegisterInfo &MRI, + const OperandsMapper &OpdMapper) const { + + Register VecReg = MI.getOperand(1).getReg(); + Register Idx = MI.getOperand(2).getReg(); + + const RegisterBank &IdxBank = + *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; + + bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank; + + LLT VecTy = MRI.getType(VecReg); + unsigned EltSize = VecTy.getScalarSizeInBits(); + unsigned NumElem = VecTy.getNumElements(); + + if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, + IsDivergentIdx)) + return false; + + MachineIRBuilder B(MI); + LLT S32 = LLT::scalar(32); + + const RegisterBank &DstBank = + *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; + const RegisterBank &SrcBank = + *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; + + const RegisterBank &CCBank = + (DstBank == AMDGPU::SGPRRegBank && + SrcBank == AMDGPU::SGPRRegBank && + IdxBank == AMDGPU::SGPRRegBank) ? 
AMDGPU::SGPRRegBank + : AMDGPU::VCCRegBank; + LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1); + + if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { + Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); + MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); + } + + LLT EltTy = VecTy.getScalarType(); + SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); + unsigned NumLanes = DstRegs.size(); + if (!NumLanes) + NumLanes = 1; + else + EltTy = MRI.getType(DstRegs[0]); + + auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); + SmallVector<Register, 2> Res(NumLanes); + for (unsigned L = 0; L < NumLanes; ++L) + Res[L] = UnmergeToEltTy.getReg(L); + + for (unsigned I = 1; I < NumElem; ++I) { + auto IC = B.buildConstant(S32, I); + MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); + auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); + MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); + + for (unsigned L = 0; L < NumLanes; ++L) { + auto S = B.buildSelect(EltTy, Cmp, + UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]); + + for (unsigned N : { 0, 2, 3 }) + MRI.setRegBank(S->getOperand(N).getReg(), DstBank); + + Res[L] = S->getOperand(0).getReg(); + } + } + + for (unsigned L = 0; L < NumLanes; ++L) { + Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L]; + B.buildCopy(DstReg, Res[L]); + MRI.setRegBank(DstReg, DstBank); + } + + MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); + MI.eraseFromParent(); + + return true; +} + +bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( + MachineInstr &MI, MachineRegisterInfo &MRI, + const OperandsMapper &OpdMapper) const { + + Register VecReg = MI.getOperand(1).getReg(); + Register Idx = MI.getOperand(3).getReg(); + + const RegisterBank &IdxBank = + *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; + + bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank; + + LLT VecTy = MRI.getType(VecReg); + unsigned EltSize = VecTy.getScalarSizeInBits(); + unsigned NumElem = VecTy.getNumElements(); + + if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, + IsDivergentIdx)) + return false; + + MachineIRBuilder B(MI); + LLT S32 = LLT::scalar(32); + + const RegisterBank &DstBank = + *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; + const RegisterBank &SrcBank = + *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; + const RegisterBank &InsBank = + *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; + + const RegisterBank &CCBank = + (DstBank == AMDGPU::SGPRRegBank && + SrcBank == AMDGPU::SGPRRegBank && + InsBank == AMDGPU::SGPRRegBank && + IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank + : AMDGPU::VCCRegBank; + LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? 
S32 : LLT::scalar(1); + + if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { + Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); + MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); + } + + LLT EltTy = VecTy.getScalarType(); + SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); + unsigned NumLanes = InsRegs.size(); + if (!NumLanes) { + NumLanes = 1; + InsRegs.push_back(MI.getOperand(2).getReg()); + } else { + EltTy = MRI.getType(InsRegs[0]); + } + + auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); + SmallVector<Register, 16> Ops(NumElem * NumLanes); + + for (unsigned I = 0; I < NumElem; ++I) { + auto IC = B.buildConstant(S32, I); + MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); + auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); + MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); + + for (unsigned L = 0; L < NumLanes; ++L) { + auto S = B.buildSelect(EltTy, Cmp, InsRegs[L], + UnmergeToEltTy.getReg(I * NumLanes + L)); + + for (unsigned N : { 0, 2, 3 }) + MRI.setRegBank(S->getOperand(N).getReg(), DstBank); + + Ops[I * NumLanes + L] = S->getOperand(0).getReg(); + } + } + + LLT MergeTy = LLT::vector(Ops.size(), EltTy); + if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) { + B.buildBuildVector(MI.getOperand(0), Ops); + } else { + auto Vec = B.buildBuildVector(MergeTy, Ops); + MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank); + B.buildBitcast(MI.getOperand(0).getReg(), Vec); + } + + MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); + MI.eraseFromParent(); + + return true; +} + void AMDGPURegisterBankInfo::applyMappingImpl( const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); @@ -1555,7 +2172,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MachineBasicBlock *MBB = MI.getParent(); B.setInsertPt(*MBB, std::next(MI.getIterator())); - B.buildTrunc(DstReg, NewDstReg); + + // If we had a constrained VCC result register, a copy was inserted to VCC + // from SGPR. + SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); + if (DefRegs.empty()) + DefRegs.push_back(DstReg); + B.buildTrunc(DefRegs[0], NewDstReg); return; } case AMDGPU::G_SELECT: { @@ -1712,10 +2335,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl( } case AMDGPU::G_ADD: case AMDGPU::G_SUB: - case AMDGPU::G_MUL: { + case AMDGPU::G_MUL: + case AMDGPU::G_SHL: + case AMDGPU::G_LSHR: + case AMDGPU::G_ASHR: { Register DstReg = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); - if (DstTy != LLT::scalar(16)) + + // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. + // Packed 16-bit operations need to be scalarized and promoted. + if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16)) break; const RegisterBank *DstBank = @@ -1723,16 +2352,42 @@ void AMDGPURegisterBankInfo::applyMappingImpl( if (DstBank == &AMDGPU::VGPRRegBank) break; - // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 
- MachineFunction *MF = MI.getParent()->getParent(); + const LLT S32 = LLT::scalar(32); + MachineBasicBlock *MBB = MI.getParent(); + MachineFunction *MF = MBB->getParent(); MachineIRBuilder B(MI); ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); GISelObserverWrapper Observer(&ApplySALU); - LegalizerHelper Helper(*MF, Observer, B); - if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != - LegalizerHelper::Legalized) - llvm_unreachable("widen scalar should have succeeded"); + if (DstTy.isVector()) { + B.setChangeObserver(Observer); + + Register WideSrc0Lo, WideSrc0Hi; + Register WideSrc1Lo, WideSrc1Hi; + + std::tie(WideSrc0Lo, WideSrc0Hi) + = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), AMDGPU::G_ANYEXT); + std::tie(WideSrc1Lo, WideSrc1Hi) + = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), AMDGPU::G_ANYEXT); + auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); + auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); + B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); + MI.eraseFromParent(); + } else { + LegalizerHelper Helper(*MF, Observer, B); + + if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) + llvm_unreachable("widen scalar should have succeeded"); + + // FIXME: s16 shift amounts should be legal. + if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || + Opc == AMDGPU::G_ASHR) { + B.setInsertPt(*MBB, MI.getIterator()); + if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized) + llvm_unreachable("widen scalar should have succeeded"); + } + } + return; } case AMDGPU::G_SMIN: @@ -1750,10 +2405,44 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // Turn scalar min/max into a compare and select. LLT Ty = MRI.getType(DstReg); - LLT S32 = LLT::scalar(32); - LLT S16 = LLT::scalar(16); + const LLT S32 = LLT::scalar(32); + const LLT S16 = LLT::scalar(16); + const LLT V2S16 = LLT::vector(2, 16); - if (Ty == S16) { + if (Ty == V2S16) { + ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); + GISelObserverWrapper Observer(&ApplySALU); + B.setChangeObserver(Observer); + + // Need to widen to s32, and expand as cmp + select, and avoid producing + // illegal vector extends or unmerges that would need further + // legalization. + // + // TODO: Should we just readfirstlane? That should probably be handled + // with a UniformVGPR register bank that wouldn't need special + // consideration here. 
+ + Register Dst = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); + + Register WideSrc0Lo, WideSrc0Hi; + Register WideSrc1Lo, WideSrc1Hi; + + unsigned ExtendOp = minMaxToExtend(MI.getOpcode()); + + std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp); + std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp); + + Register Lo = MRI.createGenericVirtualRegister(S32); + Register Hi = MRI.createGenericVirtualRegister(S32); + const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode()); + buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo); + buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi); + + B.buildBuildVectorTrunc(Dst, {Lo, Hi}); + MI.eraseFromParent(); + } else if (Ty == S16) { ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); GISelObserverWrapper Observer(&ApplySALU); LegalizerHelper Helper(*MF, Observer, B); @@ -1769,11 +2458,77 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } + case AMDGPU::G_SEXT_INREG: { + SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); + if (SrcRegs.empty()) + break; // Nothing to repair + + const LLT S32 = LLT::scalar(32); + MachineIRBuilder B(MI); + ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank); + GISelObserverWrapper Observer(&O); + B.setChangeObserver(Observer); + + // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs + // we would need to further expand, and doesn't let us directly set the + // result registers. + SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); + + int Amt = MI.getOperand(2).getImm(); + if (Amt <= 32) { + if (Amt == 32) { + // The low bits are unchanged. + B.buildCopy(DstRegs[0], SrcRegs[0]); + } else { + // Extend in the low bits and propagate the sign bit to the high half. + B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt); + } + + B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31)); + } else { + // The low bits are unchanged, and extend in the high bits. + B.buildCopy(DstRegs[0], SrcRegs[0]); + B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32); + } + + Register DstReg = MI.getOperand(0).getReg(); + MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); + MI.eraseFromParent(); + return; + } + case AMDGPU::G_CTPOP: + case AMDGPU::G_CTLZ_ZERO_UNDEF: + case AMDGPU::G_CTTZ_ZERO_UNDEF: { + MachineIRBuilder B(MI); + MachineFunction &MF = B.getMF(); + + const RegisterBank *DstBank = + OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; + if (DstBank == &AMDGPU::SGPRRegBank) + break; + + Register SrcReg = MI.getOperand(1).getReg(); + const LLT S32 = LLT::scalar(32); + LLT Ty = MRI.getType(SrcReg); + if (Ty == S32) + break; + + ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); + GISelObserverWrapper Observer(&ApplyVALU); + LegalizerHelper Helper(MF, Observer, B); + + if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized) + llvm_unreachable("narrowScalar should have succeeded"); + return; + } case AMDGPU::G_SEXT: - case AMDGPU::G_ZEXT: { + case AMDGPU::G_ZEXT: + case AMDGPU::G_ANYEXT: { Register SrcReg = MI.getOperand(1).getReg(); LLT SrcTy = MRI.getType(SrcReg); - bool Signed = Opc == AMDGPU::G_SEXT; + const bool Signed = Opc == AMDGPU::G_SEXT; + + assert(empty(OpdMapper.getVRegs(1))); MachineIRBuilder B(MI); const RegisterBank *SrcBank = @@ -1788,23 +2543,19 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // breakdowns supported. 
DstTy.getSizeInBits() == 64 && SrcTy.getSizeInBits() <= 32) { - const LLT S32 = LLT::scalar(32); SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); // Extend to 32-bit, and then extend the low half. if (Signed) { // TODO: Should really be buildSExtOrCopy B.buildSExtOrTrunc(DefRegs[0], SrcReg); - - // Replicate sign bit from 32-bit extended part. - auto ShiftAmt = B.buildConstant(S32, 31); - MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); - B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt); - } else { + } else if (Opc == AMDGPU::G_ZEXT) { B.buildZExtOrTrunc(DefRegs[0], SrcReg); - B.buildConstant(DefRegs[1], 0); + } else { + B.buildAnyExtOrTrunc(DefRegs[0], SrcReg); } + extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank); MRI.setRegBank(DstReg, *SrcBank); MI.eraseFromParent(); return; @@ -1813,6 +2564,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( if (SrcTy != LLT::scalar(1)) return; + // It is not legal to have a legalization artifact with a VCC source. Rather + // than introducing a copy, insert the select we would have to select the + // copy to. if (SrcBank == &AMDGPU::VCCRegBank) { SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); @@ -1834,7 +2588,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( if (DstSize > 32) { B.buildSelect(DefRegs[0], SrcReg, True, False); - B.buildCopy(DefRegs[1], DefRegs[0]); + extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true); } else if (DstSize < 32) { auto Sel = B.buildSelect(SelType, SrcReg, True, False); MRI.setRegBank(Sel.getReg(0), *DstBank); @@ -1847,24 +2601,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } - // Fixup the case with an s1 src that isn't a condition register. Use shifts - // instead of introducing a compare to avoid an unnecessary condition - // register (and since there's no scalar 16-bit compares). 
- auto Ext = B.buildAnyExt(DstTy, SrcReg); - auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1); - auto Shl = B.buildShl(DstTy, Ext, ShiftAmt); - - if (MI.getOpcode() == AMDGPU::G_SEXT) - B.buildAShr(DstReg, Shl, ShiftAmt); - else - B.buildLShr(DstReg, Shl, ShiftAmt); - - MRI.setRegBank(DstReg, *SrcBank); - MRI.setRegBank(Ext.getReg(0), *SrcBank); - MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); - MRI.setRegBank(Shl.getReg(0), *SrcBank); - MI.eraseFromParent(); - return; + break; } case AMDGPU::G_BUILD_VECTOR: case AMDGPU::G_BUILD_VECTOR_TRUNC: { @@ -1934,7 +2671,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl( assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); - LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + + const LLT S32 = LLT::scalar(32); + LLT DstTy = MRI.getType(DstReg); + LLT SrcTy = MRI.getType(SrcReg); + + if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper)) + return; + MachineIRBuilder B(MI); const ValueMapping &DstMapping @@ -1942,10 +2688,26 @@ void AMDGPURegisterBankInfo::applyMappingImpl( const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; const RegisterBank *SrcBank = OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; - - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - Register IdxReg = MI.getOperand(2).getReg(); + const RegisterBank *IdxBank = + OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; + + Register BaseIdxReg; + unsigned ConstOffset; + MachineInstr *OffsetDef; + std::tie(BaseIdxReg, ConstOffset, OffsetDef) = + AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); + + // See if the index is an add of a constant which will be foldable by moving + // the base register of the index later if this is going to be executed in a + // waterfall loop. This is essentially to reassociate the add of a constant + // with the readfirstlane. + bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && + ConstOffset > 0 && + ConstOffset < SrcTy.getNumElements(); + + // Move the base register. We'll re-insert the add later. + if (ShouldMoveIndexIntoLoop) + MI.getOperand(2).setReg(BaseIdxReg); // If this is a VGPR result only because the index was a VGPR result, the // actual indexing will be done on the SGPR source vector, which will @@ -1969,26 +2731,30 @@ void AMDGPURegisterBankInfo::applyMappingImpl( buildVCopy(B, DstReg, TmpReg); } + // Re-insert the constant offset add inside the waterfall loop. + if (ShouldMoveIndexIntoLoop) + reinsertVectorIndexAdd(B, MI, 2, ConstOffset); + return; } assert(DstTy.getSizeInBits() == 64); - LLT SrcTy = MRI.getType(SrcReg); - const LLT S32 = LLT::scalar(32); LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); auto CastSrc = B.buildBitcast(Vec32, SrcReg); auto One = B.buildConstant(S32, 1); + MachineBasicBlock::iterator MII = MI.getIterator(); + // Split the vector index into 32-bit pieces. Prepare to move all of the // new instructions into a waterfall loop if necessary. // // Don't put the bitcast or constant in the loop. - MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); + MachineInstrSpan Span(MII, &B.getMBB()); // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
- auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxLo = B.buildShl(S32, BaseIdxReg, One); auto IdxHi = B.buildAdd(S32, IdxLo, One); auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); @@ -2029,33 +2795,70 @@ void AMDGPURegisterBankInfo::applyMappingImpl( buildVCopy(B, DstRegs[1], TmpReg1); } + if (ShouldMoveIndexIntoLoop) + reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); + return; } case AMDGPU::G_INSERT_VECTOR_ELT: { SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); + Register DstReg = MI.getOperand(0).getReg(); + LLT VecTy = MRI.getType(DstReg); + assert(OpdMapper.getVRegs(0).empty()); - assert(OpdMapper.getVRegs(1).empty()); assert(OpdMapper.getVRegs(3).empty()); - if (InsRegs.empty()) { - applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, { 3 }); + if (substituteSimpleCopyRegs(OpdMapper, 1)) + MRI.setType(MI.getOperand(1).getReg(), VecTy); + + if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper)) return; - } - Register DstReg = MI.getOperand(0).getReg(); + const RegisterBank *IdxBank = + OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; + Register SrcReg = MI.getOperand(1).getReg(); Register InsReg = MI.getOperand(2).getReg(); - Register IdxReg = MI.getOperand(3).getReg(); - LLT SrcTy = MRI.getType(SrcReg); LLT InsTy = MRI.getType(InsReg); (void)InsTy; + Register BaseIdxReg; + unsigned ConstOffset; + MachineInstr *OffsetDef; + std::tie(BaseIdxReg, ConstOffset, OffsetDef) = + AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); + + // See if the index is an add of a constant which will be foldable by moving + // the base register of the index later if this is going to be executed in a + // waterfall loop. This is essentially to reassociate the add of a constant + // with the readfirstlane. + bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && + ConstOffset > 0 && + ConstOffset < VecTy.getNumElements(); + + // Move the base register. We'll re-insert the add later. + if (ShouldMoveIndexIntoLoop) + MI.getOperand(3).setReg(BaseIdxReg); + + + if (InsRegs.empty()) { + executeInWaterfallLoop(MI, MRI, { 3 }); + + // Re-insert the constant offset add inside the waterfall loop. + if (ShouldMoveIndexIntoLoop) { + MachineIRBuilder B(MI); + reinsertVectorIndexAdd(B, MI, 3, ConstOffset); + } + + return; + } + + assert(InsTy.getSizeInBits() == 64); const LLT S32 = LLT::scalar(32); - LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32); MachineIRBuilder B(MI); auto CastSrc = B.buildBitcast(Vec32, SrcReg); @@ -2068,12 +2871,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
- auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxLo = B.buildShl(S32, BaseIdxReg, One); auto IdxHi = B.buildAdd(S32, IdxLo, One); auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); - B.buildBitcast(DstReg, InsHi); const RegisterBank *DstBank = OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; @@ -2093,6 +2895,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( SmallSet<Register, 4> OpsToWaterfall; if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { + B.setInsertPt(B.getMBB(), MI); + B.buildBitcast(DstReg, InsHi); MI.eraseFromParent(); return; } @@ -2100,17 +2904,70 @@ void AMDGPURegisterBankInfo::applyMappingImpl( B.setInstr(*Span.begin()); MI.eraseFromParent(); + // Figure out the point after the waterfall loop before mangling the control + // flow. executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), OpsToWaterfall, MRI); + + // The insertion point is now right after the original instruction. + // + // Keep the bitcast to the original vector type out of the loop. Doing this + // saved an extra phi we don't need inside the loop. + B.buildBitcast(DstReg, InsHi); + + // Re-insert the constant offset add inside the waterfall loop. + if (ShouldMoveIndexIntoLoop) + reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); + + return; + } + case AMDGPU::G_AMDGPU_BUFFER_LOAD: + case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: + case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: + case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: + case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: + case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: + case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: + case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: + case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: + case AMDGPU::G_AMDGPU_BUFFER_STORE: + case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: + case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: + case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: + case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: + case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: + case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, {1, 4}); + return; + } + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, {2, 5}); + return; + } + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, {3, 6}); + return; + } + case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { + applyMappingSBufferLoad(OpdMapper); return; } case AMDGPU::G_INTRINSIC: { switch (MI.getIntrinsicID()) { - case Intrinsic::amdgcn_s_buffer_load: { - // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS - executeInWaterfallLoop(MI, MRI, { 2, 3 }); - return; - } case Intrinsic::amdgcn_readlane: { substituteSimpleCopyRegs(OpdMapper, 2); @@ -2132,18 +2989,51 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(MI, MRI, 3); // Index return; } - default: - break; + case Intrinsic::amdgcn_ballot: + case Intrinsic::amdgcn_interp_p1: + case 
Intrinsic::amdgcn_interp_p2: + case Intrinsic::amdgcn_interp_mov: + case Intrinsic::amdgcn_interp_p1_f16: + case Intrinsic::amdgcn_interp_p2_f16: { + applyDefaultMapping(OpdMapper); + + // Readlane for m0 value, which is always the last operand. + // FIXME: Should this be a waterfall loop instead? + constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index + return; + } + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: { + // Doing a waterfall loop over these wouldn't make any sense. + substituteSimpleCopyRegs(OpdMapper, 2); + substituteSimpleCopyRegs(OpdMapper, 3); + constrainOpWithReadfirstlane(MI, MRI, 4); + constrainOpWithReadfirstlane(MI, MRI, 5); + return; + } + case Intrinsic::amdgcn_sbfe: + applyMappingBFEIntrinsic(OpdMapper, true); + return; + case Intrinsic::amdgcn_ubfe: + applyMappingBFEIntrinsic(OpdMapper, false); + return; } break; } + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { + const AMDGPU::RsrcIntrinsic *RSrcIntrin + = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID()); + assert(RSrcIntrin && RSrcIntrin->IsImage); + // Non-images can have complications from operands that allow both SGPR + // and VGPR. For now it's too complicated to figure out the final opcode + // to derive the register bank from the MCInstrDesc. + applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); + return; + } case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { auto IntrID = MI.getIntrinsicID(); switch (IntrID) { - case Intrinsic::amdgcn_buffer_load: { - executeInWaterfallLoop(MI, MRI, { 2 }); - return; - } case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: { // This is only allowed to execute with 1 lane, so readfirstlane is safe. @@ -2167,28 +3057,19 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(MI, MRI, 1); // M0 return; } + case Intrinsic::amdgcn_ds_append: + case Intrinsic::amdgcn_ds_consume: { + constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + return; + } case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // FIXME: Should this use a waterfall loop? 
constrainOpWithReadfirstlane(MI, MRI, 2); // M0 return; } - case Intrinsic::amdgcn_raw_buffer_load: - case Intrinsic::amdgcn_raw_buffer_load_format: - case Intrinsic::amdgcn_raw_tbuffer_load: - case Intrinsic::amdgcn_raw_buffer_store: - case Intrinsic::amdgcn_raw_buffer_store_format: - case Intrinsic::amdgcn_raw_tbuffer_store: { - applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, {2, 4}); - return; - } - case Intrinsic::amdgcn_struct_buffer_load: - case Intrinsic::amdgcn_struct_buffer_store: - case Intrinsic::amdgcn_struct_tbuffer_load: - case Intrinsic::amdgcn_struct_tbuffer_store: { - applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, {2, 5}); + case Intrinsic::amdgcn_s_setreg: { + constrainOpWithReadfirstlane(MI, MRI, 2); return; } default: { @@ -2211,10 +3092,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case AMDGPU::G_LOAD: case AMDGPU::G_ZEXTLOAD: case AMDGPU::G_SEXTLOAD: { - if (applyMappingWideLoad(MI, OpdMapper, MRI)) + if (applyMappingLoad(MI, OpdMapper, MRI)) return; break; } + case AMDGPU::G_DYN_STACKALLOC: + applyMappingDynStackAlloc(MI, OpdMapper, MRI); + return; default: break; } @@ -2244,7 +3128,11 @@ AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); + const MachineOperand &SrcOp = MI.getOperand(i); + if (!SrcOp.isReg()) + continue; + + unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); } return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), @@ -2256,31 +3144,19 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); - unsigned OpdIdx = 0; - - unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); - OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0); - - if (MI.getOperand(OpdIdx).isIntrinsicID()) - OpdsMapping[OpdIdx++] = nullptr; - Register Reg1 = MI.getOperand(OpdIdx).getReg(); - unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI); - - unsigned DefaultBankID = Size1 == 1 ? - AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; - unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID); - - OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1); - - for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) { - const MachineOperand &MO = MI.getOperand(OpdIdx); - if (!MO.isReg()) + // Even though we technically could use SGPRs, this would require knowledge of + // the constant bus restriction. Force all sources to VGPR (except for VCC). + // + // TODO: Unary ops are trivially OK, so accept SGPRs? + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + const MachineOperand &Src = MI.getOperand(i); + if (!Src.isReg()) continue; - unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI); + unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI); unsigned BankID = Size == 1 ? 
AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; - OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size); + OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); } return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), @@ -2324,6 +3200,10 @@ AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, continue; Register OpReg = MI.getOperand(I).getReg(); + // We replace some dead address operands with $noreg + if (!OpReg) + continue; + unsigned Size = getSizeInBits(OpReg, MRI, *TRI); // FIXME: Probably need a new intrinsic register bank searchable table to @@ -2345,6 +3225,22 @@ AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps); } +/// Return the mapping for a pointer arugment. +const RegisterBankInfo::ValueMapping * +AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, + Register PtrReg) const { + LLT PtrTy = MRI.getType(PtrReg); + unsigned Size = PtrTy.getSizeInBits(); + if (Subtarget.useFlatForGlobal() || + !SITargetLowering::isFlatGlobalAddrSpace(PtrTy.getAddressSpace())) + return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + + // If we're using MUBUF instructions for global memory, an SGPR base register + // is possible. Otherwise this needs to be a VGPR. + const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); + return AMDGPU::getValueMapping(PtrBank->getID(), Size); +} + const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { @@ -2352,7 +3248,6 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 2> OpdsMapping(2); unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); - LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); Register PtrReg = MI.getOperand(1).getReg(); LLT PtrTy = MRI.getType(PtrReg); unsigned AS = PtrTy.getAddressSpace(); @@ -2364,14 +3259,23 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); if (PtrBank == &AMDGPU::SGPRRegBank && - (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && - AS != AMDGPUAS::PRIVATE_ADDRESS) && - isScalarLoadLegal(MI)) { - // We have a uniform instruction so we want to use an SMRD load - ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); - PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); + SITargetLowering::isFlatGlobalAddrSpace(AS)) { + if (isScalarLoadLegal(MI)) { + // We have a uniform instruction so we want to use an SMRD load + ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); + } else { + ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + + // If we're using MUBUF instructions for global memory, an SGPR base + // register is possible. Otherwise this needs to be a VGPR. + unsigned PtrBankID = Subtarget.useFlatForGlobal() ? 
+ AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID; + + PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize); + } } else { - ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy); + ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); } @@ -2449,11 +3353,35 @@ AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, /// in RegBankSelect::Mode::Fast. Any mapping that would cause a /// VGPR to SGPR generated is illegal. /// +// Operands that must be SGPRs must accept potentially divergent VGPRs as +// legal. These will be dealt with in applyMappingImpl. +// const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); + if (MI.isCopy()) { + // The default logic bothers to analyze impossible alternative mappings. We + // want the most straightforward mapping, so just directly handle this. + const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI, + *TRI); + const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI, + *TRI); + assert(SrcBank && "src bank should have been assigned already"); + if (!DstBank) + DstBank = SrcBank; + + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + if (cannotCopy(*DstBank, *SrcBank, Size)) + return getInvalidInstructionMapping(); + + const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank); + return getInstructionMapping( + 1, /*Cost*/ 1, + /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); + } + if (MI.isRegSequence()) { // If any input is a VGPR, the result must be a VGPR. The default handling // assumes any copy between banks is legal. @@ -2592,6 +3520,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { LLVM_FALLTHROUGH; } case AMDGPU::G_PTR_ADD: + case AMDGPU::G_PTRMASK: case AMDGPU::G_ADD: case AMDGPU::G_SUB: case AMDGPU::G_MUL: @@ -2608,6 +3537,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_SMAX: case AMDGPU::G_UMIN: case AMDGPU::G_UMAX: + case AMDGPU::G_SHUFFLE_VECTOR: if (isSALUMapping(MI)) return getDefaultMappingSOP(MI); LLVM_FALLTHROUGH; @@ -2635,7 +3565,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_FMAXNUM_IEEE: case AMDGPU::G_FCANONICALIZE: case AMDGPU::G_INTRINSIC_TRUNC: + case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? + case AMDGPU::G_FSHR: // TODO: Expand for scalar case AMDGPU::G_AMDGPU_FFBH_U32: + case AMDGPU::G_AMDGPU_FMIN_LEGACY: + case AMDGPU::G_AMDGPU_FMAX_LEGACY: + case AMDGPU::G_AMDGPU_RCP_IFLAG: + case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: + case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: + case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: + case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: return getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { @@ -2664,6 +3603,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); break; } + case AMDGPU::G_DYN_STACKALLOC: { + // Result is always uniform, and a wave reduction is needed for the source. 
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); + break; + } case AMDGPU::G_INSERT: { unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; @@ -2719,12 +3665,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_BITCAST: case AMDGPU::G_INTTOPTR: case AMDGPU::G_PTRTOINT: - case AMDGPU::G_CTLZ: - case AMDGPU::G_CTLZ_ZERO_UNDEF: - case AMDGPU::G_CTTZ: - case AMDGPU::G_CTTZ_ZERO_UNDEF: - case AMDGPU::G_CTPOP: - case AMDGPU::G_BSWAP: case AMDGPU::G_BITREVERSE: case AMDGPU::G_FABS: case AMDGPU::G_FNEG: { @@ -2733,21 +3673,33 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); break; } + case AMDGPU::G_CTLZ_ZERO_UNDEF: + case AMDGPU::G_CTTZ_ZERO_UNDEF: + case AMDGPU::G_CTPOP: { + unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); + + // This should really be getValueMappingSGPR64Only, but allowing the generic + // code to handle the register split just makes using LegalizerHelper more + // difficult. + OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); + break; + } case AMDGPU::G_TRUNC: { Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); unsigned Bank = getRegBankID(Src, MRI, *TRI); unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); - OpdsMapping[0] = DstSize == 1 && Bank != AMDGPU::SGPRRegBankID ? - AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize) : - AMDGPU::getValueMapping(Bank, DstSize); + OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); break; } case AMDGPU::G_ZEXT: case AMDGPU::G_SEXT: - case AMDGPU::G_ANYEXT: { + case AMDGPU::G_ANYEXT: + case AMDGPU::G_SEXT_INREG: { Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); @@ -2765,17 +3717,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } - // TODO: Should anyext be split into 32-bit part as well? - if (MI.getOpcode() == AMDGPU::G_ANYEXT) { - OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize); - OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize); - } else { - // Scalar extend can use 64-bit BFE, but VGPRs require extending to - // 32-bits, and then to 64. - OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); - OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), - SrcSize); - } + // Scalar extend can use 64-bit BFE, but VGPRs require extending to + // 32-bits, and then to 64. + OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); + OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), + SrcSize); break; } case AMDGPU::G_FCMP: { @@ -2790,43 +3736,43 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_STORE: { assert(MI.getOperand(0).isReg()); unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - // FIXME: We need to specify a different reg bank once scalar stores - // are supported. 
+ + // FIXME: We need to specify a different reg bank once scalar stores are + // supported. const ValueMapping *ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); - // FIXME: Depending on the type of store, the pointer could be in - // the SGPR Reg bank. - // FIXME: Pointer size should be based on the address space. - const ValueMapping *PtrMapping = - AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64); - OpdsMapping[0] = ValMapping; - OpdsMapping[1] = PtrMapping; + OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); break; } - case AMDGPU::G_ICMP: { auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + + // See if the result register has already been constrained to vcc, which may + // happen due to control flow intrinsic lowering. + unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); - bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID && + bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && + Op2Bank == AMDGPU::SGPRRegBankID && Op3Bank == AMDGPU::SGPRRegBankID && (Size == 32 || (Size == 64 && (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && Subtarget.hasScalarCompareEq64())); - unsigned Op0Bank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; + DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; + unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; // TODO: Use 32-bit for scalar output size. // SCC results will need to be copied to a 32-bit SGPR virtual register. const unsigned ResultSize = 1; - OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, ResultSize); - OpdsMapping[1] = nullptr; // Predicate Operand. - OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); - OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size); + OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize); + OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size); + OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size); break; } case AMDGPU::G_EXTRACT_VECTOR_ELT: { @@ -2852,15 +3798,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); - unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); - OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize); - OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID, - InsertSize); + OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); + + // This is a weird case, because we need to break down the mapping based on + // the register bank of a different operand. 
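+    // A 64-bit inserted value destined for a VGPR vector must be split into two
+    // 32-bit halves, so the split decision follows the vector's bank
+    // (OutputBankID) rather than the bank of the inserted value itself.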
+ if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) { + OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID, + InsertSize); + } else { + assert(InsertSize == 32 || InsertSize == 64); + OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize); + } // The index can be either if the source vector is VGPR. OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); @@ -2878,6 +3831,116 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } break; } + case AMDGPU::G_AMDGPU_BUFFER_LOAD: + case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: + case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: + case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: + case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: + case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: + case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: + case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: + case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: + case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: + case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: + case AMDGPU::G_AMDGPU_BUFFER_STORE: + case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: + case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: + case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: + case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: { + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + + // rsrc + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + + // vindex + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + + // voffset + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + + // soffset + OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + + // Any remaining operands are immediates and were correctly null + // initialized. + break; + } + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { + // vdata_out + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + + // vdata_in + OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + + // rsrc + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + + // vindex + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + + // voffset + OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + + // soffset + OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); + + // Any remaining operands are immediates and were correctly null + // initialized. 
+ break; + } + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { + // vdata_out + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + + // vdata_in + OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + + // cmp + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + + // rsrc + OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + + // vindex + OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + + // voffset + OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); + + // soffset + OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI); + + // Any remaining operands are immediates and were correctly null + // initialized. + break; + } + case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { + // Lie and claim everything is legal, even though some need to be + // SGPRs. applyMapping will have to deal with it as a waterfall loop. + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + + // We need to convert this to a MUBUF if either the resource of offset is + // VGPR. + unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID(); + unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); + unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank); + + unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0); + break; + } case AMDGPU::G_INTRINSIC: { switch (MI.getIntrinsicID()) { default: @@ -2890,9 +3953,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_log_clamp: case Intrinsic::amdgcn_rcp: case Intrinsic::amdgcn_rcp_legacy: + case Intrinsic::amdgcn_sqrt: case Intrinsic::amdgcn_rsq: case Intrinsic::amdgcn_rsq_legacy: case Intrinsic::amdgcn_rsq_clamp: + case Intrinsic::amdgcn_fmul_legacy: case Intrinsic::amdgcn_ldexp: case Intrinsic::amdgcn_frexp_mant: case Intrinsic::amdgcn_frexp_exp: @@ -2911,8 +3976,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_fmad_ftz: case Intrinsic::amdgcn_mbcnt_lo: case Intrinsic::amdgcn_mbcnt_hi: - case Intrinsic::amdgcn_ubfe: - case Intrinsic::amdgcn_sbfe: case Intrinsic::amdgcn_mul_u24: case Intrinsic::amdgcn_mul_i24: case Intrinsic::amdgcn_lerp: @@ -2933,13 +3996,21 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_udot4: case Intrinsic::amdgcn_sdot8: case Intrinsic::amdgcn_udot8: - case Intrinsic::amdgcn_wwm: - case Intrinsic::amdgcn_wqm: + return getDefaultMappingVOP(MI); + case Intrinsic::amdgcn_sbfe: + case Intrinsic::amdgcn_ubfe: + if (isSALUMapping(MI)) + return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_ds_swizzle: case Intrinsic::amdgcn_ds_permute: case Intrinsic::amdgcn_ds_bpermute: case Intrinsic::amdgcn_update_dpp: + case Intrinsic::amdgcn_mov_dpp8: + case Intrinsic::amdgcn_mov_dpp: + case Intrinsic::amdgcn_wwm: + case Intrinsic::amdgcn_wqm: + case Intrinsic::amdgcn_softwqm: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_kernarg_segment_ptr: case Intrinsic::amdgcn_s_getpc: @@ -2954,26 +4025,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); break; } - case Intrinsic::amdgcn_s_buffer_load: { - // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS - 
Register RSrc = MI.getOperand(2).getReg(); // SGPR - Register Offset = MI.getOperand(3).getReg(); // SGPR/imm - - unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - unsigned Size2 = MRI.getType(RSrc).getSizeInBits(); - unsigned Size3 = MRI.getType(Offset).getSizeInBits(); - - unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI); - unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI); - - OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0); - OpdsMapping[1] = nullptr; // intrinsic id - - // Lie and claim everything is legal, even though some need to be - // SGPRs. applyMapping will have to deal with it as a waterfall loop. - OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc - OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3); - OpdsMapping[4] = nullptr; + case Intrinsic::amdgcn_ps_live: { + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); break; } case Intrinsic::amdgcn_div_scale: { @@ -2983,11 +4036,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); - OpdsMapping[3] = AMDGPU::getValueMapping( - getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize); - OpdsMapping[4] = AMDGPU::getValueMapping( - getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize); - + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); break; } case Intrinsic::amdgcn_class: { @@ -2997,10 +4047,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); - OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI), - Src0Size); - OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI), - Src1Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size); break; } case Intrinsic::amdgcn_icmp: @@ -3009,10 +4057,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // This is not VCCRegBank because this is not used in boolean contexts. 
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); - unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); - unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); - OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize); - OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); break; } case Intrinsic::amdgcn_readlane: { @@ -3054,6 +4100,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + break; + } case Intrinsic::amdgcn_mfma_f32_4x4x1f32: case Intrinsic::amdgcn_mfma_f32_4x4x4f16: case Intrinsic::amdgcn_mfma_i32_4x4x4i8: @@ -3086,9 +4142,46 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_interp_p1: + case Intrinsic::amdgcn_interp_p2: + case Intrinsic::amdgcn_interp_mov: + case Intrinsic::amdgcn_interp_p1_f16: + case Intrinsic::amdgcn_interp_p2_f16: { + const int M0Idx = MI.getNumOperands() - 1; + Register M0Reg = MI.getOperand(M0Idx).getReg(); + unsigned M0Bank = getRegBankID(M0Reg, MRI, *TRI, AMDGPU::SGPRRegBankID); + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) + OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + + // Must be SGPR, but we must take whatever the original bank is and fix it + // later. + OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); + break; + } + case Intrinsic::amdgcn_ballot: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize); + break; + } } break; } + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { + auto IntrID = MI.getIntrinsicID(); + const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID); + assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); + // Non-images can have complications from operands that allow both SGPR + // and VGPR. For now it's too complicated to figure out the final opcode + // to derive the register bank from the MCInstrDesc. 
+ assert(RSrcIntrin->IsImage); + return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); + } case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { auto IntrID = MI.getIntrinsicID(); switch (IntrID) { @@ -3100,13 +4193,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } - case Intrinsic::amdgcn_ds_append: - case Intrinsic::amdgcn_ds_consume: case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: { @@ -3118,17 +4207,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); break; } + case Intrinsic::amdgcn_ds_append: + case Intrinsic::amdgcn_ds_consume: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; + } case Intrinsic::amdgcn_exp_compr: - OpdsMapping[0] = nullptr; // IntrinsicID - // FIXME: These are immediate values which can't be read from registers. - OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); - OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); - // FIXME: Could we support packed types here? OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); - // FIXME: These are immediate values which can't be read from registers. - OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); - OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); break; case Intrinsic::amdgcn_exp: // FIXME: Could we support packed types here? @@ -3137,31 +4225,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); break; - case Intrinsic::amdgcn_buffer_load: { - Register RSrc = MI.getOperand(2).getReg(); // SGPR - Register VIndex = MI.getOperand(3).getReg(); // VGPR - Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm - - unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); - unsigned Size2 = MRI.getType(RSrc).getSizeInBits(); - unsigned Size3 = MRI.getType(VIndex).getSizeInBits(); - unsigned Size4 = MRI.getType(Offset).getSizeInBits(); - - unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI); - unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI); - - OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0); - OpdsMapping[1] = nullptr; // intrinsic id - - // Lie and claim everything is legal, even though some need to be - // SGPRs. applyMapping will have to deal with it as a waterfall loop. - OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc - OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3); - OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4); - OpdsMapping[5] = nullptr; - OpdsMapping[6] = nullptr; - break; - } case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // This must be an SGPR, but accept a VGPR. 
@@ -3170,8 +4233,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); break; } - case Intrinsic::amdgcn_end_cf: - case Intrinsic::amdgcn_init_exec: { + case Intrinsic::amdgcn_s_setreg: { + // This must be an SGPR, but accept a VGPR. + unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); + break; + } + case Intrinsic::amdgcn_end_cf: { unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; @@ -3227,7 +4296,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_init_exec_from_input: { unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); - OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } case Intrinsic::amdgcn_ds_gws_init: @@ -3251,15 +4319,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } default: - if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = - AMDGPU::lookupRsrcIntrinsic(IntrID)) { - // Non-images can have complications from operands that allow both SGPR - // and VGPR. For now it's too complicated to figure out the final opcode - // to derive the register bank from the MCInstrDesc. - if (RSrcIntrin->IsImage) - return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); - } - return getInvalidInstructionMapping(); } break; @@ -3319,9 +4378,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_ATOMICRMW_UMAX: case AMDGPU::G_ATOMICRMW_UMIN: case AMDGPU::G_ATOMICRMW_FADD: - case AMDGPU::G_ATOMIC_CMPXCHG: - case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: { - return getDefaultMappingAllVGPR(MI); + case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: + case AMDGPU::G_AMDGPU_ATOMIC_INC: + case AMDGPU::G_AMDGPU_ATOMIC_DEC: { + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; + } + case AMDGPU::G_ATOMIC_CMPXCHG: { + OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); + OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); + break; } case AMDGPU::G_BRCOND: { unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index 1ac7d3652a8b3..8f38ec4eeb3a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -69,13 +69,20 @@ public: void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const; - bool applyMappingWideLoad(MachineInstr &MI, - const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, - MachineRegisterInfo &MRI) const; + bool applyMappingDynStackAlloc(MachineInstr &MI, + const OperandsMapper &OpdMapper, + MachineRegisterInfo &MRI) const; + bool applyMappingLoad(MachineInstr &MI, + const OperandsMapper &OpdMapper, + MachineRegisterInfo &MRI) const; bool applyMappingImage(MachineInstr &MI, - const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + 
const OperandsMapper &OpdMapper, MachineRegisterInfo &MRI, int RSrcIdx) const; + bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const; + + bool applyMappingBFEIntrinsic(const OperandsMapper &OpdMapper, + bool Signed) const; void lowerScalarMinMax(MachineIRBuilder &B, MachineInstr &MI) const; @@ -91,6 +98,9 @@ public: /// See RegisterBankInfo::applyMapping. void applyMappingImpl(const OperandsMapper &OpdMapper) const override; + const ValueMapping *getValueMappingForPtr(const MachineRegisterInfo &MRI, + Register Ptr) const; + const RegisterBankInfo::InstructionMapping & getInstrMappingForLoad(const MachineInstr &MI) const; @@ -168,6 +178,15 @@ public: const InstructionMapping & getInstrMapping(const MachineInstr &MI) const override; + +private: + + bool foldExtractEltToCmpSelect(MachineInstr &MI, + MachineRegisterInfo &MRI, + const OperandsMapper &OpdMapper) const; + bool foldInsertEltToCmpSelect(MachineInstr &MI, + MachineRegisterInfo &MRI, + const OperandsMapper &OpdMapper) const; }; } // End llvm namespace. #endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td index c495316c5bce0..9f6ebd00cd97b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -7,16 +7,16 @@ //===----------------------------------------------------------------------===// def SGPRRegBank : RegisterBank<"SGPR", - [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512, SReg_1024] + [SReg_LO16, SReg_32, SReg_64, SReg_128, SReg_160, SReg_192, SReg_256, SReg_512, SReg_1024] >; def VGPRRegBank : RegisterBank<"VGPR", - [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512, VReg_1024] + [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_256, VReg_512, VReg_1024] >; // It is helpful to distinguish conditions from ordinary SGPRs. def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; def AGPRRegBank : RegisterBank <"AGPR", - [AGPR_32, AReg_64, AReg_128, AReg_512, AReg_1024] + [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_256, AReg_512, AReg_1024] >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp deleted file mode 100644 index 9806e6b0714f6..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ /dev/null @@ -1,142 +0,0 @@ -//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Parent TargetRegisterInfo class common to all hw codegen targets. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPURegisterInfo.h" -#include "AMDGPUTargetMachine.h" -#include "SIMachineFunctionInfo.h" -#include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" - -using namespace llvm; - -AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} - -//===----------------------------------------------------------------------===// -// Function handling callbacks - Functions are a seldom used feature of GPUS, so -// they are not supported at this time. 
-//===----------------------------------------------------------------------===// - -// Table of NumRegs sized pieces at every 32-bit offset. -static const uint16_t SubRegFromChannelTable[][32] = { - { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, - AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, - AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, - AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, - AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, - AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, - AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31 - }, - { - AMDGPU::sub0_sub1, AMDGPU::sub1_sub2, AMDGPU::sub2_sub3, AMDGPU::sub3_sub4, - AMDGPU::sub4_sub5, AMDGPU::sub5_sub6, AMDGPU::sub6_sub7, AMDGPU::sub7_sub8, - AMDGPU::sub8_sub9, AMDGPU::sub9_sub10, AMDGPU::sub10_sub11, AMDGPU::sub11_sub12, - AMDGPU::sub12_sub13, AMDGPU::sub13_sub14, AMDGPU::sub14_sub15, AMDGPU::sub15_sub16, - AMDGPU::sub16_sub17, AMDGPU::sub17_sub18, AMDGPU::sub18_sub19, AMDGPU::sub19_sub20, - AMDGPU::sub20_sub21, AMDGPU::sub21_sub22, AMDGPU::sub22_sub23, AMDGPU::sub23_sub24, - AMDGPU::sub24_sub25, AMDGPU::sub25_sub26, AMDGPU::sub26_sub27, AMDGPU::sub27_sub28, - AMDGPU::sub28_sub29, AMDGPU::sub29_sub30, AMDGPU::sub30_sub31, AMDGPU::NoSubRegister - }, - { - AMDGPU::sub0_sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub3_sub4_sub5, - AMDGPU::sub4_sub5_sub6, AMDGPU::sub5_sub6_sub7, AMDGPU::sub6_sub7_sub8, AMDGPU::sub7_sub8_sub9, - AMDGPU::sub8_sub9_sub10, AMDGPU::sub9_sub10_sub11, AMDGPU::sub10_sub11_sub12, AMDGPU::sub11_sub12_sub13, - AMDGPU::sub12_sub13_sub14, AMDGPU::sub13_sub14_sub15, AMDGPU::sub14_sub15_sub16, AMDGPU::sub15_sub16_sub17, - AMDGPU::sub16_sub17_sub18, AMDGPU::sub17_sub18_sub19, AMDGPU::sub18_sub19_sub20, AMDGPU::sub19_sub20_sub21, - AMDGPU::sub20_sub21_sub22, AMDGPU::sub21_sub22_sub23, AMDGPU::sub22_sub23_sub24, AMDGPU::sub23_sub24_sub25, - AMDGPU::sub24_sub25_sub26, AMDGPU::sub25_sub26_sub27, AMDGPU::sub26_sub27_sub28, AMDGPU::sub27_sub28_sub29, - AMDGPU::sub28_sub29_sub30, AMDGPU::sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister - }, - { - AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6, - AMDGPU::sub4_sub5_sub6_sub7, AMDGPU::sub5_sub6_sub7_sub8, AMDGPU::sub6_sub7_sub8_sub9, AMDGPU::sub7_sub8_sub9_sub10, - AMDGPU::sub8_sub9_sub10_sub11, AMDGPU::sub9_sub10_sub11_sub12, AMDGPU::sub10_sub11_sub12_sub13, AMDGPU::sub11_sub12_sub13_sub14, - AMDGPU::sub12_sub13_sub14_sub15, AMDGPU::sub13_sub14_sub15_sub16, AMDGPU::sub14_sub15_sub16_sub17, AMDGPU::sub15_sub16_sub17_sub18, - AMDGPU::sub16_sub17_sub18_sub19, AMDGPU::sub17_sub18_sub19_sub20, AMDGPU::sub18_sub19_sub20_sub21, AMDGPU::sub19_sub20_sub21_sub22, - AMDGPU::sub20_sub21_sub22_sub23, AMDGPU::sub21_sub22_sub23_sub24, AMDGPU::sub22_sub23_sub24_sub25, AMDGPU::sub23_sub24_sub25_sub26, - AMDGPU::sub24_sub25_sub26_sub27, AMDGPU::sub25_sub26_sub27_sub28, AMDGPU::sub26_sub27_sub28_sub29, AMDGPU::sub27_sub28_sub29_sub30, - AMDGPU::sub28_sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister - } -}; - -// FIXME: TableGen should generate something to make this manageable for all -// register classes. At a minimum we could use the opposite of -// composeSubRegIndices and go up from the base 32-bit subreg. 
-unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel, unsigned NumRegs) { - const unsigned NumRegIndex = NumRegs - 1; - - assert(NumRegIndex < array_lengthof(SubRegFromChannelTable) && - "Not implemented"); - assert(Channel < array_lengthof(SubRegFromChannelTable[0])); - return SubRegFromChannelTable[NumRegIndex][Channel]; -} - -void AMDGPURegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { - MCRegAliasIterator R(Reg, this, true); - - for (; R.isValid(); ++R) - Reserved.set(*R); -} - -#define GET_REGINFO_TARGET_DESC -#include "AMDGPUGenRegisterInfo.inc" - -// Forced to be here by one .inc -const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( - const MachineFunction *MF) const { - CallingConv::ID CC = MF->getFunction().getCallingConv(); - switch (CC) { - case CallingConv::C: - case CallingConv::Fast: - case CallingConv::Cold: - return CSR_AMDGPU_HighRegs_SaveList; - default: { - // Dummy to not crash RegisterClassInfo. - static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; - return &NoCalleeSavedReg; - } - } -} - -const MCPhysReg * -SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { - return nullptr; -} - -const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, - CallingConv::ID CC) const { - switch (CC) { - case CallingConv::C: - case CallingConv::Fast: - case CallingConv::Cold: - return CSR_AMDGPU_HighRegs_RegMask; - default: - return nullptr; - } -} - -Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const SIFrameLowering *TFI = - MF.getSubtarget<GCNSubtarget>().getFrameLowering(); - const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() - : FuncInfo->getStackPtrOffsetReg(); -} - -const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { - return CSR_AMDGPU_AllVGPRs_RegMask; -} - -const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { - return CSR_AMDGPU_AllAllocatableSRegs_RegMask; -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h deleted file mode 100644 index 9e713ca804a11..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ /dev/null @@ -1,38 +0,0 @@ -//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// TargetRegisterInfo interface that is implemented by all hw codegen -/// targets. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H -#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H - -#define GET_REGINFO_HEADER -#include "AMDGPUGenRegisterInfo.inc" - -namespace llvm { - -class GCNSubtarget; -class TargetInstrInfo; - -struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { - AMDGPURegisterInfo(); - - /// \returns the sub reg enum value for the given \p Channel - /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) - static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1); - - void reserveRegisterTuples(BitVector &, unsigned Reg) const; -}; - -} // End namespace llvm - -#endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td deleted file mode 100644 index ab71b7aa8a572..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td +++ /dev/null @@ -1,21 +0,0 @@ -//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Tablegen register definitions common to all hw codegen targets. -// -//===----------------------------------------------------------------------===// - -let Namespace = "AMDGPU" in { - -foreach Index = 0-31 in { - def sub#Index : SubRegIndex<32, !shl(Index, 5)>; -} - -} - -include "SIRegisterInfo.td" diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index 9a1e2fc42ed57..9c3d96de6d68a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -208,8 +208,8 @@ bool AMDGPURewriteOutArguments::doInitialization(Module &M) { #ifndef NDEBUG bool AMDGPURewriteOutArguments::isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const { - VectorType *VT0 = dyn_cast<VectorType>(Ty0); - VectorType *VT1 = dyn_cast<VectorType>(Ty1); + auto *VT0 = dyn_cast<FixedVectorType>(Ty0); + auto *VT1 = dyn_cast<FixedVectorType>(Ty1); if (!VT0 || !VT1) return false; @@ -409,7 +409,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { DL->getTypeSizeInBits(Val->getType())) { assert(isVec3ToVec4Shuffle(EffectiveEltTy, Val->getType())); Val = B.CreateShuffleVector(Val, UndefValue::get(Val->getType()), - { 0, 1, 2 }); + ArrayRef<int>{0, 1, 2}); } Val = B.CreateBitCast(Val, EffectiveEltTy); @@ -453,9 +453,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { PointerType *ArgType = cast<PointerType>(Arg.getType()); auto *EltTy = ArgType->getElementType(); - unsigned Align = Arg.getParamAlignment(); - if (Align == 0) - Align = DL->getABITypeAlignment(EltTy); + const auto Align = + DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy); Value *Val = B.CreateExtractValue(StubCall, RetIdx++); Type *PtrTy = Val->getType()->getPointerTo(ArgType->getAddressSpace()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 8d70536ec21c5..bc68310b2f5ca 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -198,6 +198,7 @@ def : SourceOfDivergence<int_r600_read_tidig_y>; def : SourceOfDivergence<int_r600_read_tidig_z>; def : SourceOfDivergence<int_amdgcn_atomic_inc>; def : SourceOfDivergence<int_amdgcn_atomic_dec>; +def : SourceOfDivergence<int_amdgcn_global_atomic_csub>; def : SourceOfDivergence<int_amdgcn_ds_fadd>; def : SourceOfDivergence<int_amdgcn_ds_fmin>; def : SourceOfDivergence<int_amdgcn_ds_fmax>; @@ -238,6 +239,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>; 
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>; def : SourceOfDivergence<int_amdgcn_ps_live>; def : SourceOfDivergence<int_amdgcn_ds_swizzle>; def : SourceOfDivergence<int_amdgcn_ds_ordered_add>; @@ -247,6 +249,7 @@ def : SourceOfDivergence<int_amdgcn_permlanex16>; def : SourceOfDivergence<int_amdgcn_mov_dpp>; def : SourceOfDivergence<int_amdgcn_mov_dpp8>; def : SourceOfDivergence<int_amdgcn_update_dpp>; +def : SourceOfDivergence<int_amdgcn_writelane>; def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>; def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>; @@ -270,5 +273,13 @@ def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>; def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>; def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>; +// The dummy boolean output is divergent from the IR's perspective, +// but the mask results are uniform. These produce a divergent and +// uniform result, so the returned struct is collectively divergent. +// isAlwaysUniform can override the extract of the uniform component. +def : SourceOfDivergence<int_amdgcn_if>; +def : SourceOfDivergence<int_amdgcn_else>; +def : SourceOfDivergence<int_amdgcn_loop>; + foreach intr = AMDGPUImageDimAtomicIntrinsics in def : SourceOfDivergence<intr>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 445e91092499a..213788ae0f67b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -59,13 +59,6 @@ R600Subtarget::initializeSubtargetDependencies(const Triple &TT, FullFS += FS; ParseSubtargetFeatures(GPU, FullFS); - // FIXME: I don't think think Evergreen has any useful support for - // denormals, but should be checked. Should we issue a warning somewhere - // if someone tries to enable these? - if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - FP32Denormals = false; - } - HasMulU24 = getGeneration() >= EVERGREEN; HasMulI24 = hasCaymanISA(); @@ -76,9 +69,6 @@ GCNSubtarget & GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS) { // Determine default and user-specified characteristics - // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be - // enabled, but some instructions do not respect them and they run at the - // double precision rate, so don't enable by default. // // We want to be able to turn these off, but making this a subtarget feature // for SI has the unhelpful behavior that it unsets everything else if you @@ -88,20 +78,11 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, // unset everything else if it is disabled // Assuming ECC is enabled is the conservative default. - SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,"); + SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,"); if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; - // FIXME: I don't think think Evergreen has any useful support for - // denormals, but should be checked. Should we issue a warning somewhere - // if someone tries to enable these? - if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - FullFS += "+fp64-fp16-denormals,"; - } else { - FullFS += "-fp32-denormals,"; - } - FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS // Disable mutually exclusive bits. 
@@ -145,12 +126,14 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, } // Don't crash on invalid devices. - if (WavefrontSize == 0) - WavefrontSize = 64; + if (WavefrontSizeLog2 == 0) + WavefrontSizeLog2 = 5; HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; - if (DoesNotSupportXNACK && EnableXNACK) { + // Disable XNACK on targets where it is not enabled by default unless it is + // explicitly requested. + if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) { ToggleFeature(AMDGPU::FeatureXNACK); EnableXNACK = false; } @@ -170,8 +153,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT), Has16BitInsts(false), HasMadMixInsts(false), - FP32Denormals(false), - FPExceptions(false), + HasMadMacF32Insts(false), + HasDsSrc2Insts(false), HasSDWA(false), HasVOP3PInsts(false), HasMulI24(true), @@ -182,7 +165,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : HasTrigReducedRange(false), MaxWavesPerEU(10), LocalMemorySize(0), - WavefrontSize(0) + WavefrontSizeLog2(0) { } GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, @@ -196,9 +179,9 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, MaxPrivateElementSize(0), FastFMAF32(false), + FastDenormalF32(false), HalfRate64Ops(false), - FP64FP16Denormals(false), FlatForGlobal(false), AutoWaitcntBeforeBarrier(false), CodeObjectV3(false), @@ -224,6 +207,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, GFX8Insts(false), GFX9Insts(false), GFX10Insts(false), + GFX10_3Insts(false), GFX7GFX8GFX9Insts(false), SGPRInitBug(false), HasSMemRealTime(false), @@ -241,7 +225,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasDPP(false), HasDPP8(false), HasR128A16(false), + HasGFX10A16(false), + HasG16(false), HasNSAEncoding(false), + GFX10_BEncoding(false), HasDLInsts(false), HasDot1Insts(false), HasDot2Insts(false), @@ -256,6 +243,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DoesNotSupportSRAMECC(false), HasNoSdstCMPX(false), HasVscnt(false), + HasGetWaveIdInst(false), + HasSMemTimeInst(false), HasRegisterBanking(false), HasVOP3Literal(false), HasNoDataDepHazard(false), @@ -287,6 +276,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); + InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering())); Legalizer.reset(new AMDGPULegalizerInfo(*this, TM)); RegBankInfo.reset(new AMDGPURegisterBankInfo(*this)); InstSelector.reset(new AMDGPUInstructionSelector( @@ -325,18 +315,41 @@ unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; } +// FIXME: Should return min,max range. 
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &F) const { - unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; - unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); - if (!WorkGroupsPerCu) + const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second; + const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize); + if (!MaxWorkGroupsPerCu) return 0; - unsigned MaxWaves = getMaxWavesPerEU(); - unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; - unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); - NumWaves = std::min(NumWaves, MaxWaves); - NumWaves = std::max(NumWaves, 1u); - return NumWaves; + + const unsigned WaveSize = getWavefrontSize(); + + // FIXME: Do we need to account for alignment requirement of LDS rounding the + // size up? + // Compute restriction based on LDS usage + unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u); + + // This can be queried with more LDS than is possible, so just assume the + // worst. + if (NumGroups == 0) + return 1; + + NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups); + + // Round to the number of waves. + const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize; + unsigned MaxWaves = NumGroups * MaxGroupNumWaves; + + // Clamp to the maximum possible number of waves. + MaxWaves = std::min(MaxWaves, getMaxWavesPerEU()); + + // FIXME: Needs to be a multiple of the group size? + //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves); + + assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() && + "computed invalid occupancy"); + return MaxWaves; } unsigned @@ -396,13 +409,10 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( // number of waves per execution unit to values implied by requested // minimum/maximum flat work group sizes. unsigned MinImpliedByFlatWorkGroupSize = - getMaxWavesPerEU(FlatWorkGroupSizes.second); - bool RequestedFlatWorkGroupSize = false; - - if (F.hasFnAttribute("amdgpu-flat-work-group-size")) { - Default.first = MinImpliedByFlatWorkGroupSize; - RequestedFlatWorkGroupSize = true; - } + getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second); + Default.first = MinImpliedByFlatWorkGroupSize; + bool RequestedFlatWorkGroupSize = + F.hasFnAttribute("amdgpu-flat-work-group-size"); // Requested minimum/maximum number of waves per execution unit. std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( @@ -414,9 +424,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( // Make sure requested values do not violate subtarget's specifications. 
if (Requested.first < getMinWavesPerEU() || - Requested.first > getMaxWavesPerEU()) - return Default; - if (Requested.second > getMaxWavesPerEU()) + Requested.second > getMaxWavesPerEU()) return Default; // Make sure requested values are compatible with values implied by requested @@ -497,12 +505,12 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, const DataLayout &DL = F.getParent()->getDataLayout(); uint64_t ExplicitArgBytes = 0; - MaxAlign = Align::None(); + MaxAlign = Align(1); for (const Argument &Arg : F.args()) { Type *ArgTy = Arg.getType(); - const Align Alignment(DL.getABITypeAlignment(ArgTy)); + const Align Alignment = DL.getABITypeAlign(ArgTy); uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize; MaxAlign = std::max(MaxAlign, Alignment); @@ -622,13 +630,12 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { return 2; // VCC. } -unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF, - unsigned LDSSize, +unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, unsigned NumSGPRs, unsigned NumVGPRs) const { unsigned Occupancy = std::min(getMaxWavesPerEU(), - getOccupancyWithLocalMemSize(LDSSize, MF.getFunction())); + getOccupancyWithLocalMemSize(LDSSize, F)); if (NumSGPRs) Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs)); if (NumVGPRs) @@ -716,20 +723,20 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { return MaxNumVGPRs; } -void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst, - SDep &Dep) const { +void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, + int UseOpIdx, SDep &Dep) const { if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || - !Src->isInstr() || !Dst->isInstr()) + !Def->isInstr() || !Use->isInstr()) return; - MachineInstr *SrcI = Src->getInstr(); - MachineInstr *DstI = Dst->getInstr(); + MachineInstr *DefI = Def->getInstr(); + MachineInstr *UseI = Use->getInstr(); - if (SrcI->isBundle()) { + if (DefI->isBundle()) { const SIRegisterInfo *TRI = getRegisterInfo(); auto Reg = Dep.getReg(); - MachineBasicBlock::const_instr_iterator I(SrcI->getIterator()); - MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end()); + MachineBasicBlock::const_instr_iterator I(DefI->getIterator()); + MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end()); unsigned Lat = 0; for (++I; I != E && I->isBundledWithPred(); ++I) { if (I->modifiesRegister(Reg, TRI)) @@ -738,12 +745,12 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst, --Lat; } Dep.setLatency(Lat); - } else if (DstI->isBundle()) { + } else if (UseI->isBundle()) { const SIRegisterInfo *TRI = getRegisterInfo(); auto Reg = Dep.getReg(); - MachineBasicBlock::const_instr_iterator I(DstI->getIterator()); - MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end()); - unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI); + MachineBasicBlock::const_instr_iterator I(UseI->getIterator()); + MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end()); + unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI); for (++I; I != E && I->isBundledWithPred() && Lat; ++I) { if (I->readsRegister(Reg, TRI)) break; @@ -754,53 +761,6 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst, } namespace { -struct MemOpClusterMutation : ScheduleDAGMutation { - const SIInstrInfo *TII; - - 
MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {} - - void apply(ScheduleDAGInstrs *DAG) override { - SUnit *SUa = nullptr; - // Search for two consequent memory operations and link them - // to prevent scheduler from moving them apart. - // In DAG pre-process SUnits are in the original order of - // the instructions before scheduling. - for (SUnit &SU : DAG->SUnits) { - MachineInstr &MI2 = *SU.getInstr(); - if (!MI2.mayLoad() && !MI2.mayStore()) { - SUa = nullptr; - continue; - } - if (!SUa) { - SUa = &SU; - continue; - } - - MachineInstr &MI1 = *SUa->getInstr(); - if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) || - (TII->isFLAT(MI1) && TII->isFLAT(MI2)) || - (TII->isSMRD(MI1) && TII->isSMRD(MI2)) || - (TII->isDS(MI1) && TII->isDS(MI2))) { - SU.addPredBarrier(SUa); - - for (const SDep &SI : SU.Preds) { - if (SI.getSUnit() != SUa) - SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial)); - } - - if (&SU != &DAG->ExitSU) { - for (const SDep &SI : SUa->Succs) { - if (SI.getSUnit() != &SU) - SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial)); - } - } - } - - SUa = &SU; - } - } -}; - struct FillMFMAShadowMutation : ScheduleDAGMutation { const SIInstrInfo *TII; @@ -927,7 +887,6 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation { void GCNSubtarget::getPostRAMutations( std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo)); Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo)); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 19a240800ba14..c833bfbcf9366 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -16,6 +16,7 @@ #include "AMDGPU.h" #include "AMDGPUCallLowering.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "R600FrameLowering.h" #include "R600ISelLowering.h" #include "R600InstrInfo.h" @@ -24,6 +25,7 @@ #include "SIInstrInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" @@ -65,8 +67,8 @@ private: protected: bool Has16BitInsts; bool HasMadMixInsts; - bool FP32Denormals; - bool FPExceptions; + bool HasMadMacF32Insts; + bool HasDsSrc2Insts; bool HasSDWA; bool HasVOP3PInsts; bool HasMulI24; @@ -77,7 +79,7 @@ protected: bool HasTrigReducedRange; unsigned MaxWavesPerEU; int LocalMemorySize; - unsigned WavefrontSize; + char WavefrontSizeLog2; public: AMDGPUSubtarget(const Triple &TT); @@ -140,6 +142,10 @@ public: return isAmdHsaOS() || isMesaKernel(F); } + bool isGCN() const { + return TargetTriple.getArch() == Triple::amdgcn; + } + bool has16BitInsts() const { return Has16BitInsts; } @@ -148,17 +154,12 @@ public: return HasMadMixInsts; } - bool hasFP32Denormals(const Function &F) const { - // FIXME: This should not be a property of the subtarget. This should be a - // property with a default set by the calling convention which can be - // overridden by attributes. For now, use the subtarget feature as a - // placeholder attribute. The function arguments only purpose is to - // discourage use without a function context until this is removed. 
- return FP32Denormals; + bool hasMadMacF32Insts() const { + return HasMadMacF32Insts || !isGCN(); } - bool hasFPExceptions() const { - return FPExceptions; + bool hasDsSrc2Insts() const { + return HasDsSrc2Insts; } bool hasSDWA() const { @@ -194,7 +195,11 @@ public: } unsigned getWavefrontSize() const { - return WavefrontSize; + return 1 << WavefrontSizeLog2; + } + + unsigned getWavefrontSizeLog2() const { + return WavefrontSizeLog2; } int getLocalMemorySize() const { @@ -221,9 +226,10 @@ public: /// \returns Maximum flat work group size supported by the subtarget. virtual unsigned getMaxFlatWorkGroupSize() const = 0; - /// \returns Maximum number of waves per execution unit supported by the - /// subtarget and limited by given \p FlatWorkGroupSize. - virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const = 0; + /// \returns Number of waves per execution unit required to support the given + /// \p FlatWorkGroupSize. + virtual unsigned + getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0; /// \returns Minimum number of waves per execution unit supported by the /// subtarget. @@ -246,6 +252,13 @@ public: uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; + /// \returns Corresponsing DWARF register number mapping flavour for the + /// \p WavefrontSize. + AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const { + return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32 + : AMDGPUDwarfFlavour::Wave64; + } + virtual ~AMDGPUSubtarget() {} }; @@ -278,6 +291,7 @@ public: private: /// GlobalISel related APIs. std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; + std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; std::unique_ptr<InstructionSelector> InstSelector; std::unique_ptr<LegalizerInfo> Legalizer; std::unique_ptr<RegisterBankInfo> RegBankInfo; @@ -292,10 +306,10 @@ protected: // Possibly statically set by tablegen, but may want to be overridden. bool FastFMAF32; + bool FastDenormalF32; bool HalfRate64Ops; // Dynamially set bits that enable features. - bool FP64FP16Denormals; bool FlatForGlobal; bool AutoWaitcntBeforeBarrier; bool CodeObjectV3; @@ -325,6 +339,7 @@ protected: bool GFX8Insts; bool GFX9Insts; bool GFX10Insts; + bool GFX10_3Insts; bool GFX7GFX8GFX9Insts; bool SGPRInitBug; bool HasSMemRealTime; @@ -342,7 +357,10 @@ protected: bool HasDPP; bool HasDPP8; bool HasR128A16; + bool HasGFX10A16; + bool HasG16; bool HasNSAEncoding; + bool GFX10_BEncoding; bool HasDLInsts; bool HasDot1Insts; bool HasDot2Insts; @@ -357,6 +375,8 @@ protected: bool DoesNotSupportSRAMECC; bool HasNoSdstCMPX; bool HasVscnt; + bool HasGetWaveIdInst; + bool HasSMemTimeInst; bool HasRegisterBanking; bool HasVOP3Literal; bool HasNoDataDepHazard; @@ -426,6 +446,10 @@ public: return CallLoweringInfo.get(); } + const InlineAsmLowering *getInlineAsmLowering() const override { + return InlineAsmLoweringInfo.get(); + } + InstructionSelector *getInstructionSelector() const override { return InstSelector.get(); } @@ -453,10 +477,6 @@ public: return (Generation)Gen; } - unsigned getWavefrontSizeLog2() const { - return Log2_32(WavefrontSize); - } - /// Return the number of high bits known to be zero fror a frame index. 
unsigned getKnownHighZeroBitsForFrameIndex() const { return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); @@ -506,6 +526,10 @@ public: return getGeneration() >= VOLCANIC_ISLANDS; } + bool hasFractBug() const { + return getGeneration() == SOUTHERN_ISLANDS; + } + bool hasBFE() const { return true; } @@ -587,6 +611,11 @@ public: return getGeneration() <= SEA_ISLANDS; } + /// Writes to VCC_LO/VCC_HI update the VCCZ flag. + bool partialVCCWritesUpdateVCCZ() const { + return getGeneration() >= GFX10; + } + /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR /// was written by a VALU instruction. bool hasSMRDReadVALUDefHazard() const { @@ -617,20 +646,6 @@ public: unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const; - /// Alias for hasFP64FP16Denormals - bool hasFP16Denormals(const Function &F) const { - return FP64FP16Denormals; - } - - /// Alias for hasFP64FP16Denormals - bool hasFP64Denormals(const Function &F) const { - return FP64FP16Denormals; - } - - bool hasFP64FP16Denormals(const Function &F) const { - return FP64FP16Denormals; - } - bool supportsMinMaxDenormModes() const { return getGeneration() >= AMDGPUSubtarget::GFX9; } @@ -724,6 +739,18 @@ public: return ScalarFlatScratchInsts; } + bool hasGlobalAddTidInsts() const { + return GFX10_BEncoding; + } + + bool hasAtomicCSub() const { + return GFX10_BEncoding; + } + + bool hasMultiDwordFlatScratchAddressing() const { + return getGeneration() >= GFX9; + } + bool hasFlatSegmentOffsetBug() const { return HasFlatSegmentOffsetBug; } @@ -853,6 +880,14 @@ public: return HasVscnt; } + bool hasGetWaveIdInst() const { + return HasGetWaveIdInst; + } + + bool hasSMemTimeInst() const { + return HasSMemTimeInst; + } + bool hasRegisterBanking() const { return HasRegisterBanking; } @@ -890,30 +925,6 @@ public: void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } - /// \returns Number of execution units per compute unit supported by the - /// subtarget. - unsigned getEUsPerCU() const { - return AMDGPU::IsaInfo::getEUsPerCU(this); - } - - /// \returns Maximum number of waves per compute unit supported by the - /// subtarget without any kind of limitation. - unsigned getMaxWavesPerCU() const { - return AMDGPU::IsaInfo::getMaxWavesPerCU(this); - } - - /// \returns Maximum number of waves per compute unit supported by the - /// subtarget and limited by given \p FlatWorkGroupSize. - unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize); - } - - /// \returns Number of waves per work group supported by the subtarget and - /// limited by given \p FlatWorkGroupSize. 
- unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize); - } - // static wrappers static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); @@ -979,6 +990,14 @@ public: return HasR128A16; } + bool hasGFX10A16() const { + return HasGFX10A16; + } + + bool hasA16() const { return hasR128A16() || hasGFX10A16(); } + + bool hasG16() const { return HasG16; } + bool hasOffset3fBug() const { return HasOffset3fBug; } @@ -987,6 +1006,14 @@ public: return HasNSAEncoding; } + bool hasGFX10_BEncoding() const { + return GFX10_BEncoding; + } + + bool hasGFX10_3Insts() const { + return GFX10_3Insts; + } + bool hasMadF16() const; bool enableSIScheduler() const { @@ -1059,6 +1086,8 @@ public: return HasNSAtoVMEMBug; } + bool hasHardClauses() const { return getGeneration() >= GFX10; } + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; @@ -1071,7 +1100,7 @@ public: /// registers if provided. /// Note, occupancy can be affected by the scratch allocation as well, but /// we do not have enough information to compute it. - unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0, + unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; /// \returns true if the flat_scratch register should be initialized with the @@ -1178,7 +1207,7 @@ public: const override; bool isWave32() const { - return WavefrontSize == 32; + return getWavefrontSize() == 32; } const TargetRegisterClass *getBoolRC() const { @@ -1201,10 +1230,11 @@ public: return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); } - /// \returns Maximum number of waves per execution unit supported by the - /// subtarget and limited by given \p FlatWorkGroupSize. - unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override { - return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize); + /// \returns Number of waves per execution unit required to support the given + /// \p FlatWorkGroupSize. + unsigned + getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { + return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); } /// \returns Minimum number of waves per execution unit supported by the @@ -1213,7 +1243,8 @@ public: return AMDGPU::IsaInfo::getMinWavesPerEU(this); } - void adjustSchedDependency(SUnit *Src, SUnit *Dst, SDep &Dep) const override; + void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, + SDep &Dep) const override; }; class R600Subtarget final : public R600GenSubtargetInfo, @@ -1338,10 +1369,11 @@ public: return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); } - /// \returns Maximum number of waves per execution unit supported by the - /// subtarget and limited by given \p FlatWorkGroupSize. - unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override { - return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize); + /// \returns Number of waves per execution unit required to support the given + /// \p FlatWorkGroupSize. 
+ unsigned + getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { + return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); } /// \returns Minimum number of waves per execution unit supported by the diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index eb30d659bf0b5..b4b10835837cd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -16,6 +16,7 @@ #include "AMDGPU.h" #include "AMDGPUAliasAnalysis.h" #include "AMDGPUCallLowering.h" +#include "AMDGPUExportClustering.h" #include "AMDGPUInstructionSelector.h" #include "AMDGPULegalizerInfo.h" #include "AMDGPUMacroFusion.h" @@ -23,6 +24,7 @@ #include "AMDGPUTargetTransformInfo.h" #include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "R600MachineScheduler.h" #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" @@ -30,6 +32,7 @@ #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/Localizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/Passes.h" @@ -138,6 +141,13 @@ static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt( cl::init(true), cl::Hidden); +static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt( + "amdgpu-fixed-function-abi", + cl::desc("Enable all implicit function arguments"), + cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI), + cl::init(false), + cl::Hidden); + // Enable lib calls simplifications static cl::opt<bool> EnableLibCallSimplify( "amdgpu-simplify-libcall", @@ -183,6 +193,11 @@ static cl::opt<bool> EnableScalarIRPasses( cl::init(true), cl::Hidden); +static cl::opt<bool> EnableStructurizerWorkarounds( + "amdgpu-enable-structurizer-workarounds", + cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), + cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); @@ -217,23 +232,29 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPULowerKernelAttributesPass(*PR); initializeAMDGPULowerIntrinsicsPass(*PR); initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); + initializeAMDGPUPostLegalizerCombinerPass(*PR); + initializeAMDGPUPreLegalizerCombinerPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); + initializeAMDGPUPromoteAllocaToVectorPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); initializeAMDGPUPropagateAttributesEarlyPass(*PR); initializeAMDGPUPropagateAttributesLatePass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); + initializeSIInsertHardClausesPass(*PR); initializeSIInsertWaitcntsPass(*PR); initializeSIModeRegisterPass(*PR); initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); initializeSIRemoveShortExecBranchesPass(*PR); + initializeSIPreEmitPeepholePass(*PR); initializeSIInsertSkipsPass(*PR); initializeSIMemoryLegalizerPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); initializeSIPreAllocateWWMRegsPass(*PR); initializeSIFormMemoryClausesPass(*PR); + initializeSIPostRABundlerPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); 
initializeAMDGPUExternalAAWrapperPass(*PR); @@ -243,6 +264,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUPrintfRuntimeBindingPass(*PR); initializeGCNRegBankReassignPass(*PR); initializeGCNNSAReassignPass(*PR); + initializeSIAddIMGInitPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -264,6 +286,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); + DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; } @@ -363,10 +386,17 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), TLOF(createTLOF(getTargetTriple())) { initAsmInfo(); + if (TT.getArch() == Triple::amdgcn) { + if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64")) + MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64)); + else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32")) + MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32)); + } } bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; +bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; @@ -416,20 +446,19 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { } PM.add(createAMDGPUUnifyMetadataPass()); PM.add(createAMDGPUPrintfRuntimeBinding()); - PM.add(createAMDGPUPropagateAttributesLatePass(this)); - if (Internalize) { + if (Internalize) PM.add(createInternalizePass(mustPreserveGV)); + PM.add(createAMDGPUPropagateAttributesLatePass(this)); + if (Internalize) PM.add(createGlobalDCEPass()); - } if (EarlyInline) PM.add(createAMDGPUAlwaysInlinePass(false)); }); - const auto &Opt = Options; Builder.addExtension( PassManagerBuilder::EP_EarlyAsPossible, - [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &, - legacy::PassManagerBase &PM) { + [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &, + legacy::PassManagerBase &PM) { if (AMDGPUAA) { PM.add(createAMDGPUAAWrapperPass()); PM.add(createAMDGPUExternalAAWrapperPass()); @@ -437,12 +466,12 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this)); PM.add(llvm::createAMDGPUUseNativeCallsPass()); if (LibCallSimplify) - PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this)); + PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this)); }); Builder.addExtension( PassManagerBuilder::EP_CGSCCOptimizerLate, - [](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) { // Add infer address spaces pass to the opt pipeline after inlining // but before SROA to increase SROA opportunities. PM.add(createInferAddressSpacesPass()); @@ -450,6 +479,11 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { // This should run after inlining to have any chance of doing anything, // and before other cleanup optimizations. PM.add(createAMDGPULowerKernelAttributesPass()); + + // Promote alloca to vector before SROA and loop unroll. If we manage + // to eliminate allocas before unroll we may choose to unroll less. 
+ if (EnableOpt) + PM.add(createAMDGPUPromoteAllocaToVector()); }); } @@ -617,7 +651,9 @@ public: bool addILPOpts() override; bool addInstSelector() override; bool addIRTranslator() override; + void addPreLegalizeMachineIR() override; bool addLegalizeMachineIR() override; + void addPreRegBankSelect() override; bool addRegBankSelect() override; bool addGlobalInstructionSelect() override; void addFastRegAlloc() override; @@ -751,10 +787,15 @@ void AMDGPUPassConfig::addCodeGenPrepare() { if (EnableLoadStoreVectorizer) addPass(createLoadStoreVectorizerPass()); + + // LowerSwitch pass may introduce unreachable blocks that can + // cause unexpected behavior for subsequent passes. Placing it + // here seems better that these blocks would get cleaned up by + // UnreachableBlockElim inserted next in the pass flow. + addPass(createLowerSwitchPass()); } bool AMDGPUPassConfig::addPreISel() { - addPass(createLowerSwitchPass()); addPass(createFlattenCFGPass()); return false; } @@ -836,7 +877,11 @@ bool GCNPassConfig::addPreISel() { // regions formed by them. addPass(&AMDGPUUnifyDivergentExitNodesID); if (!LateCFGStructurize) { - addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions + if (EnableStructurizerWorkarounds) { + addPass(createFixIrreduciblePass()); + addPass(createUnifyLoopExitsPass()); + } + addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions } addPass(createSinkingPass()); addPass(createAMDGPUAnnotateUniformValues()); @@ -885,6 +930,12 @@ bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(&SIFixSGPRCopiesID); addPass(createSILowerI1CopiesPass()); + // TODO: We have to add FinalizeISel + // to expand V_ADD/SUB_U64_PSEUDO before SIFixupVectorISel + // that expects V_ADD/SUB -> A_ADDC/SUBB pairs expanded. + // Will be removed as soon as SIFixupVectorISel is changed + // to work with V_ADD/SUB_U64_PSEUDO instead. + addPass(&FinalizeISelID); addPass(createSIFixupVectorISelPass()); addPass(createSIAddIMGInitPass()); return false; @@ -895,11 +946,22 @@ bool GCNPassConfig::addIRTranslator() { return false; } +void GCNPassConfig::addPreLegalizeMachineIR() { + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + addPass(createAMDGPUPreLegalizeCombiner(IsOptNone)); + addPass(new Localizer()); +} + bool GCNPassConfig::addLegalizeMachineIR() { addPass(new Legalizer()); return false; } +void GCNPassConfig::addPreRegBankSelect() { + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + addPass(createAMDGPUPostLegalizeCombiner(IsOptNone)); +} + bool GCNPassConfig::addRegBankSelect() { addPass(new RegBankSelect()); return false; @@ -933,12 +995,9 @@ void GCNPassConfig::addFastRegAlloc() { } void GCNPassConfig::addOptimizedRegAlloc() { - if (OptExecMaskPreRA) { + if (OptExecMaskPreRA) insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); - insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID); - } else { - insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); - } + insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); // This must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of @@ -973,6 +1032,7 @@ void GCNPassConfig::addPostRegAlloc() { } void GCNPassConfig::addPreSched2() { + addPass(&SIPostRABundlerID); } void GCNPassConfig::addPreEmitPass() { @@ -993,9 +1053,12 @@ void GCNPassConfig::addPreEmitPass() { // FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. 
It would // be better for it to emit S_NOP <N> when possible. addPass(&PostRAHazardRecognizerID); + if (getOptLevel() > CodeGenOpt::None) + addPass(&SIInsertHardClausesID); addPass(&SIRemoveShortExecBranchesID); addPass(&SIInsertSkipsPassID); + addPass(&SIPreEmitPeepholeID); addPass(&BranchRelaxationPassID); } @@ -1024,11 +1087,13 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->initializeBaseYamlFields(YamlMFI); - auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) { - if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) { + auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) { + Register TempReg; + if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) { SourceRange = RegName.SourceRange; return true; } + RegVal = TempReg; return false; }; @@ -1046,7 +1111,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo( }; if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) || - parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) || parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) || parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg)) return true; @@ -1056,11 +1120,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo( return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg); } - if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG && - !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) { - return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg); - } - if (MFI->FrameOffsetReg != AMDGPU::FP_REG && !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) { return diagnoseRegisterClass(YamlMFI.FrameOffsetReg); @@ -1080,7 +1139,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo( return false; if (A->IsRegister) { - unsigned Reg; + Register Reg; if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) { SourceRange = A->RegisterName.SourceRange; return true; @@ -1154,8 +1213,10 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->Mode.IEEE = YamlMFI.Mode.IEEE; MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp; - MFI->Mode.FP32Denormals = YamlMFI.Mode.FP32Denormals; - MFI->Mode.FP64FP16Denormals = YamlMFI.Mode.FP64FP16Denormals; + MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals; + MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals; + MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals; + MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals; return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 70fa3961236f2..e223fecc88195 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -39,6 +39,7 @@ protected: public: static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; + static bool EnableFixedFunctionABI; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, @@ -56,8 +57,9 @@ public: void adjustPassManager(PassManagerBuilder &) override; /// Get the integer value of a null pointer in the given address space. - uint64_t getNullPointerValue(unsigned AddrSpace) const { + static int64_t getNullPointerValue(unsigned AddrSpace) { return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || + AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || AddrSpace == AMDGPUAS::REGION_ADDRESS) ? 
-1 : 0; } }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h index 819bebb7932d7..ed564ec1ad547 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -15,9 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H -#include "AMDGPU.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index c4eeb81c5133e..542a5f006c0f7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -69,6 +69,21 @@ static cl::opt<unsigned> UnrollThresholdIf( cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(150), cl::Hidden); +static cl::opt<bool> UnrollRuntimeLocal( + "amdgpu-unroll-runtime-local", + cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), + cl::init(true), cl::Hidden); + +static cl::opt<bool> UseLegacyDA( + "amdgpu-use-legacy-divergence-analysis", + cl::desc("Enable legacy divergence analysis for AMDGPU"), + cl::init(false), cl::Hidden); + +static cl::opt<unsigned> UnrollMaxBlockToAnalyze( + "amdgpu-unroll-max-block-to-analyze", + cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), + cl::init(20), cl::Hidden); + static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth = 0) { const Instruction *I = dyn_cast<Instruction>(Cond); @@ -172,6 +187,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, (!isa<GlobalVariable>(GEP->getPointerOperand()) && !isa<Argument>(GEP->getPointerOperand()))) continue; + LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n" + << *L << " due to LDS use.\n"); + UP.Runtime = UnrollRuntimeLocal; } // Check if GEP depends on a value defined by this loop itself. @@ -210,13 +228,22 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, if (UP.Threshold >= MaxBoost) return; } + + // If we got a GEP in a small BB from inner loop then increase max trip + // count to analyze for better estimation cost in unroll + if (L->empty() && BB->size() < UnrollMaxBlockToAnalyze) + UP.MaxIterationsCountToAnalyze = 32; } } +void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP) { + BaseT::getPeelingPreferences(L, SE, PP); +} unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { // The concept of vector registers doesn't really exist. Some packed vector // operations operate on the normal 32-bit registers. 
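The hunk continuing below replaces the hard-coded budget of 256 with the occupancy-derived MaxVGPRs, and the new getNumberOfRegisters(unsigned RCID) overload divides that budget by the number of 32-bit slots a value of the class occupies. A minimal standalone sketch of that arithmetic, using plain integers and an invented helper name rather than the TTI interface:

    #include <cstdio>

    // How many values of a given bit width fit in a VGPR budget: mirrors the
    // arithmetic of getNumberOfRegisters(unsigned RCID) further down in this hunk.
    unsigned valuesPerBudget(unsigned maxVGPRs, unsigned regSizeInBits) {
      unsigned dwordsPerValue = (regSizeInBits + 31) / 32; // 32-bit slots per value
      return maxVGPRs / dwordsPerValue;
    }

    int main() {
      // With the old fixed budget of 256 VGPRs, a 128-bit register class holds
      // 256 / 4 = 64 values, while a 32-bit class holds all 256. A smaller,
      // occupancy-derived MaxVGPRs shrinks both figures proportionally.
      std::printf("%u %u\n", valuesPerBudget(256, 128), valuesPerBudget(256, 32));
      return 0;
    }
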
- return 256; + return MaxVGPRs; } unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const { @@ -225,6 +252,13 @@ unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const { return getHardwareNumberOfRegisters(Vec) >> 3; } +unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const { + const SIRegisterInfo *TRI = ST->getRegisterInfo(); + const TargetRegisterClass *RC = TRI->getRegClass(RCID); + unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32; + return getHardwareNumberOfRegisters(false) / NumVGPRs; +} + unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const { return 32; } @@ -234,8 +268,8 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const { } unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, - unsigned ChainSizeInBytes, - VectorType *VecTy) const { + unsigned ChainSizeInBytes, + VectorType *VecTy) const { unsigned VecRegBitWidth = VF * LoadSize; if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32) // TODO: Support element-size less than 32bit? @@ -262,20 +296,16 @@ unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { return 512; } - if (AddrSpace == AMDGPUAS::FLAT_ADDRESS || - AddrSpace == AMDGPUAS::LOCAL_ADDRESS || - AddrSpace == AMDGPUAS::REGION_ADDRESS) - return 128; - if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) return 8 * ST->getMaxPrivateElementSize(); - llvm_unreachable("unhandled address space"); + // Common to flat, global, local and region. Assume for unknown addrspace. + return 128; } bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, - unsigned Alignment, - unsigned AddrSpace) const { + Align Alignment, + unsigned AddrSpace) const { // We allow vectorization of flat stores, even though we may need to decompose // them later if they may access private memory. We don't have enough context // here, and legalization can handle it. @@ -287,17 +317,87 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, } bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, - unsigned Alignment, - unsigned AddrSpace) const { + Align Alignment, + unsigned AddrSpace) const { return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); } bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, - unsigned Alignment, - unsigned AddrSpace) const { + Align Alignment, + unsigned AddrSpace) const { return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); } +// FIXME: Really we would like to issue multiple 128-bit loads and stores per +// iteration. Should we report a larger size and let it legalize? +// +// FIXME: Should we use narrower types for local/region, or account for when +// unaligned access is legal? +// +// FIXME: This could use fine tuning and microbenchmarks. +Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, + unsigned SrcAddrSpace, + unsigned DestAddrSpace, + unsigned SrcAlign, + unsigned DestAlign) const { + unsigned MinAlign = std::min(SrcAlign, DestAlign); + + // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the + // hardware into byte accesses. If you assume all alignments are equally + // probable, it's more efficient on average to use short accesses for this + // case. + if (MinAlign == 2) + return Type::getInt16Ty(Context); + + // Not all subtargets have 128-bit DS instructions, and we currently don't + // form them by default. 
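Taken together, the two memcpy-lowering hooks in this hunk pick i16 when the common alignment is exactly 2, an 8-byte <2 x i32> when LDS or region memory is involved (checked just below), and a 16-byte <4 x i32> otherwise; the residual hook that follows then chops the sub-16-byte tail into progressively smaller pieces. A stripped-down sketch of that chunking over plain byte sizes, with invented names rather than LLVM's Type API:

    #include <cstdio>
    #include <vector>

    // Greedy tail decomposition mirroring getMemcpyLoopResidualLoweringType:
    // 8- and 4-byte pieces unless the common alignment is 2, then 2-byte
    // pieces, then single bytes.
    std::vector<unsigned> residualChunks(unsigned remainingBytes, unsigned minAlign) {
      std::vector<unsigned> chunks;
      if (minAlign != 2) {
        while (remainingBytes >= 8) { chunks.push_back(8); remainingBytes -= 8; }
        while (remainingBytes >= 4) { chunks.push_back(4); remainingBytes -= 4; }
      }
      while (remainingBytes >= 2) { chunks.push_back(2); remainingBytes -= 2; }
      while (remainingBytes > 0)  { chunks.push_back(1); --remainingBytes; }
      return chunks;
    }

    int main() {
      for (unsigned c : residualChunks(11, 4)) std::printf("%u ", c); // 8 2 1
      std::printf("\n");
      for (unsigned c : residualChunks(11, 2)) std::printf("%u ", c); // 2 2 2 2 2 1
      std::printf("\n");
      return 0;
    }
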
+ if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS || + SrcAddrSpace == AMDGPUAS::REGION_ADDRESS || + DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS || + DestAddrSpace == AMDGPUAS::REGION_ADDRESS) { + return FixedVectorType::get(Type::getInt32Ty(Context), 2); + } + + // Global memory works best with 16-byte accesses. Private memory will also + // hit this, although they'll be decomposed. + return FixedVectorType::get(Type::getInt32Ty(Context), 4); +} + +void GCNTTIImpl::getMemcpyLoopResidualLoweringType( + SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context, + unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, + unsigned SrcAlign, unsigned DestAlign) const { + assert(RemainingBytes < 16); + + unsigned MinAlign = std::min(SrcAlign, DestAlign); + + if (MinAlign != 2) { + Type *I64Ty = Type::getInt64Ty(Context); + while (RemainingBytes >= 8) { + OpsOut.push_back(I64Ty); + RemainingBytes -= 8; + } + + Type *I32Ty = Type::getInt32Ty(Context); + while (RemainingBytes >= 4) { + OpsOut.push_back(I32Ty); + RemainingBytes -= 4; + } + } + + Type *I16Ty = Type::getInt16Ty(Context); + while (RemainingBytes >= 2) { + OpsOut.push_back(I16Ty); + RemainingBytes -= 2; + } + + Type *I8Ty = Type::getInt8Ty(Context); + while (RemainingBytes) { + OpsOut.push_back(I8Ty); + --RemainingBytes; + } +} + unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Disable unrolling if the loop is not vectorized. // TODO: Enable this again. @@ -339,6 +439,7 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, } int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, + TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, @@ -347,7 +448,11 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, const Instruction *CxtI) { EVT OrigTy = TLI->getValueType(DL, Ty); if (!OrigTy.isSimple()) { - return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + // FIXME: We're having to query the throughput cost so that the basic + // implementation tries to generate legalize and scalarization costs. Maybe + // we could hoist the scalarization code here? + return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput, + Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); } @@ -455,24 +560,44 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, return LT.first * NElts * Cost; } break; + case ISD::FNEG: + // Use the backend' estimation. If fneg is not free each element will cost + // one additional instruction. + return TLI->isFNegFree(SLT) ? 0 : NElts; default: break; } - return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, + Opd2Info, Opd1PropInfo, Opd2PropInfo); } -template <typename T> -int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<T *> Args, - FastMathFlags FMF, unsigned VF) { - if (ID != Intrinsic::fma) - return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); +// Return true if there's a potential benefit from using v2f16 instructions for +// an intrinsic, even if it requires nontrivial legalization. +static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::fma: // TODO: fmuladd + // There's a small benefit to using vector ops in the legalized code. 
+ case Intrinsic::round: + return true; + default: + return false; + } +} + +int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind) { + if (ICA.getID() == Intrinsic::fabs) + return 0; + if (!intrinsicHasPackedVectorBenefit(ICA.getID())) + return BaseT::getIntrinsicInstrCost(ICA, CostKind); + + Type *RetTy = ICA.getReturnType(); EVT OrigTy = TLI->getValueType(DL, RetTy); if (!OrigTy.isSimple()) { - return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); + return BaseT::getIntrinsicInstrCost(ICA, CostKind); } // Legalize the type. @@ -489,36 +614,34 @@ int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, if (ST->has16BitInsts() && SLT == MVT::f16) NElts = (NElts + 1) / 2; - return LT.first * NElts * (ST->hasFastFMAF32() ? getHalfRateInstrCost() - : getQuarterRateInstrCost()); -} + // TODO: Get more refined intrinsic costs? + unsigned InstRate = getQuarterRateInstrCost(); + if (ICA.getID() == Intrinsic::fma) { + InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost() + : getQuarterRateInstrCost(); + } -int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Value*> Args, FastMathFlags FMF, - unsigned VF) { - return getIntrinsicInstrCost<Value>(ID, RetTy, Args, FMF, VF); + return LT.first * NElts * InstRate; } -int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed) { - return getIntrinsicInstrCost<Type>(ID, RetTy, Tys, FMF, - ScalarizationCostPassed); -} +unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode, + TTI::TargetCostKind CostKind) { + if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) + return Opcode == Instruction::PHI ? 0 : 1; -unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) { // XXX - For some reason this isn't called for switch. 
switch (Opcode) { case Instruction::Br: case Instruction::Ret: return 10; default: - return BaseT::getCFInstrCost(Opcode); + return BaseT::getCFInstrCost(Opcode, CostKind); } } -int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty, - bool IsPairwise) { +int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + bool IsPairwise, + TTI::TargetCostKind CostKind) { EVT OrigTy = TLI->getValueType(DL, Ty); // Computes cost on targets that have packed math instructions(which support @@ -526,15 +649,15 @@ int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty, if (IsPairwise || !ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) - return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise); + return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind); std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); return LT.first * getFullRateInstrCost(); } -int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy, - bool IsPairwise, - bool IsUnsigned) { +int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, + bool IsPairwise, bool IsUnsigned, + TTI::TargetCostKind CostKind) { EVT OrigTy = TLI->getValueType(DL, Ty); // Computes cost on targets that have packed math instructions(which support @@ -542,7 +665,8 @@ int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy, if (IsPairwise || !ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) - return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned); + return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned, + CostKind); std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); return LT.first * getHalfRateInstrCost(); @@ -573,8 +697,6 @@ int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, } } - - static bool isArgPassedInSGPR(const Argument *A) { const Function *F = A->getParent(); @@ -601,6 +723,58 @@ static bool isArgPassedInSGPR(const Argument *A) { } } +/// Analyze if the results of inline asm are divergent. If \p Indices is empty, +/// this is analyzing the collective result of all output registers. Otherwise, +/// this is only querying a specific result index if this returns multiple +/// registers in a struct. +bool GCNTTIImpl::isInlineAsmSourceOfDivergence( + const CallInst *CI, ArrayRef<unsigned> Indices) const { + // TODO: Handle complex extract indices + if (Indices.size() > 1) + return true; + + const DataLayout &DL = CI->getModule()->getDataLayout(); + const SIRegisterInfo *TRI = ST->getRegisterInfo(); + TargetLowering::AsmOperandInfoVector TargetConstraints = + TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI); + + const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0]; + + int OutputIdx = 0; + for (auto &TC : TargetConstraints) { + if (TC.Type != InlineAsm::isOutput) + continue; + + // Skip outputs we don't care about. + if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++) + continue; + + TLI->ComputeConstraintToUse(TC, SDValue()); + + Register AssignedReg; + const TargetRegisterClass *RC; + std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint( + TRI, TC.ConstraintCode, TC.ConstraintVT); + if (AssignedReg) { + // FIXME: This is a workaround for getRegForInlineAsmConstraint + // returning VS_32 + RC = TRI->getPhysRegClass(AssignedReg); + } + + // For AGPR constraints null is returned on subtargets without AGPRs, so + // assume divergent for null. 
+ if (!RC || !TRI->isSGPRClass(RC)) + return true; + } + + return false; +} + +/// \returns true if the new GPU divergence analysis is enabled. +bool GCNTTIImpl::useGPUDivergenceAnalysis() const { + return !UseLegacyDA; +} + /// \returns true if the result of the value could potentially be /// different across workitems in a wavefront. bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { @@ -628,7 +802,14 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID()); // Assume all function calls are a source of divergence. - if (isa<CallInst>(V) || isa<InvokeInst>(V)) + if (const CallInst *CI = dyn_cast<CallInst>(V)) { + if (CI->isInlineAsm()) + return isInlineAsmSourceOfDivergence(CI); + return true; + } + + // Assume all function calls are a source of divergence. + if (isa<InvokeInst>(V)) return true; return false; @@ -643,9 +824,44 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { case Intrinsic::amdgcn_readlane: case Intrinsic::amdgcn_icmp: case Intrinsic::amdgcn_fcmp: + case Intrinsic::amdgcn_ballot: + case Intrinsic::amdgcn_if_break: return true; } } + + if (const CallInst *CI = dyn_cast<CallInst>(V)) { + if (CI->isInlineAsm()) + return !isInlineAsmSourceOfDivergence(CI); + return false; + } + + const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V); + if (!ExtValue) + return false; + + const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0)); + if (!CI) + return false; + + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) { + switch (Intrinsic->getIntrinsicID()) { + default: + return false; + case Intrinsic::amdgcn_if: + case Intrinsic::amdgcn_else: { + ArrayRef<unsigned> Indices = ExtValue->getIndices(); + return Indices.size() == 1 && Indices[0] == 1; + } + } + } + + // If we have inline asm returning mixed SGPR and VGPR results, we inferred + // divergent for the overall struct return. We need to override it in the + // case we're extracting an SGPR component here. + if (CI->isInlineAsm()) + return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices()); + return false; } @@ -666,8 +882,9 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, } } -bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace( - IntrinsicInst *II, Value *OldV, Value *NewV) const { +Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, + Value *OldV, + Value *NewV) const { auto IntrID = II->getIntrinsicID(); switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: @@ -677,7 +894,7 @@ bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace( case Intrinsic::amdgcn_ds_fmax: { const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4)); if (!IsVolatile->isZero()) - return false; + return nullptr; Module *M = II->getParent()->getParent()->getParent(); Type *DestTy = II->getType(); Type *SrcTy = NewV->getType(); @@ -685,7 +902,7 @@ bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace( Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy}); II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); - return true; + return II; } case Intrinsic::amdgcn_is_shared: case Intrinsic::amdgcn_is_private: { @@ -695,20 +912,49 @@ bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace( LLVMContext &Ctx = NewV->getType()->getContext(); ConstantInt *NewVal = (TrueAS == NewAS) ? 
ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx); - II->replaceAllUsesWith(NewVal); - II->eraseFromParent(); - return true; + return NewVal; + } + case Intrinsic::ptrmask: { + unsigned OldAS = OldV->getType()->getPointerAddressSpace(); + unsigned NewAS = NewV->getType()->getPointerAddressSpace(); + Value *MaskOp = II->getArgOperand(1); + Type *MaskTy = MaskOp->getType(); + + bool DoTruncate = false; + if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS)) { + // All valid 64-bit to 32-bit casts work by chopping off the high + // bits. Any masking only clearing the low bits will also apply in the new + // address space. + if (DL.getPointerSizeInBits(OldAS) != 64 || + DL.getPointerSizeInBits(NewAS) != 32) + return nullptr; + + // TODO: Do we need to thread more context in here? + KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II); + if (Known.countMinLeadingOnes() < 32) + return nullptr; + + DoTruncate = true; + } + + IRBuilder<> B(II); + if (DoTruncate) { + MaskTy = B.getInt32Ty(); + MaskOp = B.CreateTrunc(MaskOp, MaskTy); + } + + return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy}, + {NewV, MaskOp}); } default: - return false; + return nullptr; } } -unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { +unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT, + int Index, VectorType *SubTp) { if (ST->hasVOP3PInsts()) { - VectorType *VT = cast<VectorType>(Tp); - if (VT->getNumElements() == 2 && + if (cast<FixedVectorType>(VT)->getNumElements() == 2 && DL.getTypeSizeInBits(VT->getElementType()) == 16) { // With op_sel VOP3P instructions freely can access the low half or high // half of a register, so any swizzle is free. @@ -724,7 +970,7 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, } } - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + return BaseT::getShuffleCost(Kind, VT, Index, SubTp); } bool GCNTTIImpl::areInlineCompatible(const Function *Caller, @@ -745,8 +991,8 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller, // FIXME: dx10_clamp can just take the caller setting, but there seems to be // no way to support merge for backend defined attributes. 
- AMDGPU::SIModeRegisterDefaults CallerMode(*Caller, *CallerST); - AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST); + AMDGPU::SIModeRegisterDefaults CallerMode(*Caller); + AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee); return CallerMode.isInlineCompatible(CalleeMode); } @@ -755,117 +1001,9 @@ void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, CommonTTI.getUnrollingPreferences(L, SE, UP); } -unsigned GCNTTIImpl::getUserCost(const User *U, - ArrayRef<const Value *> Operands) { - const Instruction *I = dyn_cast<Instruction>(U); - if (!I) - return BaseT::getUserCost(U, Operands); - - // Estimate different operations to be optimized out - switch (I->getOpcode()) { - case Instruction::ExtractElement: { - ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1)); - unsigned Idx = -1; - if (CI) - Idx = CI->getZExtValue(); - return getVectorInstrCost(I->getOpcode(), I->getOperand(0)->getType(), Idx); - } - case Instruction::InsertElement: { - ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2)); - unsigned Idx = -1; - if (CI) - Idx = CI->getZExtValue(); - return getVectorInstrCost(I->getOpcode(), I->getType(), Idx); - } - case Instruction::Call: { - if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) { - SmallVector<Value *, 4> Args(II->arg_operands()); - FastMathFlags FMF; - if (auto *FPMO = dyn_cast<FPMathOperator>(II)) - FMF = FPMO->getFastMathFlags(); - return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args, - FMF); - } else { - return BaseT::getUserCost(U, Operands); - } - } - case Instruction::ShuffleVector: { - const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I); - Type *Ty = Shuffle->getType(); - Type *SrcTy = Shuffle->getOperand(0)->getType(); - - // TODO: Identify and add costs for insert subvector, etc. 
- int SubIndex; - if (Shuffle->isExtractSubvectorMask(SubIndex)) - return getShuffleCost(TTI::SK_ExtractSubvector, SrcTy, SubIndex, Ty); - - if (Shuffle->changesLength()) - return BaseT::getUserCost(U, Operands); - - if (Shuffle->isIdentity()) - return 0; - - if (Shuffle->isReverse()) - return getShuffleCost(TTI::SK_Reverse, Ty, 0, nullptr); - - if (Shuffle->isSelect()) - return getShuffleCost(TTI::SK_Select, Ty, 0, nullptr); - - if (Shuffle->isTranspose()) - return getShuffleCost(TTI::SK_Transpose, Ty, 0, nullptr); - - if (Shuffle->isZeroEltSplat()) - return getShuffleCost(TTI::SK_Broadcast, Ty, 0, nullptr); - - if (Shuffle->isSingleSource()) - return getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, nullptr); - - return getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, 0, nullptr); - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: - case Instruction::AddrSpaceCast: { - return getCastInstrCost(I->getOpcode(), I->getType(), - I->getOperand(0)->getType(), I); - } - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::FNeg: { - return getArithmeticInstrCost(I->getOpcode(), I->getType(), - TTI::OK_AnyValue, TTI::OK_AnyValue, - TTI::OP_None, TTI::OP_None, Operands, I); - } - default: - break; - } - - return BaseT::getUserCost(U, Operands); +void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP) { + CommonTTI.getPeelingPreferences(L, SE, PP); } unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const { @@ -903,7 +1041,7 @@ unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { } bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, - unsigned Alignment, + Align Alignment, unsigned AddrSpace) const { // We allow vectorization of flat stores, even though we may need to decompose // them later if they may access private memory. We don't have enough context @@ -912,13 +1050,13 @@ bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, } bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, - unsigned Alignment, + Align Alignment, unsigned AddrSpace) const { return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); } bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, - unsigned Alignment, + Align Alignment, unsigned AddrSpace) const { return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); } @@ -932,14 +1070,18 @@ unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) { return 8; } -unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) { +unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode, + TTI::TargetCostKind CostKind) { + if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) + return Opcode == Instruction::PHI ? 0 : 1; + // XXX - For some reason this isn't called for switch. 
switch (Opcode) { case Instruction::Br: case Instruction::Ret: return 10; default: - return BaseT::getCFInstrCost(Opcode); + return BaseT::getCFInstrCost(Opcode, CostKind); } } @@ -970,3 +1112,8 @@ void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { CommonTTI.getUnrollingPreferences(L, SE, UP); } + +void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP) { + CommonTTI.getPeelingPreferences(L, SE, PP); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 0b48f9f602b71..3364a9bcaccbb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -61,6 +61,9 @@ public: void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); + + void getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP); }; class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { @@ -70,10 +73,11 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { friend BaseT; const GCNSubtarget *ST; - const AMDGPUTargetLowering *TLI; + const SITargetLowering *TLI; AMDGPUTTIImpl CommonTTI; bool IsGraphicsShader; bool HasFP32Denormals; + unsigned MaxVGPRs; const FeatureBitset InlineFeatureIgnoreList = { // Codegen control options which don't matter. @@ -133,13 +137,21 @@ public: TLI(ST->getTargetLowering()), CommonTTI(TM, F), IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())), - HasFP32Denormals(ST->hasFP32Denormals(F)) { } + HasFP32Denormals(AMDGPU::SIModeRegisterDefaults(F).allFP32Denormals()), + MaxVGPRs(ST->getMaxNumVGPRs( + std::max(ST->getWavesPerEU(F).first, + ST->getWavesPerEUForWorkGroup( + ST->getFlatWorkGroupSizes(F).second)))) {} bool hasBranchDivergence() { return true; } + bool useGPUDivergenceAnalysis() const; void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); + void getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP); + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); return TTI::PSK_FastHardware; @@ -147,6 +159,7 @@ public: unsigned getHardwareNumberOfRegisters(bool Vector) const; unsigned getNumberOfRegisters(bool Vector) const; + unsigned getNumberOfRegisters(unsigned RCID) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getMinVectorRegisterBitWidth() const; unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, @@ -157,22 +170,30 @@ public: VectorType *VecTy) const; unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; - bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, - unsigned Alignment, + bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const; - bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, - unsigned Alignment, + bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const; - bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, - unsigned Alignment, + bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const; - + Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, + unsigned SrcAddrSpace, unsigned DestAddrSpace, + unsigned SrcAlign, unsigned DestAlign) const; + + void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut, + LLVMContext 
&Context, + unsigned RemainingBytes, + unsigned SrcAddrSpace, + unsigned DestAddrSpace, + unsigned SrcAlign, + unsigned DestAlign) const; unsigned getMaxInterleaveFactor(unsigned VF); bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const; int getArithmeticInstrCost( unsigned Opcode, Type *Ty, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, @@ -180,7 +201,10 @@ public: ArrayRef<const Value *> Args = ArrayRef<const Value *>(), const Instruction *CxtI = nullptr); - unsigned getCFInstrCost(unsigned Opcode); + unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind); + + bool isInlineAsmSourceOfDivergence(const CallInst *CI, + ArrayRef<unsigned> Indices = {}) const; int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); bool isSourceOfDivergence(const Value *V) const; @@ -196,13 +220,13 @@ public: bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, Intrinsic::ID IID) const; - bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, - Value *OldV, Value *NewV) const; + Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, + Value *NewV) const; unsigned getVectorSplitCost() { return 0; } - unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp); + unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, + VectorType *SubTp); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; @@ -211,23 +235,17 @@ public: int getInlinerVectorBonusPercent() { return 0; } - int getArithmeticReductionCost(unsigned Opcode, - Type *Ty, - bool IsPairwise); - template <typename T> - int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, - ArrayRef<T *> Args, FastMathFlags FMF, - unsigned VF); - int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, - ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed = UINT_MAX); - int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, - ArrayRef<Value *> Args, FastMathFlags FMF, - unsigned VF = 1); - int getMinMaxReductionCost(Type *Ty, Type *CondTy, - bool IsPairwiseForm, - bool IsUnsigned); - unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands); + int getArithmeticReductionCost( + unsigned Opcode, + VectorType *Ty, + bool IsPairwise, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); + + int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind); + int getMinMaxReductionCost( + VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); }; class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> { @@ -245,28 +263,28 @@ public: : BaseT(TM, F.getParent()->getDataLayout()), ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))), TLI(ST->getTargetLowering()), - CommonTTI(TM, F) {} + CommonTTI(TM, F) {} const R600Subtarget *getST() const { return ST; } const AMDGPUTargetLowering *getTLI() const { return TLI; } void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); + void getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP); unsigned getHardwareNumberOfRegisters(bool Vec) const; unsigned getNumberOfRegisters(bool Vec) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getMinVectorRegisterBitWidth() const; unsigned 
getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; - bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment, + bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const; - bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, - unsigned Alignment, + bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const; - bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, - unsigned Alignment, + bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const; unsigned getMaxInterleaveFactor(unsigned VF); - unsigned getCFInstrCost(unsigned Opcode); + unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind); int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 191f603a66d6a..418296684d765 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -34,6 +34,7 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -117,24 +118,58 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA, return true; } +static void removeDoneExport(Function &F) { + ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext()); + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) { + if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) { + Intrin->setArgOperand(6, BoolFalse); // done + } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) { + Intrin->setArgOperand(4, BoolFalse); // done + } + } + } + } +} + static BasicBlock *unifyReturnBlockSet(Function &F, ArrayRef<BasicBlock *> ReturningBlocks, + bool InsertExport, const TargetTransformInfo &TTI, StringRef Name) { // Otherwise, we need to insert a new basic block into the function, add a PHI // nodes (if the function returns values), and convert all of the return // instructions into unconditional branches. BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F); + IRBuilder<> B(NewRetBlock); + + if (InsertExport) { + // Ensure that there's only one "done" export in the shader by removing the + // "done" bit set on the original final export. More than one "done" export + // can lead to undefined behavior. + removeDoneExport(F); + + Value *Undef = UndefValue::get(B.getFloatTy()); + B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() }, + { + B.getInt32(9), // target, SQ_EXP_NULL + B.getInt32(0), // enabled channels + Undef, Undef, Undef, Undef, // values + B.getTrue(), // done + B.getTrue(), // valid mask + }); + } PHINode *PN = nullptr; if (F.getReturnType()->isVoidTy()) { - ReturnInst::Create(F.getContext(), nullptr, NewRetBlock); + B.CreateRetVoid(); } else { // If the function doesn't return void... add a PHI node to the block... 
- PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(), - "UnifiedRetVal"); - NewRetBlock->getInstList().push_back(PN); - ReturnInst::Create(F.getContext(), PN, NewRetBlock); + PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(), + "UnifiedRetVal"); + assert(!InsertExport); + B.CreateRet(PN); } // Loop over all of the blocks, replacing the return instruction with an @@ -160,7 +195,11 @@ static BasicBlock *unifyReturnBlockSet(Function &F, bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); - if (PDT.getRoots().size() <= 1) + + // If there's only one exit, we don't need to do anything, unless this is a + // pixel shader and that exit is an infinite loop, since we still have to + // insert an export in that case. + if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS) return false; LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>(); @@ -168,15 +207,21 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { // Loop over all of the blocks in a function, tracking all of the blocks that // return. SmallVector<BasicBlock *, 4> ReturningBlocks; + SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks; SmallVector<BasicBlock *, 4> UnreachableBlocks; // Dummy return block for infinite loop. BasicBlock *DummyReturnBB = nullptr; - for (BasicBlock *BB : PDT.getRoots()) { + bool InsertExport = false; + + bool Changed = false; + for (BasicBlock *BB : PDT.roots()) { if (isa<ReturnInst>(BB->getTerminator())) { if (!isUniformlyReached(DA, *BB)) ReturningBlocks.push_back(BB); + else + UniformlyReachedRetBlocks.push_back(BB); } else if (isa<UnreachableInst>(BB->getTerminator())) { if (!isUniformlyReached(DA, *BB)) UnreachableBlocks.push_back(BB); @@ -188,6 +233,36 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { "DummyReturnBlock", &F); Type *RetTy = F.getReturnType(); Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy); + + // For pixel shaders, the producer guarantees that an export is + // executed before each return instruction. However, if there is an + // infinite loop and we insert a return ourselves, we need to uphold + // that guarantee by inserting a null export. This can happen e.g. in + // an infinite loop with kill instructions, which is supposed to + // terminate. However, we don't need to do this if there is a non-void + // return value, since then there is an epilog afterwards which will + // still export. + // + // Note: In the case where only some threads enter the infinite loop, + // this can result in the null export happening redundantly after the + // original exports. However, The last "real" export happens after all + // the threads that didn't enter an infinite loop converged, which + // means that the only extra threads to execute the null export are + // threads that entered the infinite loop, and they only could've + // exited through being killed which sets their exec bit to 0. + // Therefore, unless there's an actual infinite loop, which can have + // invalid results, or there's a kill after the last export, which we + // assume the frontend won't do, this export will have the same exec + // mask as the last "real" export, and therefore the valid mask will be + // overwritten with the same value and will still be correct. 
Also, + // even though this forces an extra unnecessary export wait, we assume + // that this happens rare enough in practice to that we don't have to + // worry about performance. + if (F.getCallingConv() == CallingConv::AMDGPU_PS && + RetTy->isVoidTy()) { + InsertExport = true; + } + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); ReturningBlocks.push_back(DummyReturnBB); } @@ -206,6 +281,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { BB->getTerminator()->eraseFromParent(); BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); } + Changed = true; } } @@ -224,6 +300,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { BB->getTerminator()->eraseFromParent(); BranchInst::Create(UnreachableBlock, BB); } + Changed = true; } if (!ReturningBlocks.empty()) { @@ -247,19 +324,32 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { // actually reached here. ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock); ReturningBlocks.push_back(UnreachableBlock); + Changed = true; } } // Now handle return blocks. if (ReturningBlocks.empty()) - return false; // No blocks return + return Changed; // No blocks return - if (ReturningBlocks.size() == 1) - return false; // Already has a single return block + if (ReturningBlocks.size() == 1 && !InsertExport) + return Changed; // Already has a single return block const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock"); + // Unify returning blocks. If we are going to insert the export it is also + // necessary to include blocks that are uniformly reached, because in addition + // to inserting the export the "done" bits on existing exports will be cleared + // and we do not want to end up with the normal export in a non-unified, + // uniformly reached block with the "done" bit cleared. 
+ auto BlocksToUnify = std::move(ReturningBlocks); + if (InsertExport) { + BlocksToUnify.insert(BlocksToUnify.end(), UniformlyReachedRetBlocks.begin(), + UniformlyReachedRetBlocks.end()); + } + + unifyReturnBlockSet(F, BlocksToUnify, InsertExport, TTI, + "UnifiedReturnBlock"); return true; } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index f3aa1a5823689..013b7a0cf25d1 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -163,6 +163,7 @@ public: ImmTyUNorm, ImmTyDA, ImmTyR128A16, + ImmTyA16, ImmTyLWE, ImmTyExpTgt, ImmTyExpCompr, @@ -277,6 +278,7 @@ public: isRegClass(AMDGPU::VReg_96RegClassID) || isRegClass(AMDGPU::VReg_128RegClassID) || isRegClass(AMDGPU::VReg_160RegClassID) || + isRegClass(AMDGPU::VReg_192RegClassID) || isRegClass(AMDGPU::VReg_256RegClassID) || isRegClass(AMDGPU::VReg_512RegClassID) || isRegClass(AMDGPU::VReg_1024RegClassID); @@ -315,6 +317,7 @@ public: bool isUNorm() const { return isImmTy(ImmTyUNorm); } bool isDA() const { return isImmTy(ImmTyDA); } bool isR128A16() const { return isImmTy(ImmTyR128A16); } + bool isGFX10A16() const { return isImmTy(ImmTyA16); } bool isLWE() const { return isImmTy(ImmTyLWE); } bool isOff() const { return isImmTy(ImmTyOff); } bool isExpTgt() const { return isImmTy(ImmTyExpTgt); } @@ -486,7 +489,7 @@ public: } bool isVSrcB16() const { - return isVCSrcF16() || isLiteralImm(MVT::i16); + return isVCSrcB16() || isLiteralImm(MVT::i16); } bool isVSrcV2B16() const { @@ -654,7 +657,7 @@ public: bool isSendMsg() const; bool isSwizzle() const; bool isSMRDOffset8() const; - bool isSMRDOffset20() const; + bool isSMEMOffset() const; bool isSMRDLiteralOffset() const; bool isDPP8() const; bool isDPPCtrl() const; @@ -847,6 +850,7 @@ public: case ImmTyUNorm: OS << "UNorm"; break; case ImmTyDA: OS << "DA"; break; case ImmTyR128A16: OS << "R128A16"; break; + case ImmTyA16: OS << "A16"; break; case ImmTyLWE: OS << "LWE"; break; case ImmTyOff: OS << "Off"; break; case ImmTyExpTgt: OS << "ExpTgt"; break; @@ -1062,17 +1066,20 @@ private: bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1); - bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, - unsigned& RegNum, unsigned& RegWidth); - unsigned ParseRegularReg(RegisterKind &RegKind, - unsigned &RegNum, - unsigned &RegWidth); - unsigned ParseSpecialReg(RegisterKind &RegKind, - unsigned &RegNum, - unsigned &RegWidth); - unsigned ParseRegList(RegisterKind &RegKind, - unsigned &RegNum, - unsigned &RegWidth); + bool ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, + unsigned &RegNum, unsigned &RegWidth, + bool RestoreOnFailure = false); + bool ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, + unsigned &RegNum, unsigned &RegWidth, + SmallVectorImpl<AsmToken> &Tokens); + unsigned ParseRegularReg(RegisterKind &RegKind, unsigned &RegNum, + unsigned &RegWidth, + SmallVectorImpl<AsmToken> &Tokens); + unsigned ParseSpecialReg(RegisterKind &RegKind, unsigned &RegNum, + unsigned &RegWidth, + SmallVectorImpl<AsmToken> &Tokens); + unsigned ParseRegList(RegisterKind &RegKind, unsigned &RegNum, + unsigned &RegWidth, SmallVectorImpl<AsmToken> &Tokens); bool ParseRegRange(unsigned& Num, unsigned& Width); unsigned getRegularReg(RegisterKind RegKind, unsigned RegNum, @@ -1157,6 +1164,10 @@ public: return AMDGPU::hasPackedD16(getSTI()); } + bool hasGFX10A16() const { + return AMDGPU::hasGFX10A16(getSTI()); + } 
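A note on the 16-bit inline-constant tightening that appears further down in this file's hunks (the isInlineableLiteralOp16 helper and the reworked isInlineConstant switch): integer i16 operands are now restricted to the integer inline range, while f16 and v2f16 operands keep the full floating-point inline set. A small sketch of the practical difference, using an invented helper rather than the AMDGPUBaseInfo routines:

    #include <cstdint>
    #include <cstdio>

    // Integer inline-constant range for AMDGPU operands.
    bool inlinableAsInt16(int64_t v) { return v >= -16 && v <= 64; }

    int main() {
      // 1.0 in IEEE half precision is the bit pattern 0x3C00 (15360 as an
      // integer). For an f16 operand it is one of the free FP inline constants;
      // for an i16 operand it falls outside [-16, 64] and has to be encoded as
      // a literal instead.
      std::printf("%d\n", inlinableAsInt16(0x3C00)); // 0
      std::printf("%d\n", inlinableAsInt16(64));     // 1
      return 0;
    }
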
+ bool isSI() const { return AMDGPU::isSI(getSTI()); } @@ -1177,6 +1188,10 @@ public: return AMDGPU::isGFX10(getSTI()); } + bool isGFX10_BEncoding() const { + return AMDGPU::isGFX10_BEncoding(getSTI()); + } + bool hasInv2PiInlineImm() const { return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; } @@ -1226,8 +1241,12 @@ public: bool isForcedSDWA() const { return ForcedSDWA; } ArrayRef<unsigned> getMatchedVariants() const; - std::unique_ptr<AMDGPUOperand> parseRegister(); + std::unique_ptr<AMDGPUOperand> parseRegister(bool RestoreOnFailure = false); + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc, + bool RestoreOnFailure); bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) override; unsigned checkTargetMatchPredicate(MCInst &Inst) override; unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; @@ -1311,9 +1330,11 @@ private: void errorExpTgt(); OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val); SMLoc getFlatOffsetLoc(const OperandVector &Operands) const; + SMLoc getSMEMOffsetLoc(const OperandVector &Operands) const; bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands); bool validateFlatOffset(const MCInst &Inst, const OperandVector &Operands); + bool validateSMEMOffset(const MCInst &Inst, const OperandVector &Operands); bool validateSOPLiteral(const MCInst &Inst) const; bool validateConstantBusLimitations(const MCInst &Inst); bool validateEarlyClobberLimitations(const MCInst &Inst); @@ -1329,6 +1350,7 @@ private: bool validateOpSel(const MCInst &Inst); bool validateVccOperand(unsigned Reg) const; bool validateVOP3Literal(const MCInst &Inst) const; + bool validateMAIAccWrite(const MCInst &Inst); unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; @@ -1390,7 +1412,7 @@ public: AMDGPUOperand::Ptr defaultSLC() const; AMDGPUOperand::Ptr defaultSMRDOffset8() const; - AMDGPUOperand::Ptr defaultSMRDOffset20() const; + AMDGPUOperand::Ptr defaultSMEMOffset() const; AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; AMDGPUOperand::Ptr defaultFlatOffset() const; @@ -1524,6 +1546,16 @@ static bool isSafeTruncation(int64_t Val, unsigned Size) { return isUIntN(Size, Val) || isIntN(Size, Val); } +static bool isInlineableLiteralOp16(int64_t Val, MVT VT, bool HasInv2Pi) { + if (VT.getScalarType() == MVT::i16) { + // FP immediate values are broken. + return isInlinableIntLiteral(Val); + } + + // f16/v2f16 operands work correctly for all values. 
+ return AMDGPU::isInlinableLiteral16(Val, HasInv2Pi); +} + bool AMDGPUOperand::isInlinableImm(MVT type) const { // This is a hack to enable named inline values like @@ -1555,9 +1587,9 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { return false; if (type.getScalarSizeInBits() == 16) { - return AMDGPU::isInlinableLiteral16( + return isInlineableLiteralOp16( static_cast<int16_t>(FPLiteral.bitcastToAPInt().getZExtValue()), - AsmParser->hasInv2PiInlineImm()); + type, AsmParser->hasInv2PiInlineImm()); } // Check if single precision literal is inlinable @@ -1577,9 +1609,9 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { } if (type.getScalarSizeInBits() == 16) { - return AMDGPU::isInlinableLiteral16( + return isInlineableLiteralOp16( static_cast<int16_t>(Literal.getLoBits(16).getSExtValue()), - AsmParser->hasInv2PiInlineImm()); + type, AsmParser->hasInv2PiInlineImm()); } return AMDGPU::isInlinableLiteral32( @@ -1901,6 +1933,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { case 3: return AMDGPU::VReg_96RegClassID; case 4: return AMDGPU::VReg_128RegClassID; case 5: return AMDGPU::VReg_160RegClassID; + case 6: return AMDGPU::VReg_192RegClassID; case 8: return AMDGPU::VReg_256RegClassID; case 16: return AMDGPU::VReg_512RegClassID; case 32: return AMDGPU::VReg_1024RegClassID; @@ -1919,7 +1952,10 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { default: return -1; case 1: return AMDGPU::SGPR_32RegClassID; case 2: return AMDGPU::SGPR_64RegClassID; + case 3: return AMDGPU::SGPR_96RegClassID; case 4: return AMDGPU::SGPR_128RegClassID; + case 5: return AMDGPU::SGPR_160RegClassID; + case 6: return AMDGPU::SGPR_192RegClassID; case 8: return AMDGPU::SGPR_256RegClassID; case 16: return AMDGPU::SGPR_512RegClassID; } @@ -1928,7 +1964,11 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { default: return -1; case 1: return AMDGPU::AGPR_32RegClassID; case 2: return AMDGPU::AReg_64RegClassID; + case 3: return AMDGPU::AReg_96RegClassID; case 4: return AMDGPU::AReg_128RegClassID; + case 5: return AMDGPU::AReg_160RegClassID; + case 6: return AMDGPU::AReg_192RegClassID; + case 8: return AMDGPU::AReg_256RegClassID; case 16: return AMDGPU::AReg_512RegClassID; case 32: return AMDGPU::AReg_1024RegClassID; } @@ -1975,12 +2015,13 @@ static unsigned getSpecialRegForName(StringRef RegName) { .Case("tma_hi", AMDGPU::TMA_HI) .Case("tba_lo", AMDGPU::TBA_LO) .Case("tba_hi", AMDGPU::TBA_HI) + .Case("pc", AMDGPU::PC_REG) .Case("null", AMDGPU::SGPR_NULL) .Default(AMDGPU::NoRegister); } bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, - SMLoc &EndLoc) { + SMLoc &EndLoc, bool RestoreOnFailure) { auto R = parseRegister(); if (!R) return true; assert(R->isReg()); @@ -1990,6 +2031,25 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, return false; } +bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) { + return ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/false); +} + +OperandMatchResultTy AMDGPUAsmParser::tryParseRegister(unsigned &RegNo, + SMLoc &StartLoc, + SMLoc &EndLoc) { + bool Result = + ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/true); + bool PendingErrors = getParser().hasPendingError(); + getParser().clearPendingErrors(); + if (PendingErrors) + return MatchOperand_ParseFail; + if (Result) + return MatchOperand_NoMatch; + return MatchOperand_Success; +} + bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, RegisterKind RegKind, 
unsigned Reg1) { switch (RegKind) { @@ -2166,31 +2226,31 @@ AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) { return true; } -unsigned -AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind, - unsigned &RegNum, - unsigned &RegWidth) { +unsigned AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind, + unsigned &RegNum, unsigned &RegWidth, + SmallVectorImpl<AsmToken> &Tokens) { assert(isToken(AsmToken::Identifier)); unsigned Reg = getSpecialRegForName(getTokenStr()); if (Reg) { RegNum = 0; RegWidth = 1; RegKind = IS_SPECIAL; + Tokens.push_back(getToken()); lex(); // skip register name } return Reg; } -unsigned -AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, - unsigned &RegNum, - unsigned &RegWidth) { +unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, + unsigned &RegNum, unsigned &RegWidth, + SmallVectorImpl<AsmToken> &Tokens) { assert(isToken(AsmToken::Identifier)); StringRef RegName = getTokenStr(); const RegInfo *RI = getRegularRegInfo(RegName); if (!RI) return AMDGPU::NoRegister; + Tokens.push_back(getToken()); lex(); // skip register name RegKind = RI->Kind; @@ -2209,10 +2269,9 @@ AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind, return getRegularReg(RegKind, RegNum, RegWidth); } -unsigned -AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, - unsigned &RegNum, - unsigned &RegWidth) { +unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum, + unsigned &RegWidth, + SmallVectorImpl<AsmToken> &Tokens) { unsigned Reg = AMDGPU::NoRegister; if (!trySkipToken(AsmToken::LBrac)) @@ -2229,7 +2288,8 @@ AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, RegisterKind NextRegKind; unsigned NextReg, NextRegNum, NextRegWidth; - if (!ParseAMDGPURegister(NextRegKind, NextReg, NextRegNum, NextRegWidth)) + if (!ParseAMDGPURegister(NextRegKind, NextReg, NextRegNum, NextRegWidth, + Tokens)) return AMDGPU::NoRegister; if (NextRegWidth != 1) return AMDGPU::NoRegister; @@ -2248,24 +2308,40 @@ AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, return Reg; } -bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, - unsigned &Reg, - unsigned &RegNum, - unsigned &RegWidth) { +bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, + unsigned &RegNum, unsigned &RegWidth, + SmallVectorImpl<AsmToken> &Tokens) { Reg = AMDGPU::NoRegister; if (isToken(AsmToken::Identifier)) { - Reg = ParseSpecialReg(RegKind, RegNum, RegWidth); + Reg = ParseSpecialReg(RegKind, RegNum, RegWidth, Tokens); if (Reg == AMDGPU::NoRegister) - Reg = ParseRegularReg(RegKind, RegNum, RegWidth); + Reg = ParseRegularReg(RegKind, RegNum, RegWidth, Tokens); } else { - Reg = ParseRegList(RegKind, RegNum, RegWidth); + Reg = ParseRegList(RegKind, RegNum, RegWidth, Tokens); } const MCRegisterInfo *TRI = getContext().getRegisterInfo(); return Reg != AMDGPU::NoRegister && subtargetHasRegister(*TRI, Reg); } +bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, + unsigned &RegNum, unsigned &RegWidth, + bool RestoreOnFailure) { + Reg = AMDGPU::NoRegister; + + SmallVector<AsmToken, 1> Tokens; + if (ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, Tokens)) { + if (RestoreOnFailure) { + while (!Tokens.empty()) { + getLexer().UnLex(Tokens.pop_back_val()); + } + } + return true; + } + return false; +} + Optional<StringRef> AMDGPUAsmParser::getGprCountSymbolName(RegisterKind RegKind) { switch (RegKind) { @@ -2314,7 +2390,8 @@ bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind, return true; } 
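The RestoreOnFailure path added above works by recording every token consumed while speculatively parsing a register and un-lexing them, newest first, when parsing fails, so tryParseRegister can report no-match without disturbing the token stream. A minimal standalone C++ sketch of that rollback idea (not the LLVM code; ToyLexer and tryParseReg are illustrative names, and the sketch assumes the stream holds enough tokens):

#include <cctype>
#include <string>
#include <vector>

struct ToyLexer {
  std::vector<std::string> Stream;            // pending tokens; front() is next
  std::string lex() {                          // consume the next token
    std::string T = Stream.front();
    Stream.erase(Stream.begin());
    return T;
  }
  void unlex(std::string T) {                  // push a token back (UnLex analogue)
    Stream.insert(Stream.begin(), std::move(T));
  }
};

// Try to parse "v<digits>" or "s<digits>". On failure every consumed token is
// pushed back, newest first, so the caller sees an untouched token stream.
bool tryParseReg(ToyLexer &Lex, std::string &Reg) {
  std::vector<std::string> Consumed;
  auto fail = [&]() {
    while (!Consumed.empty()) {
      Lex.unlex(Consumed.back());
      Consumed.pop_back();
    }
    return false;
  };

  Consumed.push_back(Lex.lex());
  if (Consumed.back() != "v" && Consumed.back() != "s")
    return fail();

  Consumed.push_back(Lex.lex());
  if (Consumed.back().empty() || !std::isdigit((unsigned char)Consumed.back()[0]))
    return fail();

  Reg = Consumed[0] + Consumed[1];
  return true;
}

Restoring the stream matters because a no-match result lets the generic matcher go on to try other operand parsers against the same tokens.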
-std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { +std::unique_ptr<AMDGPUOperand> +AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) { const auto &Tok = Parser.getTok(); SMLoc StartLoc = Tok.getLoc(); SMLoc EndLoc = Tok.getEndLoc(); @@ -2758,16 +2835,22 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, return AMDGPU::isInlinableLiteral32(Val, hasInv2PiInlineImm()); case 2: { const unsigned OperandType = Desc.OpInfo[OpIdx].OperandType; + if (OperandType == AMDGPU::OPERAND_REG_IMM_INT16 || + OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT16 || + OperandType == AMDGPU::OPERAND_REG_INLINE_AC_INT16) + return AMDGPU::isInlinableIntLiteral(Val); + if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || - OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 || OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 || + OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16) + return AMDGPU::isInlinableIntLiteralV216(Val); + + if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 || OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 || - OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16 || - OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) { + OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm()); - } else { - return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm()); - } + + return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm()); } default: llvm_unreachable("invalid operand size"); @@ -3085,6 +3168,30 @@ bool AMDGPUAsmParser::validateMovrels(const MCInst &Inst) { return !isSGPR(mc2PseudoReg(Reg), TRI); } +bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst) { + + const unsigned Opc = Inst.getOpcode(); + + if (Opc != AMDGPU::V_ACCVGPR_WRITE_B32_vi) + return true; + + const int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + assert(Src0Idx != -1); + + const MCOperand &Src0 = Inst.getOperand(Src0Idx); + if (!Src0.isReg()) + return true; + + auto Reg = Src0.getReg(); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + if (isSGPR(mc2PseudoReg(Reg), TRI)) { + Error(getLoc(), "source operand must be either a VGPR or an inline constant"); + return false; + } + + return true; +} + bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) { const unsigned Opc = Inst.getOpcode(); @@ -3335,6 +3442,46 @@ bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst, return true; } +SMLoc AMDGPUAsmParser::getSMEMOffsetLoc(const OperandVector &Operands) const { + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + if (Op.isSMEMOffset()) + return Op.getStartLoc(); + } + return getLoc(); +} + +bool AMDGPUAsmParser::validateSMEMOffset(const MCInst &Inst, + const OperandVector &Operands) { + if (isCI() || isSI()) + return true; + + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + if ((TSFlags & SIInstrFlags::SMRD) == 0) + return true; + + auto Opcode = Inst.getOpcode(); + auto OpNum = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::offset); + if (OpNum == -1) + return true; + + const auto &Op = Inst.getOperand(OpNum); + if (!Op.isImm()) + return true; + + uint64_t Offset = Op.getImm(); + bool IsBuffer = AMDGPU::getSMEMIsBuffer(Opcode); + if (AMDGPU::isLegalSMRDEncodedUnsignedOffset(getSTI(), Offset) || + AMDGPU::isLegalSMRDEncodedSignedOffset(getSTI(), Offset, IsBuffer)) + return true; + + Error(getSMEMOffsetLoc(Operands), + (isVI() || IsBuffer) ? 
"expected a 20-bit unsigned offset" : + "expected a 21-bit signed offset"); + + return false; +} + bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const { unsigned Opcode = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opcode); @@ -3512,6 +3659,12 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, if (!validateFlatOffset(Inst, Operands)) { return false; } + if (!validateSMEMOffset(Inst, Operands)) { + return false; + } + if (!validateMAIAccWrite(Inst)) { + return false; + } return true; } @@ -3556,7 +3709,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return true; } Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, getSTI()); + Out.emitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: @@ -4307,19 +4460,19 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() { if (Size > LocalMemorySize) return Error(SizeLoc, "size is too large"); - int64_t Align = 4; + int64_t Alignment = 4; if (getLexer().is(AsmToken::Comma)) { Lex(); SMLoc AlignLoc = getLexer().getLoc(); - if (getParser().parseAbsoluteExpression(Align)) + if (getParser().parseAbsoluteExpression(Alignment)) return true; - if (Align < 0 || !isPowerOf2_64(Align)) + if (Alignment < 0 || !isPowerOf2_64(Alignment)) return Error(AlignLoc, "alignment must be a power of two"); // Alignment larger than the size of LDS is possible in theory, as long // as the linker manages to place to symbol at address 0, but we do want // to make sure the alignment fits nicely into a 32-bit integer. - if (Align >= 1u << 31) + if (Alignment >= 1u << 31) return Error(AlignLoc, "alignment is too large"); } @@ -4331,7 +4484,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() { if (!Symbol->isUndefined()) return Error(NameLoc, "invalid symbol redefinition"); - getTargetStreamer().emitAMDGPULDS(Symbol, Size, Align); + getTargetStreamer().emitAMDGPULDS(Symbol, Size, Align(Alignment)); return false; } @@ -4650,9 +4803,9 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, case AsmToken::Identifier: { StringRef Tok = Parser.getTok().getString(); if (Tok == Name) { - if (Tok == "r128" && isGFX9()) + if (Tok == "r128" && !hasMIMG_R128()) Error(S, "r128 modifier is not supported on this GPU"); - if (Tok == "a16" && !isGFX9() && !isGFX10()) + if (Tok == "a16" && !isGFX9() && !hasGFX10A16()) Error(S, "a16 modifier is not supported on this GPU"); Bit = 1; Parser.Lex(); @@ -4672,6 +4825,9 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, if (!isGFX10() && ImmTy == AMDGPUOperand::ImmTyDLC) return MatchOperand_ParseFail; + if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16) + ImmTy = AMDGPUOperand::ImmTyR128A16; + Operands.push_back(AMDGPUOperand::CreateImm(this, Bit, S, ImmTy)); return MatchOperand_Success; } @@ -5987,6 +6143,8 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16); + if (IsGFX10) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyA16); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); if (!IsGFX10) @@ -6006,8 +6164,8 @@ bool AMDGPUOperand::isSMRDOffset8() const { return isImm() && isUInt<8>(getImm()); } -bool AMDGPUOperand::isSMRDOffset20() const { - 
return isImm() && isUInt<20>(getImm()); +bool AMDGPUOperand::isSMEMOffset() const { + return isImm(); // Offset range is checked later by validator. } bool AMDGPUOperand::isSMRDLiteralOffset() const { @@ -6020,7 +6178,7 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset8() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); } -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset20() const { +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMEMOffset() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); } @@ -6096,7 +6254,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"unorm", AMDGPUOperand::ImmTyUNorm, true, nullptr}, {"da", AMDGPUOperand::ImmTyDA, true, nullptr}, {"r128", AMDGPUOperand::ImmTyR128A16, true, nullptr}, - {"a16", AMDGPUOperand::ImmTyR128A16, true, nullptr}, + {"a16", AMDGPUOperand::ImmTyA16, true, nullptr}, {"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr}, {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr}, @@ -6499,7 +6657,7 @@ OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) { std::string Token; if (getLexer().is(AsmToken::Integer)) { SMLoc Loc = getLexer().getTok().getEndLoc(); - Token = getLexer().getTok().getString(); + Token = std::string(getLexer().getTok().getString()); Parser.Lex(); if (getLexer().getTok().getLoc() != Loc) return MatchOperand_ParseFail; @@ -7032,6 +7190,8 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, return Operand.isInterpAttr() ? Match_Success : Match_InvalidOperand; case MCK_AttrChan: return Operand.isAttrChan() ? Match_Success : Match_InvalidOperand; + case MCK_ImmSMEMOffset: + return Operand.isSMEMOffset() ? Match_Success : Match_InvalidOperand; case MCK_SReg_64: case MCK_SReg_64_XEXEC: // Null is defined as a 32-bit register but diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 691aff4ecbb8a..fa42ddc54b565 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1,4 +1,4 @@ -//===-- BUFInstructions.td - Buffer Instruction Defintions ----------------===// +//===-- BUFInstructions.td - Buffer Instruction Definitions ---------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -374,7 +374,8 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> : let AsmMatchConverter = ""; let hasSideEffects = 1; - let mayStore = 1; + let mayLoad = 0; + let mayStore = 0; // Set everything to 0. 
let offen = 0; @@ -1003,6 +1004,11 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64 >; +let SubtargetPredicate = HasGFX10_BEncoding in +defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics_RTN < + "buffer_atomic_csub", VGPR_32, i32, atomic_csub_global_32 +>; + let SubtargetPredicate = isGFX8GFX9 in { def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">; } @@ -1152,22 +1158,6 @@ let SubtargetPredicate = isGFX10Plus in { // MUBUF Patterns //===----------------------------------------------------------------------===// -def extract_glc : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8); -}]>; - -def extract_slc : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8); -}]>; - -def extract_dlc : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8); -}]>; - -def extract_swz : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8); -}]>; - //===----------------------------------------------------------------------===// // buffer_load/store_format patterns //===----------------------------------------------------------------------===// @@ -1177,24 +1167,24 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$auxiliary, 0)), - (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$auxiliary, 0)), - (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$auxiliary, timm)), - (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; @@ -1202,9 +1192,9 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, timm:$auxiliary, timm)), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) - (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; 
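The extract_glc/extract_slc/extract_dlc/extract_swz operators used in these patterns (whose SDNodeXForm definitions are removed from this file above) simply pull single bits out of the packed auxiliary/cachepolicy operand of the buffer intrinsics. A standalone sketch of that unpacking, assuming the bit layout shown in the removed transforms (glc in bit 0, slc in bit 1, dlc in bit 2, swz in bit 3):

#include <cstdint>

// Cache-policy bits as packed in the buffer intrinsics' auxiliary operand,
// per the extract_* transforms referenced above.
struct BufferAux {
  bool GLC, SLC, DLC, SWZ;
};

BufferAux unpackAux(uint32_t Aux) {
  BufferAux B;
  B.GLC = (Aux >> 0) & 1;   // extract_glc
  B.SLC = (Aux >> 1) & 1;   // extract_slc
  B.DLC = (Aux >> 2) & 1;   // extract_dlc
  B.SWZ = (Aux >> 3) & 1;   // extract_swz
  return B;
}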
} @@ -1221,6 +1211,7 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4i32, "BUFFER_LOAD_FORMAT_X let SubtargetPredicate = HasUnpackedD16VMem in { defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X_gfx80">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">; } // End HasUnpackedD16VMem. @@ -1228,6 +1219,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in { let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i16, "BUFFER_LOAD_FORMAT_D16_XY">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">; @@ -1256,7 +1248,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$auxiliary, 0), - (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), + (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; @@ -1264,8 +1256,8 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$auxiliary, 0), - (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $auxiliary), + (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, + (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; @@ -1273,8 +1265,8 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$auxiliary, timm), - (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $auxiliary), + (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, + (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; @@ -1283,9 +1275,9 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, timm:$auxiliary, timm), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact) - $vdata, - (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), (extract_glc $auxiliary), + getVregSrcForVT<vt>.ret:$vdata, + (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, 
VGPR_32:$voffset, sub1), + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; @@ -1303,6 +1295,7 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4i32, "BUFFER_STORE_FORMA let SubtargetPredicate = HasUnpackedD16VMem in { defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X_gfx80">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X_gfx80">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">; } // End HasUnpackedD16VMem. @@ -1310,6 +1303,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in { let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i16, "BUFFER_STORE_FORMAT_D16_XY">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">; @@ -1338,37 +1332,37 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">; multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt, string opcode> { def : GCNPat< - (vt (name vt:$vdata_in, v4i32:$rsrc, 0, - 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0)), - (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset, - (as_i16imm $offset), (extract_slc $cachepolicy)) + (vt (name vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset, + timm:$offset, timm:$cachepolicy, 0)), + (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) + getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, + (as_i16timm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm)), - (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (extract_slc $cachepolicy)) + (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, + timm:$offset, timm:$cachepolicy, timm)), + (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in, + VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, + (as_i16timm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (vt (name vt:$vdata_in, v4i32:$rsrc, 0, - i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0)), - (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (extract_slc $cachepolicy)) + (vt (name vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset, + i32:$soffset, timm:$offset, timm:$cachepolicy, 0)), + (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in, + VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, + (as_i16timm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm)), + (vt (name 
vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset, + i32:$soffset, timm:$offset, timm:$cachepolicy, timm)), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN) - $vdata_in, - (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) + getVregSrcForVT<vt>.ret:$vdata_in, + (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (extract_slc $cachepolicy)) >; } @@ -1384,6 +1378,7 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_or, i32, "BUFFER_ATOMIC_OR">; defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i32, "BUFFER_ATOMIC_XOR">; defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i32, "BUFFER_ATOMIC_INC">; defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i32, "BUFFER_ATOMIC_DEC">; +defm : BufferAtomicPatterns<SIbuffer_atomic_csub, i32, "BUFFER_ATOMIC_CSUB">; defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i64, "BUFFER_ATOMIC_SWAP_X2">; defm : BufferAtomicPatterns<SIbuffer_atomic_add, i64, "BUFFER_ATOMIC_ADD_X2">; defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i64, "BUFFER_ATOMIC_SUB_X2">; @@ -1434,19 +1429,20 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, >; } +let SubtargetPredicate = HasAtomicFaddInsts in { defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD_F32">; defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_pk_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">; +} def : GCNPat< (SIbuffer_atomic_cmpswap - i32:$data, i32:$cmp, v4i32:$rsrc, 0, - 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0), + i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset, + timm:$offset, timm:$cachepolicy, 0), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN - (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), - $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)), - sub0) + (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (extract_slc $cachepolicy)), sub0) >; def : GCNPat< @@ -1456,8 +1452,8 @@ def : GCNPat< timm:$cachepolicy, timm), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN - (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), - $vindex, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)), + (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), + VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), sub0) >; @@ -1468,8 +1464,8 @@ def : GCNPat< timm:$cachepolicy, 0), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN - (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), - $voffset, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)), + (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), + VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), sub0) >; @@ -1480,9 +1476,9 @@ def : GCNPat< timm:$cachepolicy, timm), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN - (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), - (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)), + (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), + (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), sub0) >; @@ -1584,7 +1580,7 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, defm : 
MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>; foreach vt = Reg32Types.types in { -defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, vt, load_private>; } defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX3_OFFEN, BUFFER_LOAD_DWORDX3_OFFSET, v3i32, load_private>; @@ -1692,8 +1688,8 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, 0)), - (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), - (as_i8imm $format), + (!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; @@ -1701,8 +1697,8 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, timm)), - (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), - (as_i8imm $format), + (!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; @@ -1710,8 +1706,8 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, 0)), - (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), - (as_i8imm $format), + (!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; @@ -1720,9 +1716,9 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, timm)), (!cast<MTBUF_Pseudo>(opcode # _BOTHEN) - (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), - (as_i8imm $format), + (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; @@ -1739,12 +1735,14 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW"> let SubtargetPredicate = HasUnpackedD16VMem in { defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">; + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">; } // End HasUnpackedD16VMem. 
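The BUFFER_ATOMIC_CMPSWAP selections above build a 64-bit vdata pair with REG_SEQUENCE, placing the new data in sub0 and the compare value in sub1, and then read the returned old value back out with EXTRACT_SUBREG ... sub0. A minimal C++ model of that packing, treating sub0 as the low dword (illustrative only, not the selection code):

#include <cstdint>

// Pack {data, cmp} the way the REG_SEQUENCE above does: data -> sub0 (low
// dword), cmp -> sub1 (high dword) of the 64-bit vdata operand.
uint64_t packCmpSwapOperand(uint32_t Data, uint32_t Cmp) {
  return (uint64_t)Cmp << 32 | Data;
}

// The old memory value comes back in the low dword, which is what the
// EXTRACT_SUBREG ... sub0 in the patterns above picks out.
uint32_t extractOldValue(uint64_t ReturnedPair) {
  return (uint32_t)(ReturnedPair & 0xFFFFFFFFu);
}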
let SubtargetPredicate = HasPackedD16VMem in { defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">; + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">; } // End HasPackedD16VMem. @@ -1754,8 +1752,8 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, 0), - (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, - (as_i16imm $offset), (as_i8imm $format), + (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, + (as_i16timm $offset), (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; @@ -1763,8 +1761,8 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, timm), - (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (as_i8imm $format), + (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, + (as_i16timm $offset), (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; @@ -1772,8 +1770,8 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, 0), - (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (as_i8imm $format), + (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, + (as_i16timm $offset), (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; @@ -1782,9 +1780,9 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, timm), (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact) - $vdata, - (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), + getVregSrcForVT<vt>.ret:$vdata, + (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), (extract_swz $auxiliary)) >; @@ -1801,12 +1799,14 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZ let SubtargetPredicate = HasUnpackedD16VMem in { defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">; + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X_gfx80">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">; } // End HasUnpackedD16VMem. 
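The gfx80 (HasUnpackedD16VMem) and HasPackedD16VMem pattern groups differ only in how 16-bit elements sit in VGPRs: unpacked targets return one element per 32-bit register, which is why the gfx80 variants use i32/v2i32/v4i32 result types, while packed targets hold two elements per register (v2f16/v2i16/v4f16). A small sketch of the two layouts, assuming element 0 occupies the low 16 bits of a packed dword:

#include <cstdint>
#include <vector>

// Unpacked D16: each 16-bit element occupies the low half of its own dword.
std::vector<uint32_t> layoutUnpackedD16(const std::vector<uint16_t> &Elems) {
  std::vector<uint32_t> Regs;
  for (uint16_t E : Elems)
    Regs.push_back(E);                   // one dword per element
  return Regs;
}

// Packed D16: two 16-bit elements share one dword (element 0 in bits 15:0;
// an odd trailing element leaves the high half zero).
std::vector<uint32_t> layoutPackedD16(const std::vector<uint16_t> &Elems) {
  std::vector<uint32_t> Regs;
  for (size_t I = 0; I < Elems.size(); I += 2) {
    uint32_t Lo = Elems[I];
    uint32_t Hi = (I + 1 < Elems.size()) ? Elems[I + 1] : 0;
    Regs.push_back(Hi << 16 | Lo);
  }
  return Regs;
}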
let SubtargetPredicate = HasPackedD16VMem in { defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">; + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZW">; } // End HasPackedD16VMem. @@ -1888,8 +1888,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, MUBUFLdsTable<1, NAME # "_BOTHEN_gfx10">; } - multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> : - MUBUF_Real_AllAddr_gfx10<op> { + multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op> { def _BOTHEN_RTN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>; def _IDXEN_RTN_gfx10 : @@ -1899,6 +1898,8 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { def _OFFSET_RTN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>; } + multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> : + MUBUF_Real_AllAddr_gfx10<op>, MUBUF_Real_Atomics_RTN_gfx10<op>; } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>; @@ -2063,6 +2064,8 @@ defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>; defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>; defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>; +defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_RTN_gfx10<0x034>; + defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>; defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>; def BUFFER_WBINVL1_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<0x071, BUFFER_WBINVL1>; diff --git a/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/llvm/lib/Target/AMDGPU/CaymanInstructions.td index 1a526675164a0..f4ddbf1131c34 100644 --- a/llvm/lib/Target/AMDGPU/CaymanInstructions.td +++ b/llvm/lib/Target/AMDGPU/CaymanInstructions.td @@ -50,16 +50,19 @@ def COS_cm : COS_Common<0x8E>; def : RsqPat<RECIPSQRT_IEEE_cm, f32>; +def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>; + def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>; defm DIV_cm : DIV_Common<RECIP_IEEE_cm>; // RECIP_UINT emulation for Cayman -// The multiplication scales from [0,1] to the unsigned integer range +// The multiplication scales from [0,1) to the unsigned integer range, +// rounding down a bit to avoid unwanted overflow. 
def : R600Pat < (AMDGPUurecip i32:$src0), (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)), - (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1))) + (MOV_IMM_I32 CONST.FP_4294966784))) >; def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> { @@ -70,8 +73,6 @@ def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> { -def : R600Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>; - class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> : CF_MEM_RAT_CACHELESS <0x14, 0, mask, (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr), diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index fe7faca8b1570..beb01b1abf0f8 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1,4 +1,4 @@ -//===-- DSInstructions.td - DS Instruction Defintions ---------------------===// +//===-- DSInstructions.td - DS Instruction Definitions --------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -388,7 +388,12 @@ defm DS_MAX_U32 : DS_1A1D_NORET_mc<"ds_max_u32">; defm DS_AND_B32 : DS_1A1D_NORET_mc<"ds_and_b32">; defm DS_OR_B32 : DS_1A1D_NORET_mc<"ds_or_b32">; defm DS_XOR_B32 : DS_1A1D_NORET_mc<"ds_xor_b32">; + +let SubtargetPredicate = HasLDSFPAtomics in { defm DS_ADD_F32 : DS_1A1D_NORET_mc<"ds_add_f32">; +} + +// FIXME: Are these really present pre-gfx8? defm DS_MIN_F32 : DS_1A1D_NORET_mc<"ds_min_f32">; defm DS_MAX_F32 : DS_1A1D_NORET_mc<"ds_max_f32">; @@ -443,7 +448,10 @@ defm DS_MIN_F64 : DS_1A1D_NORET_mc<"ds_min_f64", VReg_64>; defm DS_MAX_F64 : DS_1A1D_NORET_mc<"ds_max_f64", VReg_64>; defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32", VGPR_32, "ds_add_u32">; + +let SubtargetPredicate = HasLDSFPAtomics in { defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32", VGPR_32, "ds_add_f32">; +} defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; defm DS_RSUB_RTN_U32 : DS_1A1D_RET_mc<"ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; defm DS_INC_RTN_U32 : DS_1A1D_RET_mc<"ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; @@ -497,6 +505,7 @@ def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">; def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">; } +let SubtargetPredicate = HasDsSrc2Insts in { def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">; def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">; def DS_RSUB_SRC2_U32 : DS_1A<"ds_rsub_src2_u32">; @@ -529,6 +538,7 @@ def DS_MAX_SRC2_F64 : DS_1A<"ds_max_src2_f64">; def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">; def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">; +} // End SubtargetPredicate = HasDsSrc2Insts let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in { def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, SwizzleImm>; @@ -609,10 +619,12 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32", int_amdgcn_ds_bpermute>; } -def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; - } // let SubtargetPredicate = isGFX8Plus +let SubtargetPredicate = HasLDSFPAtomics, OtherPredicates = [HasDsSrc2Insts] in { +def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; +} + //===----------------------------------------------------------------------===// // DS Patterns //===----------------------------------------------------------------------===// @@ -725,7 +737,7 @@ defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">; defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">; defm : DSWritePat_mc <DS_WRITE_B16, i16, 
"store_local">; -foreach vt = VGPR_32.RegTypes in { +foreach vt = Reg32Types.types in { defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">; } @@ -737,31 +749,35 @@ def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_hi16_local>; def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_hi16_local>; } - -class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, PatFrag frag> : GCNPat < - (v2i32 (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))), +class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < + (vt:$value (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))), (inst $ptr, $offset0, $offset1, (i1 0)) >; -class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat< - (frag v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)), - (inst $ptr, (i32 (EXTRACT_SUBREG $value, sub0)), - (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1, +class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat< + (frag vt:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)), + (inst $ptr, (i32 (EXTRACT_SUBREG VReg_64:$value, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$value, sub1)), $offset0, $offset1, (i1 0)) >; -// v2i32 loads are split into i32 loads on SI during lowering, due to a bug -// related to bounds checking. -let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in { -def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>; -def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>; -} +multiclass DS64Bit4ByteAlignedPat_mc<ValueType vt> { + let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in { + def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, vt, load_local_m0>; + def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, vt, store_local_m0>; + } -let OtherPredicates = [NotLDSRequiresM0Init] in { -def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, load_local>; -def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>; + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, vt, load_local>; + def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, vt, store_local>; + } } +// v2i32 loads are split into i32 loads on SI during lowering, due to a bug +// related to bounds checking. +foreach vt = VReg_64.RegTypes in { +defm : DS64Bit4ByteAlignedPat_mc<vt>; +} let AddedComplexity = 100 in { @@ -826,9 +842,12 @@ defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max">; defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin">; defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax">; defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap">; + +let SubtargetPredicate = HasLDSFPAtomics in { defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin">; defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax">; defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd">; +} // 64-bit atomics. 
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap">; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 419513bdc2482..9c2f2e7eecd14 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -18,7 +18,6 @@ #include "Disassembler/AMDGPUDisassembler.h" #include "AMDGPU.h" -#include "AMDGPURegisterInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" #include "TargetInfo/AMDGPUTargetInfo.h" @@ -101,6 +100,18 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, return addOperand(Inst, MCOperand::createImm(Imm)); } +static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); + int64_t Offset; + if (DAsm->isVI()) { // VI supports 20-bit unsigned offsets. + Offset = Imm & 0xFFFFF; + } else { // GFX9+ supports 21-bit signed offsets. + Offset = SignExtend64<21>(Imm); + } + return addOperand(Inst, MCOperand::createImm(Offset)); +} + static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr, const void *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); @@ -285,6 +296,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Bytes.size() >= 8) { const uint64_t QW = eatBytes<uint64_t>(Bytes); + if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) { + Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address); + if (Res) { + if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) + == -1) + break; + if (convertDPP8Inst(MI) == MCDisassembler::Success) + break; + MI = MCInst(); // clear + } + } + Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; @@ -334,6 +357,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address); if (Res) break; + if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) { + Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address); + if (Res) break; + } + Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address); if (Res) break; @@ -351,13 +379,6 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address); } while (false); - if (Res && (MaxInstBytesNum - Bytes.size()) == 12 && (!HasLiteral || - !(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3))) { - MaxInstBytesNum = 8; - Bytes = Bytes_.slice(0, MaxInstBytesNum); - eatBytes<uint64_t>(Bytes); - } - if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 || MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx10 || @@ -931,6 +952,7 @@ unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const { return AGPR_32RegClassID; case OPW64: return AReg_64RegClassID; case OPW128: return AReg_128RegClassID; + case OPW256: return AReg_256RegClassID; case OPW512: return AReg_512RegClassID; case OPW1024: return AReg_1024RegClassID; } @@ -1202,8 +1224,6 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst, raw_ostream &/*cStream*/, int64_t Value, uint64_t /*Address*/, bool IsBranch, uint64_t /*Offset*/, uint64_t /*InstSize*/) { - using SymbolInfoTy = std::tuple<uint64_t, StringRef, uint8_t>; - using SectionSymbolsTy = 
std::vector<SymbolInfoTy>; if (!IsBranch) { return false; @@ -1215,11 +1235,11 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst, auto Result = std::find_if(Symbols->begin(), Symbols->end(), [Value](const SymbolInfoTy& Val) { - return std::get<0>(Val) == static_cast<uint64_t>(Value) - && std::get<2>(Val) == ELF::STT_NOTYPE; + return Val.Addr == static_cast<uint64_t>(Value) + && Val.Type == ELF::STT_NOTYPE; }); if (Result != Symbols->end()) { - auto *Sym = Ctx.getOrCreateSymbol(std::get<1>(*Result)); + auto *Sym = Ctx.getOrCreateSymbol(Result->Name); const auto *Add = MCSymbolRefExpr::create(Sym, Ctx); Inst.addOperand(MCOperand::createExpr(Add)); return true; diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 792e26d21f98d..97104a242d8c1 100644 --- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -69,11 +69,11 @@ multiclass RAT_ATOMIC<bits<6> op_ret, bits<6> op_noret, string name> { def _RTN: CF_MEM_RAT <op_ret, 0, 0xf, (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), (outs R600_Reg128:$out_gpr), - name ## "_RTN" ## " $rw_gpr, $index_gpr", [] >; + name # "_RTN" # " $rw_gpr, $index_gpr", [] >; def _NORET: CF_MEM_RAT <op_noret, 0, 0xf, (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), (outs R600_Reg128:$out_gpr), - name ## " $rw_gpr, $index_gpr", [] >; + name # " $rw_gpr, $index_gpr", [] >; } } @@ -118,11 +118,12 @@ def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; def : RsqPat<RECIPSQRT_IEEE_eg, f32>; +def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>; + def SIN_eg : SIN_Common<0x8D>; def COS_eg : COS_Common<0x8E>; def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>; -def : EGPat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; } // End SubtargetPredicate = isEG //===----------------------------------------------------------------------===// @@ -421,6 +422,7 @@ def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24", def : UMad24Pat<MULADD_UINT24_eg>; def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; +def : FSHRPattern <BIT_ALIGN_INT_eg>; def : ROTRPattern <BIT_ALIGN_INT_eg>; def MULADD_eg : MULADD_Common<0x14>; def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; @@ -570,7 +572,7 @@ class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> : } class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> : - R600_LDS_1A1D <lds_op, (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> { + R600_LDS_1A1D <lds_op, (outs R600_Reg32:$dst), name#"_RET", pattern, "OQAP, "> { let BaseOp = name; let usesCustomInserter = 1; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 2057cac346d45..69facada2e964 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1,4 +1,4 @@ -//===-- FLATInstructions.td - FLAT Instruction Defintions -----------------===// +//===-- FLATInstructions.td - FLAT Instruction Definitions ----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -100,7 +100,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : !if(ps.is_flat_scratch, 0b01, 0)); // Signed offset. Highest bit ignored for flat and treated as 12-bit - // unsigned for flat acceses. + // unsigned for flat accesses. 
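The SMEM offset handling introduced in this patch (validateSMEMOffset on the assembler side, decodeSMEMOffset in the disassembler above) rests on one width rule: VI encodes a 20-bit unsigned byte offset, GFX9 and later a 21-bit signed one, with the buffer forms staying unsigned. A simplified standalone sketch of the decode and range check (the real isLegalSMRDEncodedUnsignedOffset/SignedOffset helpers carry more per-target detail; names here are illustrative):

#include <cstdint>

// Decode the raw SMEM offset field: VI keeps a 20-bit unsigned value,
// GFX9+ a 21-bit two's-complement value (sign bit is bit 20).
int64_t decodeSmemOffsetField(uint32_t Imm, bool IsVI) {
  if (IsVI)
    return Imm & 0xFFFFF;                       // 20-bit unsigned
  int64_t Off = Imm & 0x1FFFFF;                 // keep 21 bits...
  if (Off & 0x100000)                           // ...and sign-extend bit 20
    Off -= 0x200000;
  return Off;
}

// Simplified range check matching the two error messages emitted by
// validateSMEMOffset earlier in the patch.
bool isLegalSmemOffsetSimplified(int64_t Off, bool IsVI, bool IsBuffer) {
  if (IsVI || IsBuffer)
    return Off >= 0 && Off < (1 << 20);         // 20-bit unsigned
  return Off >= -(1 << 20) && Off < (1 << 20);  // 21-bit signed
}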
bits<13> offset; bits<1> nv = 0; // XXX - What does this actually do? @@ -175,7 +175,7 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, } multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { - let is_flat_global = 1 in { + let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in { def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>, GlobalSaddrTable<0, opName>; def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>, @@ -183,8 +183,27 @@ multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit Ha } } +class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass, + bit HasTiedOutput = 0, bit HasSignedOffset = 0> : FLAT_Pseudo< + opName, + (outs regClass:$vdst), + !con((ins SReg_64:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc), + !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))), + " $vdst, $saddr$offset$glc$slc$dlc"> { + let is_flat_global = 1; + let has_data = 0; + let mayLoad = 1; + let has_vaddr = 0; + let has_saddr = 1; + let enabled_saddr = 1; + let maybeAtomic = 1; + + let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); + let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); +} + multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> { - let is_flat_global = 1 in { + let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in { def "" : FLAT_Store_Pseudo<opName, regClass, 1>, GlobalSaddrTable<0, opName>; def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>, @@ -192,6 +211,24 @@ multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> { } } +class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass, + bit HasSignedOffset = 0> : FLAT_Pseudo< + opName, + (outs), + !con( + (ins vdataClass:$vdata, SReg_64:$saddr), + (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)), + " $vdata, $saddr$offset$glc$slc$dlc"> { + let is_flat_global = 1; + let mayLoad = 0; + let mayStore = 1; + let has_vdst = 0; + let has_vaddr = 0; + let has_saddr = 1; + let enabled_saddr = 1; + let maybeAtomic = 1; +} + class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass, bit EnableSaddr = 0>: FLAT_Pseudo< opName, @@ -279,6 +316,7 @@ multiclass FLAT_Atomic_Pseudo< AtomicNoRet <opName, 0> { let PseudoInstr = NAME; let FPAtomic = isFP; + let AddedComplexity = -1; // Prefer global atomics if available } def _RTN : FLAT_AtomicRet_Pseudo <opName, @@ -290,6 +328,7 @@ multiclass FLAT_Atomic_Pseudo< GlobalSaddrTable<0, opName#"_rtn">, AtomicNoRet <opName, 1>{ let FPAtomic = isFP; + let AddedComplexity = -1; // Prefer global atomics if available } } @@ -367,10 +406,12 @@ multiclass FLAT_Global_Atomic_Pseudo< SDPatternOperator atomic_rtn = null_frag, SDPatternOperator atomic_no_rtn = null_frag, ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc> : - FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, atomic_no_rtn, data_vt, data_rc>, - FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic_rtn, data_vt, data_rc>; - + RegisterClass data_rc = vdst_rc> { + let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in { + defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, atomic_no_rtn, data_vt, data_rc>; + defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic_rtn, data_vt, data_rc>; + } +} //===----------------------------------------------------------------------===// // Flat Instructions @@ -507,7 +548,6 @@ defm FLAT_ATOMIC_FMAX_X2 : 
FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", } // End SubtargetPredicate = isGFX7GFX10 -let SubtargetPredicate = HasFlatGlobalInsts in { defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>; defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; @@ -523,6 +563,8 @@ defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16" defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32, 1>; defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32, 1>; defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>; +let OtherPredicates = [HasGFX10_BEncoding] in +def GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPR_32>; defm GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>; defm GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>; @@ -530,6 +572,8 @@ defm GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>; defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>; defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>; +let OtherPredicates = [HasGFX10_BEncoding] in +def GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPR_32>; defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>; defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>; @@ -615,9 +659,12 @@ defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2", defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64>; + +let SubtargetPredicate = HasGFX10_BEncoding in +defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo_RTN <"global_atomic_csub", + VGPR_32, i32, atomic_csub_global_32>; } // End is_flat_global = 1 -} // End SubtargetPredicate = HasFlatGlobalInsts let SubtargetPredicate = HasFlatScratchInsts in { @@ -912,6 +959,7 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>; def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>; def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global_32, i32, v2i32>; def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CSUB_RTN, atomic_csub_global_32, i32>; def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>; def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>; @@ -1212,6 +1260,9 @@ multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op> : FLAT_Real_RTN_gfx10<op>, FLAT_Real_SADDR_RTN_gfx10<op>; +multiclass FLAT_Real_GlblAtomics_RTN_gfx10<bits<7> op> : + FLAT_Real_RTN_gfx10<op>, + FLAT_Real_SADDR_RTN_gfx10<op>; // ENC_FLAT. 
defm FLAT_LOAD_UBYTE : FLAT_Real_Base_gfx10<0x008>; @@ -1297,6 +1348,7 @@ defm GLOBAL_ATOMIC_SWAP : FLAT_Real_GlblAtomics_gfx10<0x030>; defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Real_GlblAtomics_gfx10<0x031>; defm GLOBAL_ATOMIC_ADD : FLAT_Real_GlblAtomics_gfx10<0x032>; defm GLOBAL_ATOMIC_SUB : FLAT_Real_GlblAtomics_gfx10<0x033>; +defm GLOBAL_ATOMIC_CSUB : FLAT_Real_GlblAtomics_RTN_gfx10<0x034>; defm GLOBAL_ATOMIC_SMIN : FLAT_Real_GlblAtomics_gfx10<0x035>; defm GLOBAL_ATOMIC_UMIN : FLAT_Real_GlblAtomics_gfx10<0x036>; defm GLOBAL_ATOMIC_SMAX : FLAT_Real_GlblAtomics_gfx10<0x037>; @@ -1325,7 +1377,8 @@ defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05d>; defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x05e>; defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f>; defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>; - +defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_Base_gfx10<0x016>; +defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_Base_gfx10<0x017>; // ENC_FLAT_SCRATCH. defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_gfx10<0x008>; diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index 10e2c3a263f17..719a968b83147 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -105,6 +105,11 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties() + .set(MachineFunctionProperties::Property::IsSSA); + } + private: int getDPPOp(unsigned Op) const; }; @@ -168,7 +173,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, } auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI, - OrigMI.getDebugLoc(), TII->get(DPPOp)); + OrigMI.getDebugLoc(), TII->get(DPPOp)) + .setMIFlags(OrigMI.getFlags()); + bool Fail = false; do { auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst); @@ -506,15 +513,32 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { break; } + auto *Src0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0); + auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); + if (Use != Src0 && !(Use == Src1 && OrigMI.isCommutable())) { // [1] + LLVM_DEBUG(dbgs() << " failed: no suitable operands\n"); + break; + } + + assert(Src0 && "Src1 without Src0?"); + if (Src1 && Src1->isIdenticalTo(*Src0)) { + assert(Src1->isReg()); + LLVM_DEBUG( + dbgs() + << " " << OrigMI + << " failed: DPP register is used more than once per instruction\n"); + break; + } + LLVM_DEBUG(dbgs() << " combining: " << OrigMI); - if (Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) { + if (Use == Src0) { if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ)) { DPPMIs.push_back(DPPInst); Rollback = false; } - } else if (OrigMI.isCommutable() && - Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + } else { + assert(Use == Src1 && OrigMI.isCommutable()); // by check [1] auto *BB = OrigMI.getParent(); auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI); BB->insert(OrigMI, NewMI); @@ -528,8 +552,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { } else LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n"); NewMI->eraseFromParent(); - } else - LLVM_DEBUG(dbgs() << " failed: no suitable operands\n"); + } if (Rollback) break; OrigMIs.push_back(&OrigMI); @@ -562,8 +585,6 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); TII = ST.getInstrInfo(); - assert(MRI->isSSA() && 
"Must be run on SSA"); - bool Changed = false; for (auto &MBB : MF) { for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) { diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 3ef5a77af45e2..8482dbfec250b 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -228,11 +228,6 @@ void GCNHazardRecognizer::processBundle() { CurrCycleInstr = nullptr; } -unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) { - IsHazardRecognizerMode = false; - return PreEmitNoopsCommon(SU->getInstr()); -} - unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { IsHazardRecognizerMode = true; CurrCycleInstr = MI; @@ -486,6 +481,14 @@ void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) { addRegsToSet(TRI, MI.uses(), ClauseUses); } +static bool breaksSMEMSoftClause(MachineInstr *MI) { + return !SIInstrInfo::isSMRD(*MI); +} + +static bool breaksVMEMSoftClause(MachineInstr *MI) { + return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI); +} + int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { // SMEM soft clause are only present on VI+, and only matter if xnack is // enabled. @@ -512,7 +515,7 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { if (!MI) break; - if (IsSMRD != SIInstrInfo::isSMRD(*MI)) + if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI)) break; addClauseInst(*MI); diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 6aa2e70dfbfb9..cd17f2755bd10 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -105,7 +105,6 @@ public: void EmitInstruction(MachineInstr *MI) override; HazardType getHazardType(SUnit *SU, int Stalls) override; void EmitNoop() override; - unsigned PreEmitNoops(SUnit *SU) override; unsigned PreEmitNoops(MachineInstr *) override; unsigned PreEmitNoopsCommon(MachineInstr *); void AdvanceCycle() override; diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 90ab6a14ce20d..75a02c8390343 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -5,6 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the class GCNIterativeScheduler. +/// +//===----------------------------------------------------------------------===// #include "GCNIterativeScheduler.h" #include "AMDGPUSubtarget.h" diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h index e6f83914af5ba..a0d4f432aa48d 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h @@ -5,6 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the class GCNIterativeScheduler, which uses an iterative +/// approach to find a best schedule for GCN architecture. It basically makes +/// use of various lightweight schedules, scores them, chooses best one based on +/// their scores, and finally implements the chosen one. 
+/// +//===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H #define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H diff --git a/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp index c469cf290e264..884b2e17289c5 100644 --- a/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp @@ -5,6 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines and imlements the class GCNMinRegScheduler, which +/// implements an experimental, simple scheduler whose main goal is to learn +/// ways about consuming less possible registers for a region. +/// +//===----------------------------------------------------------------------===// #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallPtrSet.h" @@ -207,9 +214,8 @@ void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) { LLVM_DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum << ")'s non-ready successors of " << Priority << " priority in ready queue: "); - const auto SetEnd = Set.end(); for (auto &C : RQ) { - if (Set.find(C.SU) != SetEnd) { + if (Set.count(C.SU)) { C.Priority = Priority; LLVM_DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')'); } diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index f6023f3a40a27..57346087d0175 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -286,8 +286,15 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { } Intervals.push_back(LI); OrigRegs.push_back(VRM->getPhys(Reg)); - MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex(); - MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex(); + if (LI->empty()) { + // The address input is undef, so it doesn't contribute to the relevant + // range. Seed a reasonable index range if required. + if (I == 0) + MinInd = MaxInd = LIS->getInstructionIndex(*MI); + continue; + } + MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex(); + MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex(); } if (Intervals.empty()) diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index b926041afb2fe..17e6098d880d5 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -183,3 +183,7 @@ def : ProcessorModel<"gfx1011", GFX10SpeedModel, def : ProcessorModel<"gfx1012", GFX10SpeedModel, FeatureISAVersion10_1_2.Features >; + +def : ProcessorModel<"gfx1030", GFX10SpeedModel, + FeatureISAVersion10_3_0.Features +>; diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp index 76593bc0e5aca..98d971630ca4f 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp @@ -168,13 +168,15 @@ private: // 8 banks for SGPRs. // Registers already processed and recorded in RegsUsed are excluded. // If Bank is not -1 assume Reg:SubReg to belong to that Bank. - unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank); + uint32_t getRegBankMask(unsigned Reg, unsigned SubReg, int Bank); - // Return number of stalls in the instructions. - // UsedBanks has bits set for the banks used by all operands. 
- // If Reg and Bank provided substitute the Reg with the Bank. - unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks, - unsigned Reg = AMDGPU::NoRegister, int Bank = -1); + // Analyze one instruction returning the number of stalls and a mask of the + // banks used by all operands. + // If Reg and Bank are provided, assume all uses of Reg will be replaced with + // a register chosen from Bank. + std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI, + unsigned Reg = AMDGPU::NoRegister, + int Bank = -1); // Return true if register is regular VGPR or SGPR or their tuples. // Returns false for special registers like m0, vcc etc. @@ -280,7 +282,9 @@ unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); unsigned Size = TRI->getRegSizeInBits(*RC); - if (Size > 32) + if (Size == 16) + Reg = TRI->get32BitRegister(Reg); + else if (Size > 32) Reg = TRI->getSubReg(Reg, AMDGPU::sub0); if (TRI->hasVGPRs(RC)) { @@ -292,7 +296,7 @@ unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET; } -unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, +uint32_t GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, int Bank) { if (Register::isVirtualRegister(Reg)) { if (!VRM->isAssignedReg(Reg)) @@ -306,14 +310,21 @@ unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, } const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - unsigned Size = TRI->getRegSizeInBits(*RC) / 32; - if (Size > 1) - Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + unsigned Size = TRI->getRegSizeInBits(*RC); + + if (Size == 16) { + Reg = TRI->get32BitRegister(Reg); + Size = 1; + } else { + Size /= 32; + if (Size > 1) + Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + } if (TRI->hasVGPRs(RC)) { // VGPRs have 4 banks assigned in a round-robin fashion. Reg -= AMDGPU::VGPR0; - unsigned Mask = (1 << Size) - 1; + uint32_t Mask = maskTrailingOnes<uint32_t>(Size); unsigned Used = 0; // Bitmask lacks an extract method for (unsigned I = 0; I < Size; ++I) @@ -321,7 +332,7 @@ unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, Used |= 1 << I; RegsUsed.set(Reg, Reg + Size); Mask &= ~Used; - Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank); + Mask <<= (Bank == -1) ? 
Reg % NUM_VGPR_BANKS : uint32_t(Bank); return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; } @@ -347,15 +358,14 @@ unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, return Mask << SGPR_BANK_OFFSET; } -unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI, - unsigned& UsedBanks, - unsigned Reg, - int Bank) { +std::pair<unsigned, unsigned> +GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg, + int Bank) { unsigned StallCycles = 0; - UsedBanks = 0; + unsigned UsedBanks = 0; if (MI.isDebugValue()) - return 0; + return std::make_pair(StallCycles, UsedBanks); RegsUsed.reset(); OperandMasks.clear(); @@ -372,30 +382,30 @@ unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI, unsigned ShiftedBank = Bank; if (Bank != -1 && R == Reg && Op.getSubReg()) { - unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger(); - if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) { + unsigned Offset = TRI->getChannelFromSubReg(Op.getSubReg()); + LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()); + if (Offset && Bank < NUM_VGPR_BANKS) { // If a register spans all banks we cannot shift it to avoid conflict. - if (countPopulation(LM) >= NUM_VGPR_BANKS) + if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS) continue; - ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS; - } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) { + ShiftedBank = (Bank + Offset) % NUM_VGPR_BANKS; + } else if (Offset > 1 && Bank >= SGPR_BANK_OFFSET) { // If a register spans all banks we cannot shift it to avoid conflict. - if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS) + if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS) continue; - ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET + - (countTrailingZeros(LM) >> 1)) % - NUM_SGPR_BANKS; + ShiftedBank = SGPR_BANK_OFFSET + + (Bank - SGPR_BANK_OFFSET + (Offset >> 1)) % NUM_SGPR_BANKS; } } - unsigned Mask = getRegBankMask(R, Op.getSubReg(), + uint32_t Mask = getRegBankMask(R, Op.getSubReg(), (Reg == R) ? ShiftedBank : -1); StallCycles += countPopulation(UsedBanks & Mask); UsedBanks |= Mask; OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask)); } - return StallCycles; + return std::make_pair(StallCycles, UsedBanks); } unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, @@ -440,10 +450,19 @@ bool GCNRegBankReassign::isReassignable(unsigned Reg) const { } const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); + unsigned Size = TRI->getRegSizeInBits(*RC); + + // TODO: Support 16 bit registers. Those needs to be moved with their + // parent VGPR_32 and potentially a sibling 16 bit sub-register. 
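(Aside: a stand-alone sketch, not the pass itself, of the bank-conflict model GCNRegBankReassign reasons about in the hunks above. Per the pass's own comments, VGPRs fall into NUM_VGPR_BANKS = 4 banks assigned round-robin by register number (SGPRs into 8 banks), and each operand whose bank is already occupied by an earlier operand of the same instruction adds a stall, matching the `StallCycles += countPopulation(UsedBanks & Mask)` accumulation. The helper below also mirrors the reworked `analyzeInst()` interface by returning the stall count and used-bank mask as a `std::pair` rather than through an out-parameter. The function name, the bare register numbers, and the single-bank-per-operand rule are simplifying assumptions for illustration only.)

```cpp
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Simplified model: 4 VGPR banks, assigned round-robin by register number.
constexpr unsigned NumVGPRBanks = 4;

// Return {stall count, mask of banks used} for one "instruction", given the
// VGPR numbers of its register operands. An operand whose bank is already set
// in the mask costs one stall.
static std::pair<unsigned, uint32_t>
analyzeOperands(const std::vector<unsigned> &VGPRs) {
  unsigned Stalls = 0;
  uint32_t UsedBanks = 0;
  for (unsigned Reg : VGPRs) {
    uint32_t Mask = 1u << (Reg % NumVGPRBanks);
    if (UsedBanks & Mask) // bank already read by this instruction -> stall
      ++Stalls;
    UsedBanks |= Mask;
  }
  return {Stalls, UsedBanks};
}

int main() {
  // v0, v4 and v8 all land in bank 0 -> two stalls; v1 is conflict-free.
  auto [Stalls, UsedBanks] = analyzeOperands({0, 4, 8, 1});
  std::printf("stalls=%u banks=0x%x\n", Stalls, UsedBanks); // stalls=2 banks=0x3
}
```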
+ if (Size < 32) + return false; + if (TRI->hasVGPRs(RC)) return true; - unsigned Size = TRI->getRegSizeInBits(*RC); + if (Size == 16) + return AMDGPU::SGPR_LO16RegClass.contains(PhysReg); + if (Size > 32) PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0); @@ -496,16 +515,16 @@ unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg, unsigned FreeBanks = getFreeBanks(Mask, UsedBanks); - unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger(); - if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) { - unsigned Shift = countTrailingZeros(LM); + unsigned Offset = TRI->getChannelFromSubReg(SubReg); + if (Offset && (Mask & VGPR_BANK_MASK)) { + unsigned Shift = Offset; if (Shift >= NUM_VGPR_BANKS) return 0; unsigned VB = FreeBanks & VGPR_BANK_MASK; FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) & VGPR_BANK_MASK; - } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) { - unsigned Shift = countTrailingZeros(LM) >> 1; + } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) { + unsigned Shift = Offset >> 1; if (Shift >= NUM_SGPR_BANKS) return 0; unsigned SB = FreeBanks >> SGPR_BANK_OFFSET; @@ -570,7 +589,6 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, unsigned Reg, int Bank, bool Collect) { unsigned TotalStallCycles = 0; - unsigned UsedBanks = 0; SmallSet<const MachineInstr *, 16> Visited; for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) { @@ -578,7 +596,9 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, continue; if (!Visited.insert(&MI).second) continue; - unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank); + unsigned StallCycles; + unsigned UsedBanks; + std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, Bank); TotalStallCycles += StallCycles; if (Collect) collectCandidates(MI, UsedBanks, StallCycles); @@ -636,7 +656,11 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) { struct BankStall { BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}; - bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; } + bool operator<(const BankStall &RHS) const { + if (Stalls == RHS.Stalls) + return Bank < RHS.Bank; + return Stalls > RHS.Stalls; + } unsigned Bank; unsigned Stalls; }; @@ -653,7 +677,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) { } } } - std::sort(BankStalls.begin(), BankStalls.end()); + llvm::sort(BankStalls); Register OrigReg = VRM->getPhys(C.Reg); LRM->unassign(LI); @@ -695,8 +719,9 @@ unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, if (MI.isBundle()) continue; // we analyze the instructions inside the bundle individually - unsigned UsedBanks = 0; - unsigned StallCycles = analyzeInst(MI, UsedBanks); + unsigned StallCycles; + unsigned UsedBanks; + std::tie(StallCycles, UsedBanks) = analyzeInst(MI); if (Collect) collectCandidates(MI, UsedBanks, StallCycles); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index d593204cba059..86a3cb9af32fa 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -5,6 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the GCNRegPressure class. 
+/// +//===----------------------------------------------------------------------===// #include "GCNRegPressure.h" #include "AMDGPUSubtarget.h" @@ -98,7 +103,8 @@ void GCNRegPressure::inc(unsigned Reg, LaneBitmask PrevMask, LaneBitmask NewMask, const MachineRegisterInfo &MRI) { - if (NewMask == PrevMask) + if (SIRegisterInfo::getNumCoveredRegs(NewMask) == + SIRegisterInfo::getNumCoveredRegs(PrevMask)) return; int Sign = 1; @@ -106,25 +112,21 @@ void GCNRegPressure::inc(unsigned Reg, std::swap(NewMask, PrevMask); Sign = -1; } -#ifndef NDEBUG - const auto MaxMask = MRI.getMaxLaneMaskForVReg(Reg); -#endif + switch (auto Kind = getRegKind(Reg, MRI)) { case SGPR32: case VGPR32: case AGPR32: - assert(PrevMask.none() && NewMask == MaxMask); Value[Kind] += Sign; break; case SGPR_TUPLE: case VGPR_TUPLE: case AGPR_TUPLE: - assert(NewMask < MaxMask || NewMask == MaxMask); assert(PrevMask < NewMask); Value[Kind == SGPR_TUPLE ? SGPR32 : Kind == AGPR_TUPLE ? AGPR32 : VGPR32] += - Sign * (~PrevMask & NewMask).getNumLanes(); + Sign * SIRegisterInfo::getNumCoveredRegs(~PrevMask & NewMask); if (PrevMask.none()) { assert(NewMask.any()); @@ -216,7 +218,7 @@ static LaneBitmask getUsedRegMask(const MachineOperand &MO, return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg); auto MaxMask = MRI.getMaxLaneMaskForVReg(MO.getReg()); - if (MaxMask == LaneBitmask::getLane(0)) // cannot have subregs + if (SIRegisterInfo::getNumCoveredRegs(MaxMask) > 1) // cannot have subregs return MaxMask; // For a tentative schedule LIS isn't updated yet but livemask should remain @@ -327,8 +329,9 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { // update max pressure MaxPressure = max(AtMIPressure, MaxPressure); - for (const auto &MO : MI.defs()) { - if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()) || MO.isDead()) + for (const auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef() || + !Register::isVirtualRegister(MO.getReg()) || MO.isDead()) continue; auto Reg = MO.getReg(); @@ -403,8 +406,8 @@ void GCNDownwardRPTracker::advanceToNext() { LastTrackedMI = &*NextMI++; // Add new registers or mask bits. - for (const auto &MO : LastTrackedMI->defs()) { - if (!MO.isReg()) + for (const auto &MO : LastTrackedMI->operands()) { + if (!MO.isReg() || !MO.isDef()) continue; Register Reg = MO.getReg(); if (!Register::isVirtualRegister(Reg)) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 5862cdb041669..2ef79410719f6 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -5,6 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the GCNRegPressure class, which tracks registry pressure +/// by bookkeeping number of SGPR/VGPRs used, weights for large SGPR/VGPRs. It +/// also implements a compare function, which compares different register +/// pressures, and declares one with max occupance as winner. +/// +//===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H #define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H @@ -208,7 +216,7 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) { auto SI = SII.getInstructionIndex(*I); Indexes.push_back(After ? 
SI.getDeadSlot() : SI.getBaseIndex()); } - std::sort(Indexes.begin(), Indexes.end()); + llvm::sort(Indexes); auto &MRI = (*R.begin())->getParent()->getParent()->getRegInfo(); DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index e109eed5f6071..deed50b6db7df 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -50,9 +50,9 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) { VGPRCriticalLimit = ST.getMaxNumVGPRs(TargetOccupancy); } else { SGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF, - SRI->getSGPRPressureSet()); + AMDGPU::RegisterPressureSets::SReg_32); VGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF, - SRI->getVGPRPressureSet()); + AMDGPU::RegisterPressureSets::VGPR_32); } SGPRCriticalLimit -= ErrorMargin; @@ -83,8 +83,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); } - unsigned NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()]; - unsigned NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()]; + unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; + unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; // If two instructions increase the pressure of different register sets // by the same amount, the generic scheduler will prefer to schedule the @@ -109,12 +109,12 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU // marked as RegExcess in tryCandidate() when they are compared with // instructions that increase the register pressure. if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { - Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet()); + Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); } if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { - Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet()); + Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); } @@ -128,10 +128,12 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU if (SGPRDelta >= 0 || VGPRDelta >= 0) { if (SGPRDelta > VGPRDelta) { - Cand.RPDelta.CriticalMax = PressureChange(SRI->getSGPRPressureSet()); + Cand.RPDelta.CriticalMax = + PressureChange(AMDGPU::RegisterPressureSets::SReg_32); Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta); } else { - Cand.RPDelta.CriticalMax = PressureChange(SRI->getVGPRPressureSet()); + Cand.RPDelta.CriticalMax = + PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta); } } @@ -145,8 +147,8 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, SchedCandidate &Cand) { const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); - unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()]; - unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()]; + unsigned SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; + unsigned VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; ReadyQueue &Q = Zone.Available; for (SUnit *SU : Q) { @@ -231,33 +233,11 @@ SUnit 
*GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { // Pick best from BotCand and TopCand. LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); dbgs() << "Bot Cand: "; traceCandidate(BotCand);); - SchedCandidate Cand; - if (TopCand.Reason == BotCand.Reason) { - Cand = BotCand; - GenericSchedulerBase::CandReason TopReason = TopCand.Reason; - TopCand.Reason = NoCand; - GenericScheduler::tryCandidate(Cand, TopCand, nullptr); - if (TopCand.Reason != NoCand) { - Cand.setBest(TopCand); - } else { - TopCand.Reason = TopReason; - } - } else { - if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) { - Cand = TopCand; - } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) { - Cand = BotCand; - } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) { - Cand = TopCand; - } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) { - Cand = BotCand; - } else { - if (BotCand.Reason > TopCand.Reason) { - Cand = TopCand; - } else { - Cand = BotCand; - } - } + SchedCandidate Cand = BotCand; + TopCand.Reason = NoCand; + GenericScheduler::tryCandidate(Cand, TopCand, nullptr); + if (TopCand.Reason != NoCand) { + Cand.setBest(TopCand); } LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); @@ -316,13 +296,13 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C, ST(MF.getSubtarget<GCNSubtarget>()), MFI(*MF.getInfo<SIMachineFunctionInfo>()), StartingOccupancy(MFI.getOccupancy()), - MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) { + MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) { LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); } void GCNScheduleDAGMILive::schedule() { - if (Stage == 0) { + if (Stage == Collect) { // Just record regions at the first pass. 
Regions.push_back(std::make_pair(RegionBegin, RegionEnd)); return; @@ -348,6 +328,7 @@ void GCNScheduleDAGMILive::schedule() { ScheduleDAGMILive::schedule(); Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); + RescheduleRegions[RegionIdx] = false; if (!LIS) return; @@ -389,20 +370,28 @@ void GCNScheduleDAGMILive::schedule() { << MinOccupancy << ".\n"); } + unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); + if (PressureAfter.getVGPRNum() > MaxVGPRs || + PressureAfter.getSGPRNum() > MaxSGPRs) + RescheduleRegions[RegionIdx] = true; + if (WavesAfter >= MinOccupancy) { - unsigned TotalVGPRs = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST); - unsigned TotalSGPRs = AMDGPU::IsaInfo::getAddressableNumSGPRs(&ST); - if (WavesAfter > MFI.getMinWavesPerEU() || + if (Stage == UnclusteredReschedule && + !PressureAfter.less(ST, PressureBefore)) { + LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); + } else if (WavesAfter > MFI.getMinWavesPerEU() || PressureAfter.less(ST, PressureBefore) || - (TotalVGPRs >= PressureAfter.getVGPRNum() && - TotalSGPRs >= PressureAfter.getSGPRNum())) { + !RescheduleRegions[RegionIdx]) { Pressure[RegionIdx] = PressureAfter; return; + } else { + LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); } - LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); } LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); + RescheduleRegions[RegionIdx] = true; RegionEnd = RegionBegin; for (MachineInstr *MI : Unsched) { if (MI->isDebugInstr()) @@ -532,33 +521,55 @@ void GCNScheduleDAGMILive::finalizeSchedule() { LiveIns.resize(Regions.size()); Pressure.resize(Regions.size()); + RescheduleRegions.resize(Regions.size()); + RescheduleRegions.set(); if (!Regions.empty()) BBLiveInMap = getBBLiveInMap(); + std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations; + do { Stage++; RegionIdx = 0; MachineBasicBlock *MBB = nullptr; - if (Stage > 1) { + if (Stage > InitialSchedule) { + if (!LIS) + break; + // Retry function scheduling if we found resulting occupancy and it is // lower than used for first pass scheduling. This will give more freedom // to schedule low register pressure blocks. // Code is partially copied from MachineSchedulerBase::scheduleRegions(). 
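(Aside: for orientation, a small stand-alone sketch, not the LLVM code, of the control flow this hunk builds up to. Scheduling now proceeds through the named stages added below (Collect, InitialSchedule, UnclusteredReschedule, ClusteredLowOccupancyReschedule), and a per-region "reschedule" flag lets the unclustered retry pass skip regions whose earlier schedule was kept. The `main` driver, the plain `std::vector<bool>` standing in for the real `BitVector RescheduleRegions`, and the hard-coded "region 1 was reverted" case are assumptions made only to keep the example self-contained.)

```cpp
#include <cstdio>
#include <vector>

// Stage names mirror the enum added to GCNScheduleDAGMILive.
enum Stage { Collect, InitialSchedule, UnclusteredReschedule,
             ClusteredLowOccupancyReschedule,
             LastStage = ClusteredLowOccupancyReschedule };

int main() {
  // Collect has already run: it only records region boundaries.
  // Every region starts out flagged for rescheduling.
  std::vector<bool> RescheduleRegions = {true, true, true};

  for (int S = InitialSchedule; S <= LastStage; ++S) {
    for (size_t R = 0; R < RescheduleRegions.size(); ++R) {
      // The unclustered retry pass only revisits regions that still need it.
      if (S == UnclusteredReschedule && !RescheduleRegions[R])
        continue;
      std::printf("stage %d schedules region %zu\n", S, R);
      // A schedule that is kept clears the flag; one that is reverted (or
      // still exceeds the SGPR/VGPR budget) would leave it set.
      if (R != 1)                 // pretend region 1 had to be reverted
        RescheduleRegions[R] = false;
    }
  }
}
```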
- if (!LIS || StartingOccupancy <= MinOccupancy) - break; + if (Stage == UnclusteredReschedule) { + if (RescheduleRegions.none()) + continue; + LLVM_DEBUG(dbgs() << + "Retrying function scheduling without clustering.\n"); + } + + if (Stage == ClusteredLowOccupancyReschedule) { + if (StartingOccupancy <= MinOccupancy) + break; - LLVM_DEBUG( - dbgs() - << "Retrying function scheduling with lowest recorded occupancy " - << MinOccupancy << ".\n"); + LLVM_DEBUG( + dbgs() + << "Retrying function scheduling with lowest recorded occupancy " + << MinOccupancy << ".\n"); - S.setTargetOccupancy(MinOccupancy); + S.setTargetOccupancy(MinOccupancy); + } } + if (Stage == UnclusteredReschedule) + SavedMutations.swap(Mutations); + for (auto Region : Regions) { + if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) + continue; + RegionBegin = Region.first; RegionEnd = Region.second; @@ -566,7 +577,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() { if (MBB) finishBlock(); MBB = RegionBegin->getParent(); startBlock(MBB); - if (Stage == 1) + if (Stage == InitialSchedule) computeBlockPressure(MBB); } @@ -594,5 +605,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() { } finishBlock(); - } while (Stage < 2); + if (Stage == UnclusteredReschedule) + SavedMutations.swap(Mutations); + } while (Stage != LastStage); } diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index dd687a930c79a..2d81d9977c31d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -64,6 +64,14 @@ public: class GCNScheduleDAGMILive final : public ScheduleDAGMILive { + enum : unsigned { + Collect, + InitialSchedule, + UnclusteredReschedule, + ClusteredLowOccupancyReschedule, + LastStage = ClusteredLowOccupancyReschedule + }; + const GCNSubtarget &ST; SIMachineFunctionInfo &MFI; @@ -84,6 +92,10 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32> Regions; + // Records if a region is not yet scheduled, or schedule has been reverted, + // or we generally desire to reschedule it. + BitVector RescheduleRegions; + // Region live-in cache. 
SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 1f94ab7991225..ea6e9038fd1e1 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -17,6 +17,7 @@ #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/EndianStream.h" #include "llvm/Support/TargetRegistry.h" #include "Utils/AMDGPUBaseInfo.h" @@ -39,8 +40,8 @@ public: const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; - void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, - MCInst &Res) const override; + void relaxInstruction(MCInst &Inst, + const MCSubtargetInfo &STI) const override; bool mayNeedRelaxation(const MCInst &Inst, const MCSubtargetInfo &STI) const override; @@ -53,12 +54,13 @@ public: } //End anonymous namespace -void AMDGPUAsmBackend::relaxInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI, - MCInst &Res) const { +void AMDGPUAsmBackend::relaxInstruction(MCInst &Inst, + const MCSubtargetInfo &STI) const { + MCInst Res; unsigned RelaxedOpcode = AMDGPU::getSOPPWithRelaxation(Inst.getOpcode()); Res.setOpcode(RelaxedOpcode); Res.addOperand(Inst.getOperand(0)); + Inst = std::move(Res); return; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index d352219a7a982..619fde74e88d3 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -6,8 +6,10 @@ // //===----------------------------------------------------------------------===// +#include "AMDGPUFixupKinds.h" #include "AMDGPUMCTargetDesc.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" @@ -80,6 +82,15 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AMDGPU_ABS64; } + if (Fixup.getTargetKind() == AMDGPU::fixup_si_sopp_br) { + const auto *SymA = Target.getSymA(); + assert(SymA); + + Ctx.reportError(Fixup.getLoc(), + Twine("undefined label '") + SymA->getSymbol().getName() + "'"); + return ELF::R_AMDGPU_NONE; + } + llvm_unreachable("unhandled relocation type"); } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index f65dc25d7eec5..fe063d33ea3e0 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -18,6 +18,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -26,6 +27,28 @@ using namespace llvm; using namespace llvm::AMDGPU; +static cl::opt<bool> Keep16BitSuffixes( + "amdgpu-keep-16-bit-reg-suffixes", + cl::desc("Keep .l and .h suffixes in asm for debugging purposes"), + cl::init(false), + cl::ReallyHidden); + +void AMDGPUInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + // FIXME: The current implementation of + // AsmParser::parseRegisterOrRegisterNumber in MC implies we either emit this + // as an integer or we provide a name which 
represents a physical register. + // For CFI instructions we really want to emit a name for the DWARF register + // instead, because there may be multiple DWARF registers corresponding to a + // single physical register. One case where this problem manifests is with + // wave32/wave64 where using the physical register name is ambiguous: if we + // write e.g. `.cfi_undefined v0` we lose information about the wavefront + // size which we need to encode the register in the final DWARF. Ideally we + // would extend MC to support parsing DWARF register names so we could do + // something like `.cfi_undefined dwarf_wave32_v0`. For now we just live with + // non-pretty DWARF register names in assembly text. + OS << RegNo; +} + void AMDGPUInstPrinter::printInst(const MCInst *MI, uint64_t Address, StringRef Annot, const MCSubtargetInfo &STI, raw_ostream &OS) { @@ -164,10 +187,10 @@ void AMDGPUInstPrinter::printSMRDOffset8(const MCInst *MI, unsigned OpNo, printU32ImmOperand(MI, OpNo, STI, O); } -void AMDGPUInstPrinter::printSMRDOffset20(const MCInst *MI, unsigned OpNo, +void AMDGPUInstPrinter::printSMEMOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - printU32ImmOperand(MI, OpNo, STI, O); + O << formatHex(MI->getOperand(OpNo).getImm()); } void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, @@ -244,6 +267,11 @@ void AMDGPUInstPrinter::printR128A16(const MCInst *MI, unsigned OpNo, printNamedBit(MI, OpNo, O, "r128"); } +void AMDGPUInstPrinter::printGFX10A16(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + printNamedBit(MI, OpNo, O, "a16"); +} + void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "lwe"); @@ -287,7 +315,6 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, switch (RegNo) { case AMDGPU::FP_REG: case AMDGPU::SP_REG: - case AMDGPU::SCRATCH_WAVE_OFFSET_REG: case AMDGPU::PRIVATE_RSRC_REG: llvm_unreachable("pseudo-register should not ever be emitted"); case AMDGPU::SCC: @@ -297,7 +324,12 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, } #endif - O << getRegisterName(RegNo); + StringRef RegName(getRegisterName(RegNo)); + if (!Keep16BitSuffixes) + if (!RegName.consume_back(".l")) + RegName.consume_back(".h"); + + O << RegName; } void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, @@ -346,11 +378,21 @@ void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo, printOperand(MI, OpNo, STI, O); } +void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { + int16_t SImm = static_cast<int16_t>(Imm); + if (isInlinableIntLiteral(SImm)) + O << SImm; + else + O << formatHex(static_cast<uint64_t>(Imm)); +} + void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { int16_t SImm = static_cast<int16_t>(Imm); - if (SImm >= -16 && SImm <= 64) { + if (isInlinableIntLiteral(SImm)) { O << SImm; return; } @@ -518,7 +560,8 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, if (Op.isReg()) { printRegOperand(Op.getReg(), O, MRI); } else if (Op.isImm()) { - switch (Desc.OpInfo[OpNo].OperandType) { + const uint8_t OpTy = Desc.OpInfo[OpNo].OperandType; + switch (OpTy) { case AMDGPU::OPERAND_REG_IMM_INT32: case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: @@ -535,10 +578,12 @@ void AMDGPUInstPrinter::printOperand(const 
MCInst *MI, unsigned OpNo, printImmediate64(Op.getImm(), STI, O); break; case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: - case AMDGPU::OPERAND_REG_INLINE_AC_FP16: case AMDGPU::OPERAND_REG_IMM_INT16: + printImmediateInt16(Op.getImm(), STI, O); + break; + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: case AMDGPU::OPERAND_REG_IMM_FP16: printImmediate16(Op.getImm(), STI, O); break; @@ -549,11 +594,19 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, printImmediate32(Op.getImm(), STI, O); break; } + + // Deal with 16-bit FP inline immediates not working. + if (OpTy == AMDGPU::OPERAND_REG_IMM_V2FP16) { + printImmediate16(static_cast<uint16_t>(Op.getImm()), STI, O); + break; + } LLVM_FALLTHROUGH; - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + printImmediateInt16(static_cast<uint16_t>(Op.getImm()), STI, O); + break; + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: printImmediateV216(Op.getImm(), STI, O); break; case MCOI::OPERAND_UNKNOWN: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index ba53003e90413..6dfd23ea72e67 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -23,6 +23,7 @@ public: : MCInstPrinter(MAI, MII, MRI) {} //Autogenerated by tblgen + void printRegName(raw_ostream &OS, unsigned RegNo) const override; void printInstruction(const MCInst *MI, uint64_t Address, const MCSubtargetInfo &STI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); @@ -60,7 +61,7 @@ private: raw_ostream &O); void printSMRDOffset8(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printSMRDOffset20(const MCInst *MI, unsigned OpNo, + void printSMEMOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); @@ -86,6 +87,8 @@ private: raw_ostream &O); void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printGFX10A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printLWE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printD16(const MCInst *MI, unsigned OpNo, @@ -102,8 +105,12 @@ private: raw_ostream &O); void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); + void printImmediateIntV216(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, @@ -112,6 +119,10 @@ private: raw_ostream &O); void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printOperand(const MCInst *MI, uint64_t /*Address*/, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O) { + printOperand(MI, OpNum, STI, O); + } void printOperandAndFPInputMods(const MCInst *MI, 
unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 9644e66fda4e5..687cfef4559f3 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -43,6 +43,9 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT, WeakRefDirective = ".weakref\t"; //===--- Dwarf Emission Directives -----------------------------------===// SupportsDebugInformation = true; + DwarfRegNumForCFI = true; + + UseIntegratedAssembler = false; } bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index 62757a7078905..d7d8c8181b02f 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -51,6 +51,12 @@ public: return 0; } + virtual unsigned getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + virtual unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 9507836c64c2b..7d3235efc59e6 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -61,7 +61,13 @@ static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) { if (TT.getArch() == Triple::r600) InitR600MCRegisterInfo(X, 0); else - InitAMDGPUMCRegisterInfo(X, 0); + InitAMDGPUMCRegisterInfo(X, AMDGPU::PC_REG); + return X; +} + +MCRegisterInfo *llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitAMDGPUMCRegisterInfo(X, AMDGPU::PC_REG, DwarfFlavour); return X; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 9754d31fee600..b9cdbc6502e57 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -33,6 +33,10 @@ class Target; class Triple; class raw_pwrite_stream; +enum AMDGPUDwarfFlavour { Wave64 = 0, Wave32 = 1 }; + +MCRegisterInfo *createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour); + MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index fef665c2900ef..3d202d7960d65 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -43,7 +43,7 @@ using namespace llvm::AMDGPU::HSAMD; bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) { HSAMD::Metadata HSAMetadata; - if (HSAMD::fromString(HSAMetadataString, HSAMetadata)) + if (HSAMD::fromString(std::string(HSAMetadataString), HSAMetadata)) return false; return EmitHSAMetadata(HSAMetadata); @@ -97,6 +97,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case 
ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030: AK = GK_GFX1030; break; case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; } @@ -148,6 +149,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011; case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012; + case GK_GFX1030: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030; case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE; } @@ -210,9 +212,9 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, } void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, - unsigned Align) { - OS << "\t.amdgpu_lds " << Symbol->getName() << ", " << Size << ", " << Align - << '\n'; + Align Alignment) { + OS << "\t.amdgpu_lds " << Symbol->getName() << ", " << Size << ", " + << Alignment.value() << '\n'; } bool AMDGPUTargetAsmStreamer::EmitISAVersion(StringRef IsaVersionString) { @@ -393,9 +395,9 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( // AMDGPUTargetELFStreamer //===----------------------------------------------------------------------===// -AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer( - MCStreamer &S, const MCSubtargetInfo &STI) - : AMDGPUTargetStreamer(S), Streamer(S) { +AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S, + const MCSubtargetInfo &STI) + : AMDGPUTargetStreamer(S), Streamer(S), Os(STI.getTargetTriple().getOS()) { MCAssembler &MCA = getStreamer().getAssembler(); unsigned EFlags = MCA.getELFHeaderEFlags(); @@ -427,7 +429,7 @@ void AMDGPUTargetELFStreamer::finish() { if (Blob.empty()) return; EmitNote(Vendor, MCConstantExpr::create(Blob.size(), getContext()), Type, - [&](MCELFStreamer &OS) { OS.EmitBytes(Blob); }); + [&](MCELFStreamer &OS) { OS.emitBytes(Blob); }); } void AMDGPUTargetELFStreamer::EmitNote( @@ -438,16 +440,22 @@ void AMDGPUTargetELFStreamer::EmitNote( auto NameSZ = Name.size() + 1; + unsigned NoteFlags = 0; + // TODO Apparently, this is currently needed for OpenCL as mentioned in + // https://reviews.llvm.org/D74995 + if (Os == Triple::AMDHSA) + NoteFlags = ELF::SHF_ALLOC; + S.PushSection(); - S.SwitchSection(Context.getELFSection( - ElfNote::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC)); - S.EmitIntValue(NameSZ, 4); // namesz - S.EmitValue(DescSZ, 4); // descz - S.EmitIntValue(NoteType, 4); // type - S.EmitBytes(Name); // name - S.EmitValueToAlignment(4, 0, 1, 0); // padding 0 + S.SwitchSection( + Context.getELFSection(ElfNote::SectionName, ELF::SHT_NOTE, NoteFlags)); + S.emitInt32(NameSZ); // namesz + S.emitValue(DescSZ, 4); // descz + S.emitInt32(NoteType); // type + S.emitBytes(Name); // name + S.emitValueToAlignment(4, 0, 1, 0); // padding 0 EmitDesc(S); // desc - S.EmitValueToAlignment(4, 0, 1, 0); // padding 0 + S.emitValueToAlignment(4, 0, 1, 0); // padding 0 S.PopSection(); } @@ -458,8 +466,8 @@ void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion( EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()), ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) { - OS.EmitIntValue(Major, 4); - OS.EmitIntValue(Minor, 4); + OS.emitInt32(Major); + OS.emitInt32(Minor); }); } @@ -478,15 +486,15 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major, EmitNote(ElfNote::NoteNameV2, 
MCConstantExpr::create(DescSZ, getContext()), ElfNote::NT_AMDGPU_HSA_ISA, [&](MCELFStreamer &OS) { - OS.EmitIntValue(VendorNameSize, 2); - OS.EmitIntValue(ArchNameSize, 2); - OS.EmitIntValue(Major, 4); - OS.EmitIntValue(Minor, 4); - OS.EmitIntValue(Stepping, 4); - OS.EmitBytes(VendorName); - OS.EmitIntValue(0, 1); // NULL terminate VendorName - OS.EmitBytes(ArchName); - OS.EmitIntValue(0, 1); // NULL terminte ArchName + OS.emitInt16(VendorNameSize); + OS.emitInt16(ArchNameSize); + OS.emitInt32(Major); + OS.emitInt32(Minor); + OS.emitInt32(Stepping); + OS.emitBytes(VendorName); + OS.emitInt8(0); // NULL terminate VendorName + OS.emitBytes(ArchName); + OS.emitInt8(0); // NULL terminte ArchName }); } @@ -495,7 +503,7 @@ AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { MCStreamer &OS = getStreamer(); OS.PushSection(); - OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header))); + OS.emitBytes(StringRef((const char*)&Header, sizeof(Header))); OS.PopSection(); } @@ -507,9 +515,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, } void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, - unsigned Align) { - assert(isPowerOf2_32(Align)); - + Align Alignment) { MCSymbolELF *SymbolELF = cast<MCSymbolELF>(Symbol); SymbolELF->setType(ELF::STT_OBJECT); @@ -518,7 +524,7 @@ void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, SymbolELF->setExternal(true); } - if (SymbolELF->declareCommon(Size, Align, true)) { + if (SymbolELF->declareCommon(Size, Alignment.value(), true)) { report_fatal_error("Symbol: " + Symbol->getName() + " redeclared as different type"); } @@ -539,9 +545,9 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) { EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_ISA, [&](MCELFStreamer &OS) { - OS.EmitLabel(DescBegin); - OS.EmitBytes(IsaVersionString); - OS.EmitLabel(DescEnd); + OS.emitLabel(DescBegin); + OS.emitBytes(IsaVersionString); + OS.emitLabel(DescEnd); }); return true; } @@ -566,9 +572,9 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc, EmitNote(ElfNote::NoteNameV3, DescSZ, ELF::NT_AMDGPU_METADATA, [&](MCELFStreamer &OS) { - OS.EmitLabel(DescBegin); - OS.EmitBytes(HSAMetadataString); - OS.EmitLabel(DescEnd); + OS.emitLabel(DescBegin); + OS.emitBytes(HSAMetadataString); + OS.emitLabel(DescEnd); }); return true; } @@ -590,9 +596,9 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata( EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_HSA_METADATA, [&](MCELFStreamer &OS) { - OS.EmitLabel(DescBegin); - OS.EmitBytes(HSAMetadataString); - OS.EmitLabel(DescEnd); + OS.emitLabel(DescBegin); + OS.emitBytes(HSAMetadataString); + OS.emitLabel(DescEnd); }); return true; } @@ -602,9 +608,9 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd() { MCStreamer &OS = getStreamer(); OS.PushSection(); - OS.EmitValueToAlignment(64, Encoded_s_code_end, 4); + OS.emitValueToAlignment(64, Encoded_s_code_end, 4); for (unsigned I = 0; I < 48; ++I) - OS.EmitIntValue(Encoded_s_code_end, 4); + OS.emitInt32(Encoded_s_code_end); OS.PopSection(); return true; } @@ -637,22 +643,22 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( if (KernelCodeSymbol->getVisibility() == ELF::STV_DEFAULT) KernelCodeSymbol->setVisibility(ELF::STV_PROTECTED); - Streamer.EmitLabel(KernelDescriptorSymbol); - Streamer.EmitBytes(StringRef( + Streamer.emitLabel(KernelDescriptorSymbol); + Streamer.emitBytes(StringRef( (const char*)&(KernelDescriptor), 
offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset))); // FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. The // expression being created is: // (start of kernel code) - (start of kernel descriptor) // It implies R_AMDGPU_REL64, but ends up being R_AMDGPU_ABS64. - Streamer.EmitValue(MCBinaryExpr::createSub( + Streamer.emitValue(MCBinaryExpr::createSub( MCSymbolRefExpr::create( KernelCodeSymbol, MCSymbolRefExpr::VK_AMDGPU_REL64, Context), MCSymbolRefExpr::create( KernelDescriptorSymbol, MCSymbolRefExpr::VK_None, Context), Context), sizeof(KernelDescriptor.kernel_code_entry_byte_offset)); - Streamer.EmitBytes(StringRef( + Streamer.emitBytes(StringRef( (const char*)&(KernelDescriptor) + offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) + sizeof(KernelDescriptor.kernel_code_entry_byte_offset), diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 683b3e363b9aa..a19d4646deb26 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -54,7 +54,7 @@ public: virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0; virtual void emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, - unsigned Align) = 0; + Align Alignment) = 0; /// \returns True on success, false on failure. virtual bool EmitISAVersion(StringRef IsaVersionString) = 0; @@ -110,7 +110,7 @@ public: void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; - void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override; + void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override; /// \returns True on success, false on failure. bool EmitISAVersion(StringRef IsaVersionString) override; @@ -133,6 +133,7 @@ public: class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { MCStreamer &Streamer; + Triple::OSType Os; void EmitNote(StringRef Name, const MCExpr *DescSize, unsigned NoteType, function_ref<void(MCELFStreamer &)> EmitDesc); @@ -157,7 +158,7 @@ public: void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; - void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override; + void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override; /// \returns True on success, false on failure. bool EmitISAVersion(StringRef IsaVersionString) override; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index 2f1f4e7a03928..f614705730501 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -47,7 +47,7 @@ public: /// Encode the instruction and write it to the OS. void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI) const override; /// \returns the encoding for an MCOperand. 
uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index f8ec3c36f0190..2cd6c3a81d2bf 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "AMDGPURegisterInfo.h" #include "MCTargetDesc/AMDGPUFixupKinds.h" #include "MCTargetDesc/AMDGPUMCCodeEmitter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -71,6 +70,10 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; + unsigned getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; @@ -105,6 +108,11 @@ static uint32_t getIntInlineImmEncoding(IntTy Imm) { return 0; } +static uint32_t getLit16IntEncoding(uint16_t Val, const MCSubtargetInfo &STI) { + uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val)); + return IntImm == 0 ? 255 : IntImm; +} + static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) { uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val)); if (IntImm != 0) @@ -249,23 +257,27 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, return getLit64Encoding(static_cast<uint64_t>(Imm), STI); case AMDGPU::OPERAND_REG_IMM_INT16: - case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI); + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_AC_FP16: // FIXME Is this correct? What do inline immediates do on SI for f16 src // which does not have f16 support? return getLit16Encoding(static_cast<uint16_t>(Imm), STI); - case AMDGPU::OPERAND_REG_IMM_V2INT16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: { if (!isUInt<16>(Imm) && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) return getLit32Encoding(static_cast<uint32_t>(Imm), STI); + if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) + return getLit16Encoding(static_cast<uint16_t>(Imm), STI); LLVM_FALLTHROUGH; + } case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI); + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { uint16_t Lo16 = static_cast<uint16_t>(Imm); uint32_t Encoding = getLit16Encoding(Lo16, STI); @@ -359,6 +371,15 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, return getMachineOpValue(MI, MO, Fixups, STI); } +unsigned SIMCCodeEmitter::getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + auto Offset = MI.getOperand(OpNo).getImm(); + // VI only supports 20-bit unsigned offsets. 
+ assert(!AMDGPU::isVI(STI) || isUInt<20>(Offset)); + return Offset; +} + unsigned SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, @@ -419,7 +440,13 @@ SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo, // instructions use acc[0:1] modifier bits to distinguish. These bits are // encoded as a virtual 9th bit of the register for these operands. if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Reg) || - MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg)) + MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_96RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_160RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg)) Enc |= 512; return Enc; diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 4006a6205fb87..2bfc2d5795333 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -1,4 +1,4 @@ -//===-- MIMGInstructions.td - MIMG Instruction Defintions -----------------===// +//===-- MIMGInstructions.td - MIMG Instruction Definitions ----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -35,6 +35,7 @@ class MIMGBaseOpcode : PredicateControl { bit Gather4 = 0; bits<8> NumExtraArgs = 0; bit Gradients = 0; + bit G16 = 0; bit Coordinates = 1; bit LodOrClampOrMip = 0; bit HasD16 = 0; @@ -47,9 +48,9 @@ def MIMGBaseOpcode : GenericEnum { def MIMGBaseOpcodesTable : GenericTable { let FilterClass = "MIMGBaseOpcode"; let CppTypeName = "MIMGBaseOpcodeInfo"; - let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4", - "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip", - "HasD16"]; + let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", + "Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates", + "LodOrClampOrMip", "HasD16"]; GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode; let PrimaryKey = ["BaseOpcode"]; @@ -117,6 +118,22 @@ def MIMGMIPMappingTable : GenericTable { let PrimaryKeyName = "getMIMGMIPMappingInfo"; } +class MIMGG16Mapping<MIMGBaseOpcode g, MIMGBaseOpcode g16> { + MIMGBaseOpcode G = g; + MIMGBaseOpcode G16 = g16; +} + +def MIMGG16MappingTable : GenericTable { + let FilterClass = "MIMGG16Mapping"; + let CppTypeName = "MIMGG16MappingInfo"; + let Fields = ["G", "G16"]; + GenericEnum TypeOf_G = MIMGBaseOpcode; + GenericEnum TypeOf_G16 = MIMGBaseOpcode; + + let PrimaryKey = ["G"]; + let PrimaryKeyName = "getMIMGG16MappingInfo"; +} + class MIMG_Base <dag outs, string dns = ""> : InstSI <outs, (ins), "", []> { @@ -132,7 +149,6 @@ class MIMG_Base <dag outs, string dns = ""> let DecoderNamespace = dns; let isAsmParserOnly = !if(!eq(dns,""), 1, 0); - let usesCustomInserter = 1; } class MIMG <dag outs, string dns = ""> @@ -238,9 +254,9 @@ class MIMG_NoSampler_gfx10<int op, string opcode, : MIMG_gfx10<op, (outs DataRC:$vdata), dns> { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, - SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, 
(ins D16:$d16), (ins))); - let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe" + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe" #!if(BaseOpcode.HasD16, "$d16", ""); } @@ -251,9 +267,9 @@ class MIMG_NoSampler_nsa_gfx10<int op, string opcode, let InOperandList = !con(AddrIns, (ins SReg_256:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, - SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); - let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe" + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe" #!if(BaseOpcode.HasD16, "$d16", ""); } @@ -331,9 +347,9 @@ class MIMG_Store_gfx10<int op, string opcode, : MIMG_gfx10<op, (outs), dns> { let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, - GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); - let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe" + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe" #!if(BaseOpcode.HasD16, "$d16", ""); } @@ -345,9 +361,9 @@ class MIMG_Store_nsa_gfx10<int op, string opcode, AddrIns, (ins SReg_256:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, - SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); - let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe" + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe" #!if(BaseOpcode.HasD16, "$d16", ""); } @@ -436,8 +452,8 @@ class MIMG_Atomic_gfx10<mimg op, string opcode, let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, - GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"; + GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe); + let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx10<mimg op, string opcode, @@ -452,8 +468,8 @@ class MIMG_Atomic_nsa_gfx10<mimg op, string opcode, AddrIns, (ins SReg_256:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, - SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe)); - let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"; + SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe)); + let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"; } multiclass MIMG_Atomic_Addr_Helper_m <mimg op, string asm, @@ -522,10 +538,10 @@ class MIMG_Sampler_gfx10<int op, string opcode, : MIMG_gfx10<op, (outs DataRC:$vdata), dns> { let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, - GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); let 
AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm" - #"$dlc$glc$slc$r128$tfe$lwe" + #"$dlc$glc$slc$r128$a16$tfe$lwe" #!if(BaseOpcode.HasD16, "$d16", ""); } @@ -536,10 +552,10 @@ class MIMG_Sampler_nsa_gfx10<int op, string opcode, let InOperandList = !con(AddrIns, (ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc, - SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe), + SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm" - #"$dlc$glc$slc$r128$tfe$lwe" + #"$dlc$glc$slc$r128$a16$tfe$lwe" #!if(BaseOpcode.HasD16, "$d16", ""); } @@ -646,10 +662,11 @@ class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample> } multiclass MIMG_Sampler <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0, - bit isGetLod = 0, - string asm = "image_sample"#sample.LowerCaseMod> { + bit isG16 = 0, bit isGetLod = 0, + string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", "")> { def "" : MIMG_Sampler_BaseOpcode<sample> { let HasD16 = !if(isGetLod, 0, 1); + let G16 = isG16; } let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm, @@ -726,76 +743,95 @@ defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">; //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI //} // End let FPAtomic = 1 -defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>; -defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>; -defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>; -defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>; -defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>; -defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>; -defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>; -defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, 
AMDGPUSample_c_d_cl_o>; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>; -defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>; -defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>; -defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>; -defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>; -defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>; -defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>; -defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>; -defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>; -defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>; -defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>; -defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>; -defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>; -defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>; -defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>; -defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>; -defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>; - -defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 1, "image_get_lod">; - -defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>; -defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>; +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>; +defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>; +defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <0x000000a2, AMDGPUSample_d, 0, 1>; +defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <0x000000a3, AMDGPUSample_d_cl, 0, 1>; +defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM 
<0x00000028, AMDGPUSample_c>; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>; +defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <0x000000aa, AMDGPUSample_c_d, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <0x000000ab, AMDGPUSample_c_d_cl, 0, 1>; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>; +defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <0x000000b2, AMDGPUSample_d_o, 0, 1>; +defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <0x000000b3, AMDGPUSample_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>; +defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <0x000000ba, AMDGPUSample_c_d_o, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <0x000000bb, AMDGPUSample_c_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>; +defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>; +defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>; +defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>; +defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, 
AMDGPUSample_lz_o>; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>; + +defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 0, 1, "image_get_lod">; + +defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>; +defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>; +defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <0x000000e8, AMDGPUSample_cd, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <0x000000e9, AMDGPUSample_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <0x000000ea, AMDGPUSample_c_cd, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <0x000000eb, AMDGPUSample_c_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <0x000000ec, AMDGPUSample_cd_o, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <0x000000ed, AMDGPUSample_cd_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <0x000000ee, AMDGPUSample_c_cd_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <0x000000ef, AMDGPUSample_c_cd_cl_o, 0, 1>; //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; +let SubtargetPredicate = HasGFX10_BEncoding in +defm IMAGE_MSAA_LOAD : MIMG_NoSampler <0x00000080, "image_msaa_load", 1>; + /********** ========================================= **********/ /********** Table of dimension-aware image intrinsics **********/ /********** ========================================= **********/ @@ -817,6 +853,11 @@ def ImageDimIntrinsicTable : GenericTable { let PrimaryKeyEarlyOut = 1; } +def getImageDimInstrinsicByBaseOpcode : SearchIndex { + let Table = ImageDimIntrinsicTable; + let Key = ["BaseOpcode", "Dim"]; +} + foreach intr = !listconcat(AMDGPUImageDimIntrinsics, AMDGPUImageDimAtomicIntrinsics) in { def : ImageDimIntrinsicInfo<intr>; @@ -835,3 +876,21 @@ def : MIMGLZMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_LZ_O>; // MIP to NONMIP Optimization Mapping def : MIMGMIPMapping<IMAGE_LOAD_MIP, IMAGE_LOAD>; def : MIMGMIPMapping<IMAGE_STORE_MIP, IMAGE_STORE>; + +// G to G16 Optimization Mapping +def : MIMGG16Mapping<IMAGE_SAMPLE_D, IMAGE_SAMPLE_D_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL, IMAGE_SAMPLE_D_CL_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_D, IMAGE_SAMPLE_C_D_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_CL, IMAGE_SAMPLE_C_D_CL_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_D_O, IMAGE_SAMPLE_D_O_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL_O, IMAGE_SAMPLE_D_CL_O_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_O, IMAGE_SAMPLE_C_D_O_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_CL_O, IMAGE_SAMPLE_C_D_CL_O_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_CD, IMAGE_SAMPLE_CD_G16>; +def : 
MIMGG16Mapping<IMAGE_SAMPLE_CD_CL, IMAGE_SAMPLE_CD_CL_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD, IMAGE_SAMPLE_C_CD_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL, IMAGE_SAMPLE_C_CD_CL_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_CD_O, IMAGE_SAMPLE_CD_O_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL_O, IMAGE_SAMPLE_CD_CL_O_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_O, IMAGE_SAMPLE_C_CD_O_G16>; +def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O, IMAGE_SAMPLE_C_CD_CL_O_G16>; diff --git a/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp b/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp index ed23c8ea814b4..d363baa15507a 100644 --- a/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp @@ -88,15 +88,15 @@ void R600AsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { } } - OutStreamer->EmitIntValue(RsrcReg, 4); - OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | + OutStreamer->emitInt32(RsrcReg); + OutStreamer->emitIntValue(S_NUM_GPRS(MaxGPR + 1) | S_STACK_SIZE(MFI->CFStackSize), 4); - OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); - OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); + OutStreamer->emitInt32(R_02880C_DB_SHADER_CONTROL); + OutStreamer->emitInt32(S_02880C_KILL_ENABLE(killPixel)); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { - OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); - OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4); + OutStreamer->emitInt32(R_0288E8_SQ_LDS_ALLOC); + OutStreamer->emitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4); } } @@ -115,7 +115,7 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) { EmitProgramInfoR600(MF); - EmitFunctionBody(); + emitFunctionBody(); if (isVerbose()) { MCSectionELF *CommentSection = diff --git a/llvm/lib/Target/AMDGPU/R600AsmPrinter.h b/llvm/lib/Target/AMDGPU/R600AsmPrinter.h index 0da9526d716ea..552d01f81b66c 100644 --- a/llvm/lib/Target/AMDGPU/R600AsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/R600AsmPrinter.h @@ -26,7 +26,7 @@ public: StringRef getPassName() const override; bool runOnMachineFunction(MachineFunction &MF) override; /// Implemented in AMDGPUMCInstLower.cpp - void EmitInstruction(const MachineInstr *MI) override; + void emitInstruction(const MachineInstr *MI) override; /// Lower the specified LLVM Constant to an MCExpr. /// The AsmPrinter::lowerConstantof does not know how to lower /// addrspacecast, therefore they should be lowered by this function. 
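The MIMGG16Mapping records and the MIMGG16MappingTable defined just above are TableGen GenericTable/SearchIndex definitions keyed on the full-precision base opcode, which is what lets the backend swap an image-sample instruction for its _G16 twin when the gradients only need 16 bits. Purely as a reading aid, here is a minimal, self-contained C++ sketch of the kind of sorted-table lookup such a generated accessor performs; the enum values, struct layout, and lookupG16 helper are placeholders invented for this example, not the generated AMDGPU code.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <iterator>

// Illustrative stand-ins for the TableGen-generated enum and record type.
enum BaseOpcode : uint16_t {
  IMAGE_SAMPLE_D,
  IMAGE_SAMPLE_D_CL,
  IMAGE_SAMPLE_C_D,
  IMAGE_SAMPLE_D_G16,
  IMAGE_SAMPLE_D_CL_G16,
  IMAGE_SAMPLE_C_D_G16
};

struct MIMGG16MappingInfo {
  BaseOpcode G;   // primary key: full-precision gradient opcode
  BaseOpcode G16; // replacement with 16-bit gradients
};

// Rows kept sorted by the primary key, as a GenericTable emits them.
static const MIMGG16MappingInfo G16MappingTable[] = {
    {IMAGE_SAMPLE_D, IMAGE_SAMPLE_D_G16},
    {IMAGE_SAMPLE_D_CL, IMAGE_SAMPLE_D_CL_G16},
    {IMAGE_SAMPLE_C_D, IMAGE_SAMPLE_C_D_G16},
};

// Binary-search lookup in the spirit of the generated getMIMGG16MappingInfo().
static const MIMGG16MappingInfo *lookupG16(BaseOpcode G) {
  auto *It = std::lower_bound(
      std::begin(G16MappingTable), std::end(G16MappingTable), G,
      [](const MIMGG16MappingInfo &Row, BaseOpcode Key) { return Row.G < Key; });
  if (It != std::end(G16MappingTable) && It->G == G)
    return It;
  return nullptr; // no _G16 form exists for this opcode
}

int main() {
  const MIMGG16MappingInfo *M = lookupG16(IMAGE_SAMPLE_C_D);
  std::printf("found mapping: %d\n", M != nullptr);
}

In the actual backend the name of the lookup function comes from the table's PrimaryKeyName field, so callers simply invoke getMIMGG16MappingInfo on the base opcode, in the same way the existing MIMGLZMapping and MIMGMIPMapping tables above are queried.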
diff --git a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index e4160ac11c863..8124df68f6886 100644 --- a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -159,8 +159,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { } void CFStack::updateMaxStackSize() { - unsigned CurrentStackSize = - CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4); + unsigned CurrentStackSize = CurrentEntries + divideCeil(CurrentSubEntries, 4); MaxStackSize = std::max(CurrentStackSize, MaxStackSize); } @@ -308,7 +307,7 @@ private: DstMI = Reg; else DstMI = TRI->getMatchingSuperReg(Reg, - AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)), + R600RegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)), &R600::R600_Reg128RegClass); } if (MO.isUse()) { @@ -317,7 +316,7 @@ private: SrcMI = Reg; else SrcMI = TRI->getMatchingSuperReg(Reg, - AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)), + R600RegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)), &R600::R600_Reg128RegClass); } } diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index fd75c41040e16..5f682d86d26e2 100644 --- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -219,13 +219,13 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { } } if (IsReduction) { - unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan); + unsigned SubRegIndex = R600RegisterInfo::getSubRegFromChannel(Chan); Src0 = TRI.getSubReg(Src0, SubRegIndex); Src1 = TRI.getSubReg(Src1, SubRegIndex); } else if (IsCube) { static const int CubeSrcSwz[] = {2, 2, 0, 1}; - unsigned SubRegIndex0 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[Chan]); - unsigned SubRegIndex1 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[3 - Chan]); + unsigned SubRegIndex0 = R600RegisterInfo::getSubRegFromChannel(CubeSrcSwz[Chan]); + unsigned SubRegIndex1 = R600RegisterInfo::getSubRegFromChannel(CubeSrcSwz[3 - Chan]); Src1 = TRI.getSubReg(Src0, SubRegIndex1); Src0 = TRI.getSubReg(Src0, SubRegIndex0); } @@ -234,7 +234,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { bool Mask = false; bool NotLast = true; if (IsCube) { - unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan); + unsigned SubRegIndex = R600RegisterInfo::getSubRegFromChannel(Chan); DstReg = TRI.getSubReg(DstReg, SubRegIndex); } else { // Mask the write if the original instruction does not write to diff --git a/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp b/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp index d9aa9ebe878d8..c568a4aa61c3e 100644 --- a/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp @@ -18,9 +18,8 @@ using namespace llvm; R600FrameLowering::~R600FrameLowering() = default; /// \returns The number of registers allocated for \p FI. -int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF, - int FI, - unsigned &FrameReg) const { +int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + Register &FrameReg) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const R600RegisterInfo *RI = MF.getSubtarget<R600Subtarget>().getRegisterInfo(); @@ -35,15 +34,15 @@ int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF, int UpperBound = FI == -1 ? 
MFI.getNumObjects() : FI; for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) { - OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i)); + OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlign(i)); OffsetBytes += MFI.getObjectSize(i); // Each register holds 4 bytes, so we must always align the offset to at // least 4 bytes, so that 2 frame objects won't share the same register. - OffsetBytes = alignTo(OffsetBytes, 4); + OffsetBytes = alignTo(OffsetBytes, Align(4)); } if (FI != -1) - OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI)); + OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlign(FI)); return OffsetBytes / (getStackWidth(MF) * 4); } diff --git a/llvm/lib/Target/AMDGPU/R600FrameLowering.h b/llvm/lib/Target/AMDGPU/R600FrameLowering.h index 283e4d1935ea1..b877ecd298290 100644 --- a/llvm/lib/Target/AMDGPU/R600FrameLowering.h +++ b/llvm/lib/Target/AMDGPU/R600FrameLowering.h @@ -16,7 +16,7 @@ namespace llvm { class R600FrameLowering : public AMDGPUFrameLowering { public: R600FrameLowering(StackDirection D, Align StackAl, int LAO, - Align TransAl = Align::None()) + Align TransAl = Align(1)) : AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} ~R600FrameLowering() override; @@ -25,7 +25,7 @@ public: void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override {} int getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const override; + Register &FrameReg) const override; bool hasFP(const MachineFunction &MF) const override { return false; diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 1b1f5f9a404a7..dc2e73e1f94e0 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -615,21 +615,27 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const return LowerImplicitParameter(DAG, VT, DL, 8); case Intrinsic::r600_read_tgid_x: + case Intrinsic::amdgcn_workgroup_id_x: return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, R600::T1_X, VT); case Intrinsic::r600_read_tgid_y: + case Intrinsic::amdgcn_workgroup_id_y: return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, R600::T1_Y, VT); case Intrinsic::r600_read_tgid_z: + case Intrinsic::amdgcn_workgroup_id_z: return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, R600::T1_Z, VT); case Intrinsic::r600_read_tidig_x: + case Intrinsic::amdgcn_workitem_id_x: return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, R600::T0_X, VT); case Intrinsic::r600_read_tidig_y: + case Intrinsic::amdgcn_workitem_id_y: return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, R600::T0_Y, VT); case Intrinsic::r600_read_tidig_z: + case Intrinsic::amdgcn_workitem_id_z: return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, R600::T0_Z, VT); @@ -699,9 +705,8 @@ SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, SmallVector<SDValue, 8> Args; for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) { - Args.push_back(DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, - DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout())))); + Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, + DAG.getVectorIdxConstant(i, DL))); } return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); @@ -1260,10 +1265,11 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return scalarizeVectorStore(StoreNode, DAG); } - unsigned Align = 
StoreNode->getAlignment(); - if (Align < MemVT.getStoreSize() && - !allowsMisalignedMemoryAccesses( - MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) { + Align Alignment = StoreNode->getAlign(); + if (Alignment < MemVT.getStoreSize() && + !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(), + StoreNode->getMemOperand()->getFlags(), + nullptr)) { return expandUnalignedStore(StoreNode, DAG); } @@ -1543,7 +1549,7 @@ SDValue R600TargetLowering::lowerFrameIndex(SDValue Op, FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FIN->getIndex(); - unsigned IgnoredFrameReg; + Register IgnoredFrameReg; unsigned Offset = TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 346296c773775..088cf16d8ed2c 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -77,7 +77,7 @@ void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (VectorComponents > 0) { for (unsigned I = 0; I < VectorComponents; I++) { - unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(I); + unsigned SubRegIndex = R600RegisterInfo::getSubRegFromChannel(I); buildDefaultInstruction(MBB, MI, R600::MOV, RI.getSubReg(DestReg, SubRegIndex), RI.getSubReg(SrcReg, SubRegIndex)) @@ -541,7 +541,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, std::vector<std::vector<std::pair<int, unsigned>>> IGSrcs; ValidSwizzle.clear(); - unsigned ConstCount = 0; + unsigned ConstCount; BankSwizzle TransBS = ALU_VEC_012_SCL_210; for (unsigned i = 0, e = IG.size(); i < e; ++i) { IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount)); @@ -676,7 +676,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const { - // Most of the following comes from the ARM implementation of AnalyzeBranch + // Most of the following comes from the ARM implementation of analyzeBranch // If the block has no terminators, it just falls into the block after it. 
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); @@ -1224,7 +1224,7 @@ int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); const R600FrameLowering *TFL = ST.getFrameLowering(); - unsigned IgnoredFrameReg; + Register IgnoredFrameReg; Offset = TFL->getFrameIndexReference(MF, -1, IgnoredFrameReg); return getIndirectIndexBegin(MF) + Offset; diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td index cbdf0de44f873..2cc21364c4397 100644 --- a/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -1006,7 +1006,7 @@ class MULADD_Common <bits<5> inst> : R600_3OP < class MULADD_IEEE_Common <bits<5> inst> : R600_3OP < inst, "MULADD_IEEE", - [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))] + [(set f32:$dst, (any_fmad f32:$src0, f32:$src1, f32:$src2))] >; class FMA_Common <bits<5> inst> : R600_3OP < @@ -1233,6 +1233,11 @@ def : R600Pat< def : RcpPat<recip_ieee, f32>; } +class SqrtPat<Instruction RsqInst, Instruction RecipInst> : R600Pat < + (fsqrt f32:$src), + (RecipInst (RsqInst $src)) +>; + //===----------------------------------------------------------------------===// // R600 / R700 Instructions //===----------------------------------------------------------------------===// @@ -1272,8 +1277,8 @@ let Predicates = [isR600] in { defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>; def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>; - def : R600Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; def : RsqPat<RECIPSQRT_IEEE_r600, f32>; + def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>; def R600_ExportSwz : ExportSwzInst { let Word1{20-17} = 0; // BURST_COUNT diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index cec7f563f4800..b0620663a2300 100644 --- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -56,9 +56,8 @@ using namespace llvm; #define DEBUG_TYPE "vec-merger" -static bool isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { - assert(MRI.isSSA()); - if (Register::isPhysicalRegister(Reg)) +static bool isImplicitlyDef(MachineRegisterInfo &MRI, Register Reg) { + if (Reg.isPhysical()) return false; const MachineInstr *MI = MRI.getUniqueVRegDef(Reg); return MI && MI->isImplicitDef(); @@ -69,8 +68,8 @@ namespace { class RegSeqInfo { public: MachineInstr *Instr; - DenseMap<unsigned, unsigned> RegToChan; - std::vector<unsigned> UndefReg; + DenseMap<Register, unsigned> RegToChan; + std::vector<Register> UndefReg; RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) { assert(MI->getOpcode() == R600::REG_SEQUENCE); @@ -102,7 +101,7 @@ private: InstructionSetMap PreviousRegSeqByUndefCount; bool canSwizzle(const MachineInstr &MI) const; - bool areAllUsesSwizzeable(unsigned Reg) const; + bool areAllUsesSwizzeable(Register Reg) const; void SwizzleInput(MachineInstr &, const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const; bool tryMergeVector(const RegSeqInfo *Untouched, RegSeqInfo *ToMerge, @@ -130,6 +129,11 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties() + .set(MachineFunctionProperties::Property::IsSSA); + } + StringRef getPassName() const override { return "R600 Vector Registers Merge Pass"; } @@ -165,9 +169,9 @@ bool 
R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned>> &Remap) const { unsigned CurrentUndexIdx = 0; - for (DenseMap<unsigned, unsigned>::iterator It = ToMerge->RegToChan.begin(), + for (DenseMap<Register, unsigned>::iterator It = ToMerge->RegToChan.begin(), E = ToMerge->RegToChan.end(); It != E; ++It) { - DenseMap<unsigned, unsigned>::const_iterator PosInUntouched = + DenseMap<Register, unsigned>::const_iterator PosInUntouched = Untouched->RegToChan.find((*It).first); if (PosInUntouched != Untouched->RegToChan.end()) { Remap.push_back(std::pair<unsigned, unsigned> @@ -203,9 +207,9 @@ MachineInstr *R600VectorRegMerger::RebuildVector( DebugLoc DL = Pos->getDebugLoc(); Register SrcVec = BaseRSI->Instr->getOperand(0).getReg(); - DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan; - std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg; - for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(), + DenseMap<Register, unsigned> UpdatedRegToChan = BaseRSI->RegToChan; + std::vector<Register> UpdatedUndef = BaseRSI->UndefReg; + for (DenseMap<Register, unsigned>::iterator It = RSI->RegToChan.begin(), E = RSI->RegToChan.end(); It != E; ++It) { Register DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass); unsigned SubReg = (*It).first; @@ -218,7 +222,7 @@ MachineInstr *R600VectorRegMerger::RebuildVector( .addReg(SubReg) .addImm(Chan); UpdatedRegToChan[SubReg] = Chan; - std::vector<unsigned>::iterator ChanPos = llvm::find(UpdatedUndef, Chan); + std::vector<Register>::iterator ChanPos = llvm::find(UpdatedUndef, Chan); if (ChanPos != UpdatedUndef.end()) UpdatedUndef.erase(ChanPos); assert(!is_contained(UpdatedUndef, Chan) && @@ -279,7 +283,7 @@ void R600VectorRegMerger::SwizzleInput(MachineInstr &MI, } } -bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const { +bool R600VectorRegMerger::areAllUsesSwizzeable(Register Reg) const { for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), E = MRI->use_instr_end(); It != E; ++It) { if (!canSwizzle(*It)) @@ -322,7 +326,7 @@ bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI, } void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) { - for (DenseMap<unsigned, unsigned>::const_iterator + for (DenseMap<Register, unsigned>::const_iterator It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) { PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr); } diff --git a/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp index ef12c1d245941..78ef71cdf8e3b 100644 --- a/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -20,14 +20,21 @@ using namespace llvm; -R600RegisterInfo::R600RegisterInfo() : R600GenRegisterInfo(0) { - RCW.RegWeight = 0; - RCW.WeightLimit = 0; -} - #define GET_REGINFO_TARGET_DESC #include "R600GenRegisterInfo.inc" +unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) { + static const uint16_t SubRegFromChannelTable[] = { + R600::sub0, R600::sub1, R600::sub2, R600::sub3, + R600::sub4, R600::sub5, R600::sub6, R600::sub7, + R600::sub8, R600::sub9, R600::sub10, R600::sub11, + R600::sub12, R600::sub13, R600::sub14, R600::sub15 + }; + + assert(Channel < array_lengthof(SubRegFromChannelTable)); + return SubRegFromChannelTable[Channel]; +} + BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); @@ -87,11 +94,6 @@ const 
TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass( } } -const RegClassWeight &R600RegisterInfo::getRegClassWeight( - const TargetRegisterClass *RC) const { - return RCW; -} - bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { assert(!Register::isVirtualRegister(Reg)); diff --git a/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/llvm/lib/Target/AMDGPU/R600RegisterInfo.h index 9378b70ca5807..06981c4cf9c5e 100644 --- a/llvm/lib/Target/AMDGPU/R600RegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/R600RegisterInfo.h @@ -20,9 +20,11 @@ namespace llvm { struct R600RegisterInfo final : public R600GenRegisterInfo { - RegClassWeight RCW; + R600RegisterInfo() : R600GenRegisterInfo(0) {} - R600RegisterInfo(); + /// \returns the sub reg enum value for the given \p Channel + /// (e.g. getSubRegFromChannel(0) -> R600::sub0) + static unsigned getSubRegFromChannel(unsigned Channel); BitVector getReservedRegs(const MachineFunction &MF) const override; const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; @@ -37,8 +39,9 @@ struct R600RegisterInfo final : public R600GenRegisterInfo { /// CFGStructurizer const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const; - const RegClassWeight & - getRegClassWeight(const TargetRegisterClass *RC) const override; + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override { + return false; + } // \returns true if \p Reg can be defined in one ALU clause and used in // another. diff --git a/llvm/lib/Target/AMDGPU/R600RegisterInfo.td b/llvm/lib/Target/AMDGPU/R600RegisterInfo.td index 02164b74a01bd..fdff7541edeca 100644 --- a/llvm/lib/Target/AMDGPU/R600RegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/R600RegisterInfo.td @@ -150,13 +150,16 @@ def AR_X : R600Reg<"AR.x", 0>; def INDIRECT_BASE_ADDR : R600Reg <"INDIRECT_BASE_ADDR", 0>; def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "ArrayBase%u", 448, 480))>; + (add (sequence "ArrayBase%u", 448, 480))> { + let Weight = 0; +} // special registers for ALU src operands // const buffer reference, SRCx_SEL contains index def ALU_CONST : R600Reg<"CBuf", 0>; // interpolation param reference, SRCx_SEL contains index def ALU_PARAM : R600Reg<"Param", 0>; +let Weight = 0 in { let isAllocatable = 0 in { def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>; @@ -251,3 +254,4 @@ def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32, i64, f64], 64, def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, (add V01_X, V01_Y, V01_Z, V01_W, V23_X, V23_Y, V23_Z, V23_W)>; +} // End let Weight = 0 diff --git a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp index ee011286b8ff3..90e48c63b5dca 100644 --- a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp +++ b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp @@ -111,10 +111,6 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) { unsigned ActiveLanes = TII->isGather4(Opcode) ? 4 : countPopulation(dmask); - // Subreg indices are counted from 1 - // When D16 then we want next whole VGPR after write data. - static_assert(AMDGPU::sub0 == 1 && AMDGPU::sub4 == 5, "Subreg indices different from expected"); - bool Packed = !ST.hasUnpackedD16VMem(); unsigned InitIdx = @@ -137,7 +133,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) { // all the result registers to 0, otherwise just the error indication // register (VGPRn+1) unsigned SizeLeft = ST.usePRTStrictNull() ? 
InitIdx : 1; - unsigned CurrIdx = ST.usePRTStrictNull() ? 1 : InitIdx; + unsigned CurrIdx = ST.usePRTStrictNull() ? 0 : (InitIdx - 1); if (DstSize == 1) { // In this case we can just initialize the result directly @@ -158,7 +154,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) { BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst) .addReg(PrevDst) .addReg(SubReg) - .addImm(CurrIdx); + .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx)); PrevDst = NewDst; } diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 27320472cacb3..3c41bf1fef5e9 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -153,7 +153,7 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) { Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else, { IntMask, IntMask }); IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break, - { IntMask, IntMask }); + { IntMask }); Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask }); EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask }); } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 23ef56afc39c9..4f7d255eb450a 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -333,7 +333,9 @@ enum Id { // HwRegCode, (6) [5:0] ID_FLAT_SCR_HI = 21, ID_XNACK_MASK = 22, ID_POPS_PACKER = 25, - ID_SYMBOLIC_LAST_ = 26, + ID_SHADER_CYCLES = 29, + ID_SYMBOLIC_FIRST_GFX1030_ = ID_SHADER_CYCLES, + ID_SYMBOLIC_LAST_ = 30, ID_SHIFT_ = 0, ID_WIDTH_ = 6, ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) @@ -366,6 +368,28 @@ enum Width : unsigned { WIDTH_DEFAULT_ = WIDTH_M1_DEFAULT_ + 1, }; +enum ModeRegisterMasks : uint32_t { + FP_ROUND_MASK = 0xf << 0, // Bits 0..3 + FP_DENORM_MASK = 0xf << 4, // Bits 4..7 + DX10_CLAMP_MASK = 1 << 8, + IEEE_MODE_MASK = 1 << 9, + LOD_CLAMP_MASK = 1 << 10, + DEBUG_MASK = 1 << 11, + + // EXCP_EN fields. + EXCP_EN_INVALID_MASK = 1 << 12, + EXCP_EN_INPUT_DENORMAL_MASK = 1 << 13, + EXCP_EN_FLOAT_DIV0_MASK = 1 << 14, + EXCP_EN_OVERFLOW_MASK = 1 << 15, + EXCP_EN_UNDERFLOW_MASK = 1 << 16, + EXCP_EN_INEXACT_MASK = 1 << 17, + EXCP_EN_INT_DIV0_MASK = 1 << 18, + + GPR_IDX_EN_MASK = 1 << 27, + VSKIP_MASK = 1 << 28, + CSP_MASK = 0x7u << 29 // Bits 29..31 +}; + } // namespace Hwreg namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32. diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 914d2a5ef1485..ef64c5674bd1c 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -587,6 +587,11 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, } bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { + // Only need to run this in SelectionDAG path. 
+ if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::Selected)) + return false; + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); MRI = &MF.getRegInfo(); TRI = ST.getRegisterInfo(); @@ -761,6 +766,7 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { bool AllAGPRUses = true; SetVector<const MachineInstr *> worklist; SmallSet<const MachineInstr *, 4> Visited; + SetVector<MachineInstr *> PHIOperands; worklist.insert(&MI); Visited.insert(&MI); while (!worklist.empty()) { @@ -805,6 +811,11 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) { LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI); MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0)); + for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { + MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg()); + if (DefMI && DefMI->isPHI()) + PHIOperands.insert(DefMI); + } } bool hasVGPRInput = false; @@ -824,8 +835,22 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { } else if (Def->isCopy() && TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) { - hasVGPRInput = true; - break; + Register SrcReg = Def->getOperand(1).getReg(); + MachineInstr *SrcDef = MRI->getVRegDef(SrcReg); + unsigned SMovOp; + int64_t Imm; + if (!isSafeToFoldImmIntoCopy(Def, SrcDef, TII, SMovOp, Imm)) { + hasVGPRInput = true; + break; + } else { + // Formally, if we did not do this right away + // it would be done on the next iteration of the + // runOnMachineFunction main loop. But why not if we can? + MachineFunction *MF = MI.getParent()->getParent(); + Def->getOperand(1).ChangeToImmediate(Imm); + Def->addImplicitDefUseOperands(*MF); + Def->setDesc(TII->get(SMovOp)); + } } } @@ -840,4 +865,8 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { TII->legalizeOperands(MI, MDT); } + // Propagate register class back to PHI operands which are PHI themselves. + while (!PHIOperands.empty()) { + processPHINode(*PHIOperands.pop_back_val()); + } } diff --git a/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp index a0119297b112f..8e3402b537b3b 100644 --- a/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp @@ -217,6 +217,11 @@ static bool fixupGlobalSaddr(MachineBasicBlock &MBB, } bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) { + // Only need to run this in SelectionDAG path. + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::Selected)) + return false; + if (skipFunction(MF.getFunction())) return false; diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 2ff8baf29394f..ffcf4c30bc70d 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -282,6 +282,9 @@ static bool updateOperand(FoldCandidate &Fold, assert(!Fold.needsShrink() && "not handled"); if (Fold.isImm()) { + // FIXME: ChangeToImmediate should probably clear the subreg flags. It's + // reinterpreted as TargetFlags. + Old.setSubReg(0); Old.ChangeToImmediate(Fold.ImmToFold); return true; } @@ -612,19 +615,26 @@ void SIFoldOperands::foldOperand( if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) { // Sanity check that this is a stack access. // FIXME: Should probably use stack pseudos before frame lowering. 
- MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset); - if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() && - SOff->getReg() != MFI->getStackPtrOffsetReg())) - return; if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() != MFI->getScratchRSrcReg()) return; + // Ensure this is either relative to the current frame or the current wave. + MachineOperand &SOff = + *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset); + if ((!SOff.isReg() || SOff.getReg() != MFI->getStackPtrOffsetReg()) && + (!SOff.isImm() || SOff.getImm() != 0)) + return; + // A frame index will resolve to a positive constant, so it should always be // safe to fold the addressing mode, even pre-GFX9. UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex()); - SOff->setReg(MFI->getStackPtrOffsetReg()); + + // If this is relative to the current wave, update it to be relative to the + // current frame. + if (SOff.isImm()) + SOff.ChangeToRegister(MFI->getStackPtrOffsetReg(), false); return; } @@ -907,6 +917,21 @@ static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, case AMDGPU::S_XOR_B32: Result = LHS ^ RHS; return true; + case AMDGPU::S_XNOR_B32: + Result = ~(LHS ^ RHS); + return true; + case AMDGPU::S_NAND_B32: + Result = ~(LHS & RHS); + return true; + case AMDGPU::S_NOR_B32: + Result = ~(LHS | RHS); + return true; + case AMDGPU::S_ANDN2_B32: + Result = LHS & ~RHS; + return true; + case AMDGPU::S_ORN2_B32: + Result = LHS | ~RHS; + return true; case AMDGPU::V_LSHL_B32_e64: case AMDGPU::V_LSHL_B32_e32: case AMDGPU::S_LSHL_B32: @@ -1007,10 +1032,16 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, if (!Src0->isImm() && !Src1->isImm()) return false; - if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) { + if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32 || + MI->getOpcode() == AMDGPU::V_LSHL_ADD_U32 || + MI->getOpcode() == AMDGPU::V_AND_OR_B32) { if (Src0->isImm() && Src0->getImm() == 0) { // v_lshl_or_b32 0, X, Y -> copy Y // v_lshl_or_b32 0, X, K -> v_mov_b32 K + // v_lshl_add_b32 0, X, Y -> copy Y + // v_lshl_add_b32 0, X, K -> v_mov_b32 K + // v_and_or_b32 0, X, Y -> copy Y + // v_and_or_b32 0, X, K -> v_mov_b32 K bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg(); MI->RemoveOperand(Src1Idx); MI->RemoveOperand(Src0Idx); @@ -1381,8 +1412,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const { case AMDGPU::V_MUL_F32_e64: case AMDGPU::V_MUL_F16_e64: { // If output denormals are enabled, omod is ignored. - if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32Denormals) || - (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16Denormals)) + if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32OutputDenormals) || + (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16OutputDenormals)) return std::make_pair(nullptr, SIOutMods::NONE); const MachineOperand *RegOp = nullptr; @@ -1411,8 +1442,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const { case AMDGPU::V_ADD_F32_e64: case AMDGPU::V_ADD_F16_e64: { // If output denormals are enabled, omod is ignored. 
- if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32Denormals) || - (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16Denormals)) + if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32OutputDenormals) || + (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16OutputDenormals)) return std::make_pair(nullptr, SIOutMods::NONE); // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 8364665dda04c..a2e802009d098 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -24,18 +24,6 @@ using namespace llvm; #define DEBUG_TYPE "frame-info" -static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST, - const MachineFunction &MF) { - return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), - ST.getMaxNumSGPRs(MF) / 4); -} - -static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST, - const MachineFunction &MF) { - return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), - ST.getMaxNumSGPRs(MF)); -} - // Find a scratch register that we can use at the start of the prologue to // re-align the stack pointer. We avoid using callee-save registers since they // may appear to be free when this is called from canUseAsPrologue (during @@ -47,10 +35,10 @@ static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST, // but we would then have to make sure that we were in fact saving at least one // callee-save register in the prologue, which is additional complexity that // doesn't seem worth the benefit. -static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, - LivePhysRegs &LiveRegs, - const TargetRegisterClass &RC, - bool Unused = false) { +static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, + LivePhysRegs &LiveRegs, + const TargetRegisterClass &RC, + bool Unused = false) { // Mark callee saved registers as used so we will not choose them. const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); for (unsigned i = 0; CSRegs[i]; ++i) @@ -59,12 +47,12 @@ static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, if (Unused) { // We are looking for a register that can be used throughout the entire // function, so any use is unacceptable. - for (unsigned Reg : RC) { + for (MCRegister Reg : RC) { if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg)) return Reg; } } else { - for (unsigned Reg : RC) { + for (MCRegister Reg : RC) { if (LiveRegs.available(MRI, Reg)) return Reg; } @@ -76,14 +64,67 @@ static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, if (!Unused) report_fatal_error("failed to find free scratch register"); - return AMDGPU::NoRegister; + return MCRegister(); } -static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) { - LivePhysRegs LiveRegs; - LiveRegs.init(*MRI.getTargetRegisterInfo()); - return findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true); +static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF, + LivePhysRegs &LiveRegs, + Register &TempSGPR, + Optional<int> &FrameIndex, + bool IsFP) { + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + +#ifndef NDEBUG + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); +#endif + + // We need to save and restore the current FP/BP. + + // 1: If there is already a VGPR with free lanes, use it. 
We + // may already have to pay the penalty for spilling a CSR VGPR. + if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) { + int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr, + TargetStackID::SGPRSpill); + + if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) + llvm_unreachable("allocate SGPR spill should have worked"); + + FrameIndex = NewFI; + + LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); + dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to " + << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane + << '\n'); + return; + } + + // 2: Next, try to save the FP/BP in an unused SGPR. + TempSGPR = findScratchNonCalleeSaveRegister( + MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true); + + if (!TempSGPR) { + int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr, + TargetStackID::SGPRSpill); + + if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) { + // 3: There's no free lane to spill, and no free register to save FP/BP, + // so we're forced to spill another VGPR to use for the spill. + FrameIndex = NewFI; + } else { + // 4: If all else fails, spill the FP/BP to memory. + FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4)); + } + + LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); + dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to " + << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane + << '\n';); + } else { + LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to " + << printReg(TempSGPR, TRI) << '\n'); + } } // We need to specially emit stack operations here because a different frame @@ -91,8 +132,8 @@ static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) { // use. static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const SIInstrInfo *TII, unsigned SpillReg, - unsigned ScratchRsrcReg, unsigned SPReg, int FI) { + const SIInstrInfo *TII, Register SpillReg, + Register ScratchRsrcReg, Register SPReg, int FI) { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); @@ -100,7 +141,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4, - MFI.getObjectAlignment(FI)); + MFI.getObjectAlign(FI)); if (isUInt<12>(Offset)) { BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET)) @@ -139,15 +180,15 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const SIInstrInfo *TII, unsigned SpillReg, - unsigned ScratchRsrcReg, unsigned SPReg, int FI) { + const SIInstrInfo *TII, Register SpillReg, + Register ScratchRsrcReg, Register SPReg, int FI) { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); int64_t Offset = MFI.getObjectOffset(FI); MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4, - MFI.getObjectAlignment(FI)); + MFI.getObjectAlign(FI)); if (isUInt<12>(Offset)) { BuildMI(MBB, I, DebugLoc(), @@ -184,11 +225,13 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addMemOperand(MMO); } -void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, - MachineFunction &MF, - MachineBasicBlock &MBB) const { +// Emit flat scratch setup code, assuming 
`MFI->hasFlatScratchInit()` +void SIFrameLowering::emitEntryFunctionFlatScratchInit( + MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + const DebugLoc &DL, Register ScratchWaveOffsetReg) const { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo* TRI = &TII->getRegisterInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // We don't need this if we only have spills since there is no user facing @@ -201,11 +244,6 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, // pointer. Because we only detect if flat instructions are used at all, // this will be used more often than necessary on VI. - // Debug location must be unknown since the first debug location is used to - // determine the end of the prologue. - DebugLoc DL; - MachineBasicBlock::iterator I = MBB.begin(); - Register FlatScratchInitReg = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); @@ -216,8 +254,6 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); - unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); - // Do a 64-bit pointer add. if (ST.flatScratchIsPointer()) { if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { @@ -266,19 +302,22 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, .addImm(8); } -unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( - const GCNSubtarget &ST, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, - MachineFunction &MF) const { +// Shift down registers reserved for the scratch RSRC. +Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( + MachineFunction &MF) const { + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + assert(MFI->isEntryFunction()); + + Register ScratchRsrcReg = MFI->getScratchRSrcReg(); - // We need to insert initialization of the scratch resource descriptor. - unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); - if (ScratchRsrcReg == AMDGPU::NoRegister || - !MRI.isPhysRegUsed(ScratchRsrcReg)) - return AMDGPU::NoRegister; + if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg)) + return Register(); if (ST.hasSGPRInitBug() || ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF)) @@ -293,18 +332,19 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( // cannot do this for the resources required for scratch access. For now we // skip over user SGPRs and may leave unused holes. - // We find the resource first because it has an alignment requirement. - unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; - ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF); + ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF); AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded)); // Skip the last N reserved elements because they should have already been // reserved for VCC etc. + Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); for (MCPhysReg Reg : AllSGPR128s) { // Pick the first unallocated one. 
Make sure we don't clobber the other - // reserved input we needed. - if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) { + // reserved input we needed. Also for PAL, make sure we don't clobber + // the GIT pointer passed in SGPR0 or SGPR8. + if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { MRI.replaceRegWith(ScratchRsrcReg, Reg); MFI->setScratchRSrcReg(Reg); return Reg; @@ -314,231 +354,138 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( return ScratchRsrcReg; } -// Shift down registers reserved for the scratch wave offset. -std::pair<unsigned, bool> -SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( - const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, MachineFunction &MF) const { - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); - - assert(MFI->isEntryFunction()); - - // No replacement necessary. - if (ScratchWaveOffsetReg == AMDGPU::NoRegister || - (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) { - return std::make_pair(AMDGPU::NoRegister, false); - } - - if (ST.hasSGPRInitBug()) - return std::make_pair(ScratchWaveOffsetReg, false); - - unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); - - ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF); - if (NumPreloaded > AllSGPRs.size()) - return std::make_pair(ScratchWaveOffsetReg, false); - - AllSGPRs = AllSGPRs.slice(NumPreloaded); - - // We need to drop register from the end of the list that we cannot use - // for the scratch wave offset. - // + 2 s102 and s103 do not exist on VI. - // + 2 for vcc - // + 2 for xnack_mask - // + 2 for flat_scratch - // + 4 for registers reserved for scratch resource register - // + 1 for register reserved for scratch wave offset. (By exluding this - // register from the list to consider, it means that when this - // register is being used for the scratch wave offset and there - // are no other free SGPRs, then the value will stay in this register. - // + 1 if stack pointer is used. - // ---- - // 13 (+1) - unsigned ReservedRegCount = 13; - - if (AllSGPRs.size() < ReservedRegCount) - return std::make_pair(ScratchWaveOffsetReg, false); - - bool HandledScratchWaveOffsetReg = - ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); - bool FPAdjusted = false; - - for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) { - // Pick the first unallocated SGPR. Be careful not to pick an alias of the - // scratch descriptor, since we haven’t added its uses yet. - if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) { - if (!HandledScratchWaveOffsetReg) { - HandledScratchWaveOffsetReg = true; - - MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); - if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) { - assert(!hasFP(MF)); - MFI->setStackPtrOffsetReg(Reg); - } - - MFI->setScratchWaveOffsetReg(Reg); - MFI->setFrameOffsetReg(Reg); - ScratchWaveOffsetReg = Reg; - FPAdjusted = true; - break; - } - } - } - - return std::make_pair(ScratchWaveOffsetReg, FPAdjusted); -} - void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - - // If we only have SGPR spills, we won't actually be using scratch memory - // since these spill to VGPRs. 
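A side note on the comment above, not part of this change: "these spill to VGPRs" means each SGPR value is parked in one lane of a VGPR via v_writelane_b32 and read back via v_readlane_b32 (the same instructions used later in this file for the FP/BP spills), so SGPR-only spills never need a scratch buffer. A minimal host-side model of that lane round-trip, with hypothetical names, illustration only:

#include <array>
#include <cstdint>

// One VGPR holds a separate 32-bit value per lane of the wave.
using VGPR = std::array<uint32_t, 64>;

// Rough models of v_writelane_b32 / v_readlane_b32.
static void writeLane(VGPR &V, uint32_t SGPRVal, unsigned Lane) { V[Lane] = SGPRVal; }
static uint32_t readLane(const VGPR &V, unsigned Lane) { return V[Lane]; }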
- // - // FIXME: We should be cleaning up these unused SGPR spill frame indices - // somewhere. - - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const Function &F = MF.getFunction(); - - // We need to do the replacement of the private segment buffer and wave offset - // register even if there are no stack objects. There could be stores to undef - // or a constant without an associated object. + // FIXME: If we only have SGPR spills, we won't actually be using scratch + // memory since these spill to VGPRs. We should be cleaning up these unused + // SGPR spill frame indices somewhere. // FIXME: We still have implicit uses on SGPR spill instructions in case they // need to spill to vector memory. It's likely that will not happen, but at // this point it appears we need the setup. This part of the prolog should be // emitted after frame indices are eliminated. - if (MFI->hasFlatScratchInit()) - emitFlatScratchInit(ST, MF, MBB); + // FIXME: Remove all of the isPhysRegUsed checks - unsigned ScratchRsrcReg - = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function &F = MF.getFunction(); - unsigned ScratchWaveOffsetReg; - bool FPAdjusted; - std::tie(ScratchWaveOffsetReg, FPAdjusted) = - getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); + assert(MFI->isEntryFunction()); - // We need to insert initialization of the scratch resource descriptor. Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - - unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdHsaOrMesa(F)) { - PreloadedPrivateBufferReg = MFI->getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); - } - - bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister && - MRI.isPhysRegUsed(ScratchWaveOffsetReg); - bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister && - MRI.isPhysRegUsed(ScratchRsrcReg); - // FIXME: Hack to not crash in situations which emitted an error. - if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister) + if (!PreloadedScratchWaveOffsetReg) return; - // We added live-ins during argument lowering, but since they were not used - // they were deleted. We're adding the uses now, so add them back. - MRI.addLiveIn(PreloadedScratchWaveOffsetReg); - MBB.addLiveIn(PreloadedScratchWaveOffsetReg); - - if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) { - assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F)); - MRI.addLiveIn(PreloadedPrivateBufferReg); - MBB.addLiveIn(PreloadedPrivateBufferReg); + // We need to do the replacement of the private segment buffer register even + // if there are no stack objects. There could be stores to undef or a + // constant without an associated object. + // + // This will return `Register()` in cases where there are no actual + // uses of the SRSRC. + Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF); + + // Make the selected register live throughout the function. 
+ if (ScratchRsrcReg) { + for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB != &MBB) { + OtherBB.addLiveIn(ScratchRsrcReg); + } + } } - // Make the register selected live throughout the function. - for (MachineBasicBlock &OtherBB : MF) { - if (&OtherBB == &MBB) - continue; - - if (OffsetRegUsed || FPAdjusted) - OtherBB.addLiveIn(ScratchWaveOffsetReg); - - if (ResourceRegUsed) - OtherBB.addLiveIn(ScratchRsrcReg); + // Now that we have fixed the reserved SRSRC we need to locate the + // (potentially) preloaded SRSRC. + Register PreloadedScratchRsrcReg; + if (ST.isAmdHsaOrMesa(F)) { + PreloadedScratchRsrcReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); + if (ScratchRsrcReg && PreloadedScratchRsrcReg) { + // We added live-ins during argument lowering, but since they were not + // used they were deleted. We're adding the uses now, so add them back. + MRI.addLiveIn(PreloadedScratchRsrcReg); + MBB.addLiveIn(PreloadedScratchRsrcReg); + } } + // Debug location must be unknown since the first debug location is used to + // determine the end of the prologue. DebugLoc DL; MachineBasicBlock::iterator I = MBB.begin(); - // If we reserved the original input registers, we don't need to copy to the - // reserved registers. - - bool CopyBuffer = ResourceRegUsed && - PreloadedPrivateBufferReg != AMDGPU::NoRegister && - ST.isAmdHsaOrMesa(F) && - ScratchRsrcReg != PreloadedPrivateBufferReg; - - // This needs to be careful of the copying order to avoid overwriting one of - // the input registers before it's been copied to it's final - // destination. Usually the offset should be copied first. - bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg, - ScratchWaveOffsetReg); - if (CopyBuffer && CopyBufferFirst) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) - .addReg(PreloadedPrivateBufferReg, RegState::Kill); + // We found the SRSRC first because it needs four registers and has an + // alignment requirement. If the SRSRC that we found is clobbering with + // the scratch wave offset, which may be in a fixed SGPR or a free SGPR + // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch + // wave offset to a free SGPR. + Register ScratchWaveOffsetReg; + if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) { + ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF); + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); + AllSGPRs = AllSGPRs.slice( + std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded)); + Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); + for (MCPhysReg Reg : AllSGPRs) { + if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) { + ScratchWaveOffsetReg = Reg; + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) + .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); + break; + } + } + } else { + ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg; } + assert(ScratchWaveOffsetReg); - unsigned SPReg = MFI->getStackPtrOffsetReg(); - assert(SPReg != AMDGPU::SP_REG); - - // FIXME: Remove the isPhysRegUsed checks - const bool HasFP = hasFP(MF); - - if (HasFP || OffsetRegUsed) { - assert(ScratchWaveOffsetReg); - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) - .addReg(PreloadedScratchWaveOffsetReg, HasFP ? 
RegState::Kill : 0); + if (MF.getFrameInfo().hasCalls()) { + Register SPReg = MFI->getStackPtrOffsetReg(); + assert(SPReg != AMDGPU::SP_REG); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) + .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize()); } - if (CopyBuffer && !CopyBufferFirst) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) - .addReg(PreloadedPrivateBufferReg, RegState::Kill); + if (hasFP(MF)) { + Register FPReg = MFI->getFrameOffsetReg(); + assert(FPReg != AMDGPU::FP_REG); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); } - if (ResourceRegUsed) { - emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I, - PreloadedPrivateBufferReg, ScratchRsrcReg); + if (MFI->hasFlatScratchInit() || ScratchRsrcReg) { + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); } - if (HasFP) { - DebugLoc DL; - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - int64_t StackSize = FrameInfo.getStackSize(); + if (MFI->hasFlatScratchInit()) { + emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); + } - // On kernel entry, the private scratch wave offset is the SP value. - if (StackSize == 0) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg) - .addReg(MFI->getScratchWaveOffsetReg()); - } else { - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg) - .addReg(MFI->getScratchWaveOffsetReg()) - .addImm(StackSize * ST.getWavefrontSize()); - } + if (ScratchRsrcReg) { + emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, + PreloadedScratchRsrcReg, + ScratchRsrcReg, ScratchWaveOffsetReg); } } -// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. -void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, - MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, - MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, - unsigned ScratchRsrcReg) const { +// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` +void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( + MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + const DebugLoc &DL, Register PreloadedScratchRsrcReg, + Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const Function &Fn = MF.getFunction(); - DebugLoc DL; if (ST.isAmdPalOS()) { // The pointer to the GIT is formed from the offset passed in and either @@ -557,19 +504,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64); BuildMI(MBB, I, DL, GetPC64, Rsrc01); } - auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in - if (ST.hasMergedShaders()) { - switch (MF.getFunction().getCallingConv()) { - case CallingConv::AMDGPU_HS: - case CallingConv::AMDGPU_GS: - // Low GIT address is passed in s8 rather than s0 for an LS+HS or - // ES+GS merged shader on gfx9+. 
- GitPtrLo = AMDGPU::SGPR8; - break; - default: - break; - } - } + Register GitPtrLo = MFI->getGITPtrLoReg(MF); MF.getRegInfo().addLiveIn(GitPtrLo); MBB.addLiveIn(GitPtrLo); BuildMI(MBB, I, DL, SMovB32, RsrcLo) @@ -582,12 +517,12 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM); auto MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | - MachineMemOperand::MOInvariant | - MachineMemOperand::MODereferenceable, - 16, 4); + MachineMemOperand::MOInvariant | + MachineMemOperand::MODereferenceable, + 16, Align(4)); unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); - unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset); + unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset); BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg) .addReg(Rsrc01) .addImm(EncodedOffset) // offset @@ -595,10 +530,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, .addImm(0) // dlc .addReg(ScratchRsrcReg, RegState::ImplicitDefine) .addMemOperand(MMO); - return; - } - if (ST.isMesaGfxShader(Fn) - || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) { + } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) { assert(!ST.isAmdHsaOrMesa(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); @@ -621,11 +553,11 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); - auto MMO = MF.getMachineMemOperand(PtrInfo, - MachineMemOperand::MOLoad | - MachineMemOperand::MOInvariant | - MachineMemOperand::MODereferenceable, - 8, 4); + auto MMO = MF.getMachineMemOperand( + PtrInfo, + MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | + MachineMemOperand::MODereferenceable, + 8, Align(4)); BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addImm(0) // offset @@ -658,7 +590,37 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, BuildMI(MBB, I, DL, SMovB32, Rsrc3) .addImm(Rsrc23 >> 32) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } else if (ST.isAmdHsaOrMesa(Fn)) { + assert(PreloadedScratchRsrcReg); + + if (ScratchRsrcReg != PreloadedScratchRsrcReg) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) + .addReg(PreloadedScratchRsrcReg, RegState::Kill); + } } + + // Add the scratch wave offset into the scratch RSRC. + // + // We only want to update the first 48 bits, which is the base address + // pointer, without touching the adjacent 16 bits of flags. We know this add + // cannot carry-out from bit 47, otherwise the scratch allocation would be + // impossible to fit in the 48-bit global address space. + // + // TODO: Evaluate if it is better to just construct an SRD using the flat + // scratch init and some constants rather than update the one we are passed. + Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + + // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in + // the kernel body via inreg arguments. 
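As a plain-integer sketch of the descriptor update described above (illustration only, not part of the commit): the low two dwords of the RSRC hold a 48-bit base address plus 16 bits of flags, and adding the wave offset into the low dword with a carry into the high dword leaves the flags untouched because the sum cannot reach bit 48.

#include <cstdint>

// Illustration only: add the wave offset into the 48-bit base address
// without disturbing the 16 flag bits above it.
static uint64_t addWaveOffsetToBase(uint64_t Rsrc01, uint32_t WaveOffset) {
  const uint64_t BaseMask = (1ULL << 48) - 1;
  uint64_t Base = Rsrc01 & BaseMask;   // 48-bit base address
  uint64_t Flags = Rsrc01 & ~BaseMask; // descriptor flag bits
  // By the argument above, Base + WaveOffset cannot carry into bit 48.
  return Flags | ((Base + WaveOffset) & BaseMask);
}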
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0) + .addReg(ScratchRsrcSub0) + .addReg(ScratchWaveOffsetReg) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1) + .addReg(ScratchRsrcSub1) + .addImm(0) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); } bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { @@ -673,6 +635,50 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { llvm_unreachable("Invalid TargetStackID::Value"); } +// Activate all lanes, returns saved exec. +static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, + MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool IsProlog) { + Register ScratchExecCopy; + MachineRegisterInfo &MRI = MF.getRegInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + DebugLoc DL; + + if (LiveRegs.empty()) { + if (IsProlog) { + LiveRegs.init(TRI); + LiveRegs.addLiveIns(MBB); + if (FuncInfo->SGPRForFPSaveRestoreCopy) + LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy); + + if (FuncInfo->SGPRForBPSaveRestoreCopy) + LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy); + } else { + // In epilog. + LiveRegs.init(*ST.getRegisterInfo()); + LiveRegs.addLiveOuts(MBB); + LiveRegs.stepBackward(*MBBI); + } + } + + ScratchExecCopy = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, *TRI.getWaveMaskRegClass()); + + if (!IsProlog) + LiveRegs.removeReg(ScratchExecCopy); + + const unsigned OrSaveExec = + ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; + BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1); + + return ScratchExecCopy; +} + void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); @@ -687,51 +693,81 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); - unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); - unsigned FramePtrReg = FuncInfo->getFrameOffsetReg(); + Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + Register FramePtrReg = FuncInfo->getFrameOffsetReg(); + Register BasePtrReg = + TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); LivePhysRegs LiveRegs; MachineBasicBlock::iterator MBBI = MBB.begin(); DebugLoc DL; bool HasFP = false; + bool HasBP = false; uint32_t NumBytes = MFI.getStackSize(); uint32_t RoundedSize = NumBytes; // To avoid clobbering VGPRs in lanes that weren't active on function entry, // turn on all lanes before doing the spill to memory. - unsigned ScratchExecCopy = AMDGPU::NoRegister; + Register ScratchExecCopy; + + bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue(); + bool SpillFPToMemory = false; + // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. + // Otherwise we are spilling the FP to memory. + if (HasFPSaveIndex) { + SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) != + TargetStackID::SGPRSpill; + } + + bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue(); + bool SpillBPToMemory = false; + // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. + // Otherwise we are spilling the BP to memory. 
+ if (HasBPSaveIndex) { + SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) != + TargetStackID::SGPRSpill; + } // Emit the copy if we need an FP, and are using a free SGPR to save it. - if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) { + if (FuncInfo->SGPRForFPSaveRestoreCopy) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy) .addReg(FramePtrReg) .setMIFlag(MachineInstr::FrameSetup); } + // Emit the copy if we need a BP, and are using a free SGPR to save it. + if (FuncInfo->SGPRForBPSaveRestoreCopy) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), + FuncInfo->SGPRForBPSaveRestoreCopy) + .addReg(BasePtrReg) + .setMIFlag(MachineInstr::FrameSetup); + } + + // If a copy has been emitted for FP and/or BP, Make the SGPRs + // used in the copy instructions live throughout the function. + SmallVector<MCPhysReg, 2> TempSGPRs; + if (FuncInfo->SGPRForFPSaveRestoreCopy) + TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy); + + if (FuncInfo->SGPRForBPSaveRestoreCopy) + TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy); + + if (!TempSGPRs.empty()) { + for (MachineBasicBlock &MBB : MF) { + for (MCPhysReg Reg : TempSGPRs) + MBB.addLiveIn(Reg); + + MBB.sortUniqueLiveIns(); + } + } + for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg : FuncInfo->getSGPRSpillVGPRs()) { if (!Reg.FI.hasValue()) continue; - if (ScratchExecCopy == AMDGPU::NoRegister) { - if (LiveRegs.empty()) { - LiveRegs.init(TRI); - LiveRegs.addLiveIns(MBB); - if (FuncInfo->SGPRForFPSaveRestoreCopy) - LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy); - } - - ScratchExecCopy - = findScratchNonCalleeSaveRegister(MRI, LiveRegs, - *TRI.getWaveMaskRegClass()); - assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy); - - const unsigned OrSaveExec = ST.isWave32() ? - AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; - BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), - ScratchExecCopy) - .addImm(-1); - } + if (!ScratchExecCopy) + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR, FuncInfo->getScratchRSrcReg(), @@ -739,84 +775,153 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, Reg.FI.getValue()); } - if (ScratchExecCopy != AMDGPU::NoRegister) { + if (HasFPSaveIndex && SpillFPToMemory) { + assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue())); + + if (!ScratchExecCopy) + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); + + MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(FramePtrReg); + + buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, + FuncInfo->FramePointerSaveIndex.getValue()); + } + + if (HasBPSaveIndex && SpillBPToMemory) { + assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex)); + + if (!ScratchExecCopy) + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); + + MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(BasePtrReg); + + buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, + *FuncInfo->BasePointerSaveIndex); + } + + if (ScratchExecCopy) { // FIXME: Split block and make terminator. unsigned ExecMov = ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) - .addReg(ScratchExecCopy, RegState::Kill); + .addReg(ScratchExecCopy, RegState::Kill); LiveRegs.addReg(ScratchExecCopy); } - - if (FuncInfo->FramePointerSaveIndex) { + // In this case, spill the FP to a reserved VGPR. + if (HasFPSaveIndex && !SpillFPToMemory) { const int FI = FuncInfo->FramePointerSaveIndex.getValue(); - assert(!MFI.isDeadObjectIndex(FI) && - MFI.getStackID(FI) == TargetStackID::SGPRSpill); - ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill - = FuncInfo->getSGPRToVGPRSpills(FI); + assert(!MFI.isDeadObjectIndex(FI)); + + assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = + FuncInfo->getSGPRToVGPRSpills(FI); assert(Spill.size() == 1); // Save FP before setting it up. // FIXME: This should respect spillSGPRToVGPR; BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), Spill[0].VGPR) - .addReg(FramePtrReg) - .addImm(Spill[0].Lane) - .addReg(Spill[0].VGPR, RegState::Undef); + .addReg(FramePtrReg) + .addImm(Spill[0].Lane) + .addReg(Spill[0].VGPR, RegState::Undef); + } + + // In this case, spill the BP to a reserved VGPR. + if (HasBPSaveIndex && !SpillBPToMemory) { + const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; + assert(!MFI.isDeadObjectIndex(BasePtrFI)); + + assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); + ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = + FuncInfo->getSGPRToVGPRSpills(BasePtrFI); + assert(Spill.size() == 1); + + // Save BP before setting it up. + // FIXME: This should respect spillSGPRToVGPR; + BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + Spill[0].VGPR) + .addReg(BasePtrReg) + .addImm(Spill[0].Lane) + .addReg(Spill[0].VGPR, RegState::Undef); } if (TRI.needsStackRealignment(MF)) { HasFP = true; - const unsigned Alignment = MFI.getMaxAlignment(); + const unsigned Alignment = MFI.getMaxAlign().value(); RoundedSize += Alignment; if (LiveRegs.empty()) { LiveRegs.init(TRI); LiveRegs.addLiveIns(MBB); LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy); + LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy); } - unsigned ScratchSPReg = findScratchNonCalleeSaveRegister( + Register ScratchSPReg = findScratchNonCalleeSaveRegister( MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass); - assert(ScratchSPReg != AMDGPU::NoRegister && - ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy); + assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy && + ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy); // s_add_u32 tmp_reg, s32, NumBytes // s_and_b32 s32, tmp_reg, 0b111...0000 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg) - .addReg(StackPtrReg) - .addImm((Alignment - 1) * ST.getWavefrontSize()) - .setMIFlag(MachineInstr::FrameSetup); + .addReg(StackPtrReg) + .addImm((Alignment - 1) * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) - .addReg(ScratchSPReg, RegState::Kill) - .addImm(-Alignment * ST.getWavefrontSize()) - .setMIFlag(MachineInstr::FrameSetup); + .addReg(ScratchSPReg, RegState::Kill) + .addImm(-Alignment * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameSetup); FuncInfo->setIsStackRealigned(true); } else if ((HasFP = hasFP(MF))) { - // If we need a base pointer, set it up here. 
It's whatever the value of - // the stack pointer is at this point. Any variable size objects will be - // allocated after this, so we can still use the base pointer to reference - // locals. BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) - .addReg(StackPtrReg) - .setMIFlag(MachineInstr::FrameSetup); + .addReg(StackPtrReg) + .setMIFlag(MachineInstr::FrameSetup); + } + + // If we need a base pointer, set it up here. It's whatever the value of + // the stack pointer is at this point. Any variable size objects will be + // allocated after this, so we can still use the base pointer to reference + // the incoming arguments. + if ((HasBP = TRI.hasBasePointer(MF))) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) + .addReg(StackPtrReg) + .setMIFlag(MachineInstr::FrameSetup); } if (HasFP && RoundedSize != 0) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) - .addReg(StackPtrReg) - .addImm(RoundedSize * ST.getWavefrontSize()) - .setMIFlag(MachineInstr::FrameSetup); + .addReg(StackPtrReg) + .addImm(RoundedSize * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameSetup); } - assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister || + assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy || FuncInfo->FramePointerSaveIndex)) && "Needed to save FP but didn't save it anywhere"); - assert((HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister && + assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy && !FuncInfo->FramePointerSaveIndex)) && "Saved FP but didn't need it"); + + assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy || + FuncInfo->BasePointerSaveIndex)) && + "Needed to save BP but didn't save it anywhere"); + + assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy && + !FuncInfo->BasePointerSaveIndex)) && + "Saved BP but didn't need it"); } void SIFrameLowering::emitEpilogue(MachineFunction &MF, @@ -828,81 +933,126 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); LivePhysRegs LiveRegs; DebugLoc DL; const MachineFrameInfo &MFI = MF.getFrameInfo(); uint32_t NumBytes = MFI.getStackSize(); - uint32_t RoundedSize = FuncInfo->isStackRealigned() ? - NumBytes + MFI.getMaxAlignment() : NumBytes; + uint32_t RoundedSize = FuncInfo->isStackRealigned() + ? NumBytes + MFI.getMaxAlign().value() + : NumBytes; + const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + const Register FramePtrReg = FuncInfo->getFrameOffsetReg(); + const Register BasePtrReg = + TRI.hasBasePointer(MF) ? 
TRI.getBaseRegister() : Register(); + + bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue(); + bool SpillFPToMemory = false; + if (HasFPSaveIndex) { + SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) != + TargetStackID::SGPRSpill; + } + + bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue(); + bool SpillBPToMemory = false; + if (HasBPSaveIndex) { + SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) != + TargetStackID::SGPRSpill; + } if (RoundedSize != 0 && hasFP(MF)) { - const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) .addReg(StackPtrReg) .addImm(RoundedSize * ST.getWavefrontSize()) .setMIFlag(MachineInstr::FrameDestroy); } - if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) { - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg()) - .addReg(FuncInfo->SGPRForFPSaveRestoreCopy) - .setMIFlag(MachineInstr::FrameSetup); + if (FuncInfo->SGPRForFPSaveRestoreCopy) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) + .addReg(FuncInfo->SGPRForFPSaveRestoreCopy) + .setMIFlag(MachineInstr::FrameSetup); } - if (FuncInfo->FramePointerSaveIndex) { - const int FI = FuncInfo->FramePointerSaveIndex.getValue(); + if (FuncInfo->SGPRForBPSaveRestoreCopy) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg) + .addReg(FuncInfo->SGPRForBPSaveRestoreCopy) + .setMIFlag(MachineInstr::FrameSetup); + } - assert(!MF.getFrameInfo().isDeadObjectIndex(FI) && - MF.getFrameInfo().getStackID(FI) == TargetStackID::SGPRSpill); + Register ScratchExecCopy; + if (HasFPSaveIndex) { + const int FI = FuncInfo->FramePointerSaveIndex.getValue(); + assert(!MFI.isDeadObjectIndex(FI)); + if (SpillFPToMemory) { + if (!ScratchExecCopy) + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); + + MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg) + .addReg(TempVGPR, RegState::Kill); + } else { + // Reload from VGPR spill. + assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = + FuncInfo->getSGPRToVGPRSpills(FI); + assert(Spill.size() == 1); + BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + FramePtrReg) + .addReg(Spill[0].VGPR) + .addImm(Spill[0].Lane); + } + } - ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill - = FuncInfo->getSGPRToVGPRSpills(FI); - assert(Spill.size() == 1); - BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), - FuncInfo->getFrameOffsetReg()) - .addReg(Spill[0].VGPR) - .addImm(Spill[0].Lane); + if (HasBPSaveIndex) { + const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; + assert(!MFI.isDeadObjectIndex(BasePtrFI)); + if (SpillBPToMemory) { + if (!ScratchExecCopy) + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); + + MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI); + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg) + .addReg(TempVGPR, RegState::Kill); + } else { + // Reload from VGPR spill. 
+ assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); + ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = + FuncInfo->getSGPRToVGPRSpills(BasePtrFI); + assert(Spill.size() == 1); + BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + BasePtrReg) + .addReg(Spill[0].VGPR) + .addImm(Spill[0].Lane); + } } - unsigned ScratchExecCopy = AMDGPU::NoRegister; - for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg - : FuncInfo->getSGPRSpillVGPRs()) { + for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg : + FuncInfo->getSGPRSpillVGPRs()) { if (!Reg.FI.hasValue()) continue; - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - if (ScratchExecCopy == AMDGPU::NoRegister) { - // See emitPrologue - if (LiveRegs.empty()) { - LiveRegs.init(*ST.getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - LiveRegs.stepBackward(*MBBI); - } - - ScratchExecCopy = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, *TRI.getWaveMaskRegClass()); - LiveRegs.removeReg(ScratchExecCopy); - - const unsigned OrSaveExec = - ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; - - BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy) - .addImm(-1); - } + if (!ScratchExecCopy) + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR, - FuncInfo->getScratchRSrcReg(), - FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue()); + FuncInfo->getScratchRSrcReg(), StackPtrReg, + Reg.FI.getValue()); } - if (ScratchExecCopy != AMDGPU::NoRegister) { + if (ScratchExecCopy) { // FIXME: Split block and make terminator. unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + MCRegister Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) - .addReg(ScratchExecCopy, RegState::Kill); + .addReg(ScratchExecCopy, RegState::Kill); } } @@ -920,12 +1070,14 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { #ifndef NDEBUG static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI, - Optional<int> FramePointerSaveIndex) { + Optional<int> FramePointerSaveIndex, + Optional<int> BasePointerSaveIndex) { for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E; ++I) { if (!MFI.isDeadObjectIndex(I) && MFI.getStackID(I) == TargetStackID::SGPRSpill && - FramePointerSaveIndex && I != FramePointerSaveIndex) { + ((FramePointerSaveIndex && I != FramePointerSaveIndex) || + (BasePointerSaveIndex && I != BasePointerSaveIndex))) { return false; } } @@ -935,7 +1087,7 @@ static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI, #endif int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const { + Register &FrameReg) const { const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); FrameReg = RI->getFrameRegister(MF); @@ -952,7 +1104,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); FuncInfo->removeDeadFrameIndices(MFI); - assert(allSGPRSpillsAreDead(MFI, None) && + assert(allSGPRSpillsAreDead(MFI, None, None) && "SGPR spill should have been removed in SILowerSGPRSpills"); // FIXME: The other checks should be redundant with allStackObjectsAreDead, @@ -967,9 +1119,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( RS->addScavengingFrameIndex(ScavengeFI); } else { int ScavengeFI = MFI.CreateStackObject( - TRI->getSpillSize(AMDGPU::SGPR_32RegClass), - TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass), - false); + TRI->getSpillSize(AMDGPU::SGPR_32RegClass), + TRI->getSpillAlign(AMDGPU::SGPR_32RegClass), false); RS->addScavengingFrameIndex(ScavengeFI); } } @@ -984,7 +1135,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, if (MFI->isEntryFunction()) return; - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -1008,46 +1159,19 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, for (auto SSpill : MFI->getSGPRSpillVGPRs()) SavedVGPRs.reset(SSpill.VGPR); - const bool HasFP = WillHaveFP || hasFP(MF); - if (!HasFP) - return; - - if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) { - int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr, - TargetStackID::SGPRSpill); - - // If there is already a VGPR with free lanes, use it. We may already have - // to pay the penalty for spilling a CSR VGPR. 
- if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) - llvm_unreachable("allocate SGPR spill should have worked"); - - MFI->FramePointerSaveIndex = NewFI; + LivePhysRegs LiveRegs; + LiveRegs.init(*TRI); - LLVM_DEBUG( - auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); - dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI) - << ':' << Spill.Lane << '\n'); - return; + if (WillHaveFP || hasFP(MF)) { + getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy, + MFI->FramePointerSaveIndex, true); } - MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo()); - - if (!MFI->SGPRForFPSaveRestoreCopy) { - // There's no free lane to spill, and no free register to save FP, so we're - // forced to spill another VGPR to use for the spill. - int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr, - TargetStackID::SGPRSpill); - if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) - llvm_unreachable("allocate SGPR spill should have worked"); - MFI->FramePointerSaveIndex = NewFI; - - LLVM_DEBUG( - auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); - dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI) - << ':' << Spill.Lane << '\n';); - } else { - LLVM_DEBUG(dbgs() << "Saving FP with copy to " << - printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n'); + if (TRI->hasBasePointer(MF)) { + if (MFI->SGPRForFPSaveRestoreCopy) + LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy); + getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy, + MFI->BasePointerSaveIndex, false); } } @@ -1074,14 +1198,31 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots( return true; // Early exit if no callee saved registers are modified! const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - if (!FuncInfo->SGPRForFPSaveRestoreCopy) + if (!FuncInfo->SGPRForFPSaveRestoreCopy && + !FuncInfo->SGPRForBPSaveRestoreCopy) return false; + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *RI = ST.getRegisterInfo(); + Register FramePtrReg = FuncInfo->getFrameOffsetReg(); + Register BasePtrReg = RI->getBaseRegister(); + unsigned NumModifiedRegs = 0; + + if (FuncInfo->SGPRForFPSaveRestoreCopy) + NumModifiedRegs++; + if (FuncInfo->SGPRForBPSaveRestoreCopy) + NumModifiedRegs++; + for (auto &CS : CSI) { - if (CS.getReg() == FuncInfo->getFrameOffsetReg()) { - if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) - CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy); - break; + if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) { + CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy); + if (--NumModifiedRegs) + break; + } else if (CS.getReg() == BasePtrReg && + FuncInfo->SGPRForBPSaveRestoreCopy) { + CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy); + if (--NumModifiedRegs) + break; } } @@ -1104,12 +1245,10 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; if (!hasReservedCallFrame(MF)) { - unsigned Align = getStackAlignment(); - - Amount = alignTo(Amount, Align); + Amount = alignTo(Amount, getStackAlign()); assert(isUInt<32>(Amount) && "exceeded stack address space size"); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - unsigned SPReg = MFI->getStackPtrOffsetReg(); + Register SPReg = MFI->getStackPtrOffsetReg(); unsigned Op = IsDestroy ? 
AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; BuildMI(MBB, I, DL, TII->get(Op), SPReg) @@ -1124,19 +1263,17 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( bool SIFrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.hasCalls()) { + + // For entry functions we can use an immediate offset in most cases, so the + // presence of calls doesn't imply we need a distinct frame pointer. + if (MFI.hasCalls() && + !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) { // All offsets are unsigned, so need to be addressed in the same direction // as stack growth. // FIXME: This function is pretty broken, since it can be called before the // frame layout is determined or CSR spills are inserted. - if (MFI.getStackSize() != 0) - return true; - - // For the entry point, the input wave scratch offset must be copied to the - // API SP if there are calls. - if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) - return true; + return MFI.getStackSize() != 0; } return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() || diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index d9970fd6b4b87..e894320406610 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -21,7 +21,7 @@ class GCNSubtarget; class SIFrameLowering final : public AMDGPUFrameLowering { public: SIFrameLowering(StackDirection D, Align StackAl, int LAO, - Align TransAl = Align::None()) + Align TransAl = Align(1)) : AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} ~SIFrameLowering() override = default; @@ -32,7 +32,7 @@ public: void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const override; + Register &FrameReg) const override; void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const override; @@ -55,26 +55,19 @@ public: MachineBasicBlock::iterator MI) const override; private: - void emitFlatScratchInit(const GCNSubtarget &ST, - MachineFunction &MF, - MachineBasicBlock &MBB) const; - - unsigned getReservedPrivateSegmentBufferReg( - const GCNSubtarget &ST, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, - MachineFunction &MF) const; - - std::pair<unsigned, bool> getReservedPrivateSegmentWaveByteOffsetReg( - const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, MachineFunction &MF) const; - - // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. 
- void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF, - MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, - MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, - unsigned ScratchRsrcReg) const; + void emitEntryFunctionFlatScratchInit(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register ScratchWaveOffsetReg) const; + + Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const; + + void emitEntryFunctionScratchRsrcRegSetup( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + Register PreloadedPrivateBufferReg, Register ScratchRsrcReg, + Register ScratchWaveOffsetReg) const; public: bool hasFP(const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e73d87cd66afa..d035aa8f72bd7 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11,11 +11,6 @@ // //===----------------------------------------------------------------------===// -#if defined(_MSC_VER) || defined(__MINGW32__) -// Provide M_PI. -#define _USE_MATH_DEFINES -#endif - #include "SIISelLowering.h" #include "AMDGPU.h" #include "AMDGPUSubtarget.h" @@ -40,6 +35,7 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -95,14 +91,24 @@ static cl::opt<bool> DisableLoopAlignment( cl::desc("Do not align and prefetch loops"), cl::init(false)); +static cl::opt<bool> VGPRReserveforSGPRSpill( + "amdgpu-reserve-vgpr-for-sgpr-spill", + cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true)); + +static cl::opt<bool> UseDivergentRegisterIndexing( + "amdgpu-use-divergent-register-indexing", + cl::Hidden, + cl::desc("Use indirect register addressing for divergent indexes"), + cl::init(false)); + static bool hasFP32Denormals(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - return Info->getMode().FP32Denormals; + return Info->getMode().allFP32Denormals(); } static bool hasFP64FP16Denormals(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - return Info->getMode().FP64FP16Denormals; + return Info->getMode().allFP64FP16Denormals(); } static unsigned findFirstFreeSGPR(CCState &CCInfo) { @@ -141,12 +147,21 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); - addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); + addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); - addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); + addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); + addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); + + addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); + addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); + + addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); + addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); + 
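The pairing of each new vector type with a register class above follows total bit width: an N-bit class spans N/32 consecutive 32-bit registers. A small compile-time check of that arithmetic, illustration only, not part of the change:

// Illustration only: class width in dwords is NumElts * EltBits / 32.
constexpr unsigned numDwords(unsigned NumElts, unsigned EltBits) {
  return NumElts * EltBits / 32;
}
static_assert(numDwords(4, 64) == 8, "v4i64/v4f64 use the 256-bit classes");
static_assert(numDwords(8, 64) == 16, "v8i64/v8f64 use the 512-bit classes");
static_assert(numDwords(16, 64) == 32, "v16i64/v16f64 use the 1024-bit classes");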
if (Subtarget->has16BitInsts()) { addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); @@ -158,10 +173,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); } - if (Subtarget->hasMAIInsts()) { - addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); - } + addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -202,6 +215,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand); + setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand); + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand); + + setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand); + setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand); + setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand); + setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand); + setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand); setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); @@ -224,6 +248,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v16f32, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); @@ -260,7 +290,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // with > 4 elements. 
for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, - MVT::v32i32, MVT::v32f32 }) { + MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, + MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -304,6 +335,48 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); } + for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32); + } + + for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32); + } + + for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32); + } + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); @@ -361,9 +434,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); } - setOperationAction(ISD::BSWAP, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + // FIXME: This should be narrowed to i32, but that only happens if i64 is + // illegal. + // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32. + setOperationAction(ISD::BSWAP, MVT::i64, Legal); + setOperationAction(ISD::BSWAP, MVT::i32, Legal); + // On SI this is s_memtime and s_memrealtime on VI. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); setOperationAction(ISD::TRAP, MVT::Other, Custom); @@ -376,10 +454,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FLOG10, MVT::f16, Custom); } - // v_mad_f32 does not support denormals. We report it as unconditionally - // legal, and the context where it is formed will disallow it when fp32 - // denormals are enabled. - setOperationAction(ISD::FMAD, MVT::f32, Legal); + if (Subtarget->hasMadMacF32Insts()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); if (!Subtarget->hasBFI()) { // fcopysign can be done in a single instruction with BFI. 
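For readers unfamiliar with the trick that comment refers to (illustration only, not part of the commit): v_bfi_b32 computes (mask & a) | (~mask & b), so with the mask 0x7fffffff it merges the magnitude bits of one operand with the sign bit of the other, which is exactly fcopysign for f32.

#include <cstdint>
#include <cstring>

// Illustration only: fcopysign(Mag, Sgn) as a single bitfield insert.
static float copysignViaBfi(float Mag, float Sgn) {
  uint32_t M, S;
  std::memcpy(&M, &Mag, sizeof(float));
  std::memcpy(&S, &Sgn, sizeof(float));
  // (mask & M) | (~mask & S): magnitude from Mag, sign bit from Sgn.
  uint32_t R = (0x7fffffffu & M) | (~0x7fffffffu & S);
  float Out;
  std::memcpy(&Out, &R, sizeof(float));
  return Out;
}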
@@ -463,7 +539,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SREM, MVT::i16, Promote); setOperationAction(ISD::UREM, MVT::i16, Promote); - setOperationAction(ISD::BSWAP, MVT::i16, Promote); setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); setOperationAction(ISD::CTTZ, MVT::i16, Promote); @@ -499,8 +574,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // F16 - VOP1 Actions. setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); - setOperationAction(ISD::FCOS, MVT::f16, Promote); - setOperationAction(ISD::FSIN, MVT::f16, Promote); + setOperationAction(ISD::FCOS, MVT::f16, Custom); + setOperationAction(ISD::FSIN, MVT::f16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom); @@ -545,6 +620,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } } + // v_perm_b32 can handle either of these. + setOperationAction(ISD::BSWAP, MVT::i16, Legal); + setOperationAction(ISD::BSWAP, MVT::v2i16, Legal); + setOperationAction(ISD::BSWAP, MVT::v4i16, Custom); + // XXX - Do these do anything? Vector constants turn into build_vector. setOperationAction(ISD::Constant, MVT::v2i16, Legal); setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal); @@ -686,6 +766,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, VT, Custom); } + setOperationAction(ISD::SMULO, MVT::i64, Custom); + setOperationAction(ISD::UMULO, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); @@ -762,6 +845,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD); + // FIXME: In other contexts we pretend this is a per-function property. + setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32); + setSchedulingPreference(Sched::RegPressure); } @@ -783,6 +869,7 @@ bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && DestVT.getScalarType() == MVT::f32 && SrcVT.getScalarType() == MVT::f16 && + // TODO: This probably only requires no input flushing? !hasFP32Denormals(DAG.getMachineFunction()); } @@ -877,45 +964,33 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); } -static MVT memVTFromAggregate(Type *Ty) { - // Only limited forms of aggregate type currently expected. 
- assert(Ty->isStructTy() && "Expected struct type"); - +static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) { + assert(DMaskLanes != 0); - Type *ElementType = nullptr; - unsigned NumElts; - if (Ty->getContainedType(0)->isVectorTy()) { - VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0)); - ElementType = VecComponent->getElementType(); - NumElts = VecComponent->getNumElements(); - } else { - ElementType = Ty->getContainedType(0); - NumElts = 1; + if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { + unsigned NumElts = std::min(DMaskLanes, VT->getNumElements()); + return EVT::getVectorVT(Ty->getContext(), + EVT::getEVT(VT->getElementType()), + NumElts); } - assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type"); + return EVT::getEVT(Ty); +} - // Calculate the size of the memVT type from the aggregate - unsigned Pow2Elts = 0; - unsigned ElementSize; - switch (ElementType->getTypeID()) { - default: - llvm_unreachable("Unknown type!"); - case Type::IntegerTyID: - ElementSize = cast<IntegerType>(ElementType)->getBitWidth(); - break; - case Type::HalfTyID: - ElementSize = 16; - break; - case Type::FloatTyID: - ElementSize = 32; - break; - } - unsigned AdditionalElts = ElementSize == 16 ? 2 : 1; - Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts); +// Peek through TFE struct returns to only use the data size. +static EVT memVTFromImageReturn(Type *Ty, unsigned DMaskLanes) { + auto *ST = dyn_cast<StructType>(Ty); + if (!ST) + return memVTFromImageData(Ty, DMaskLanes); - return MVT::getVectorVT(MVT::getVT(ElementType, false), - Pow2Elts); + // Some intrinsics return an aggregate type - special case to work out the + // correct memVT. + // + // Only limited forms of aggregate type currently expected. + if (ST->getNumContainedTypes() != 2 || + !ST->getContainedType(1)->isIntegerTy(32)) + return EVT(); + return memVTFromImageData(ST->getContainedType(0), DMaskLanes); } bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, @@ -944,17 +1019,40 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MODereferenceable; if (Attr.hasFnAttribute(Attribute::ReadOnly)) { + unsigned DMaskLanes = 4; + + if (RsrcIntr->IsImage) { + const AMDGPU::ImageDimIntrinsicInfo *Intr + = AMDGPU::getImageDimIntrinsicInfo(IntrID); + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + + if (!BaseOpcode->Gather4) { + // If this isn't a gather, we may have excess loaded elements in the + // IR type. Check the dmask for the real number of elements loaded. + unsigned DMask + = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue(); + DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask); + } + + Info.memVT = memVTFromImageReturn(CI.getType(), DMaskLanes); + } else + Info.memVT = EVT::getEVT(CI.getType()); + + // FIXME: What does alignment mean for an image? 
Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType(), true); - if (Info.memVT == MVT::Other) { - // Some intrinsics return an aggregate type - special case to work out - // the correct memVT - Info.memVT = memVTFromAggregate(CI.getType()); - } Info.flags |= MachineMemOperand::MOLoad; } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) { Info.opc = ISD::INTRINSIC_VOID; - Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); + + Type *DataTy = CI.getArgOperand(0)->getType(); + if (RsrcIntr->IsImage) { + unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue(); + unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask); + Info.memVT = memVTFromImageData(DataTy, DMaskLanes); + } else + Info.memVT = EVT::getEVT(DataTy); + Info.flags |= MachineMemOperand::MOStore; } else { // Atomic @@ -1031,6 +1129,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } + case Intrinsic::amdgcn_global_atomic_csub: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOVolatile; + return true; + } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: case Intrinsic::amdgcn_ds_gws_sema_v: @@ -1226,9 +1335,10 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // addressing modes, so treat them as having no offset like flat // instructions. return isLegalFlatAddressingMode(AM); - } else { - llvm_unreachable("unhandled address space"); } + + // Assume a user alias of global for unknown address spaces. + return isLegalGlobalAddressingMode(AM); } bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, @@ -1279,9 +1389,11 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( // If we have an uniform constant load, it still requires using a slow // buffer instruction if unaligned. if (IsFast) { + // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so + // 2-byte alignment is worse than 1 unless doing a 2-byte accesss. *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ? - (Align % 4 == 0) : true; + Align >= 4 : Align != 2; } return true; @@ -1320,18 +1432,17 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( } EVT SITargetLowering::getOptimalMemOpType( - uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - const AttributeList &FuncAttributes) const { + const MemOp &Op, const AttributeList &FuncAttributes) const { // FIXME: Should account for address space here. // The default fallback uses the private pointer size as a guess for a type to // use. Make sure we switch these to 64-bit accesses. - if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global + if (Op.size() >= 16 && + Op.isDstAligned(Align(4))) // XXX: Should only do for global return MVT::v4i32; - if (Size >= 8 && DstAlign >= 4) + if (Op.size() >= 8 && Op.isDstAligned(Align(4))) return MVT::v2i32; // Use the default. 
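(Illustrative sketch, not part of this change, of how the dmask trims the memVT reported for a non-gather image load:

    // The IR declares the intrinsic as returning <4 x float>, but only two
    // channels are enabled in the dmask operand.
    unsigned DMask      = 0b0101;
    unsigned DMaskLanes = countPopulation(DMask);          // 2
    // memVTFromImageData(<4 x float>, /*DMaskLanes=*/2) -> MVT::v2f32,
    // so the memory operand describes an 8-byte access instead of 16.
)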
@@ -1416,9 +1527,10 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, const ArgDescriptor *InputPtrReg; const TargetRegisterClass *RC; + LLT ArgTy; - std::tie(InputPtrReg, RC) - = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); + std::tie(InputPtrReg, RC, ArgTy) = + Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); @@ -1457,7 +1569,7 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, } if (MemVT.isFloatingPoint()) - Val = getFPExtOrFPTrunc(DAG, Val, SL, VT); + Val = getFPExtOrFPRound(DAG, Val, SL, VT); else if (Signed) Val = DAG.getSExtOrTrunc(Val, SL, VT); else @@ -1467,16 +1579,15 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, } SDValue SITargetLowering::lowerKernargMemParameter( - SelectionDAG &DAG, EVT VT, EVT MemVT, - const SDLoc &SL, SDValue Chain, - uint64_t Offset, unsigned Align, bool Signed, - const ISD::InputArg *Arg) const { + SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, + uint64_t Offset, Align Alignment, bool Signed, + const ISD::InputArg *Arg) const { MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); // Try to avoid using an extload by loading earlier than the argument address, // and extracting the relevant bits. The load should hopefully be merged with // the previous argument. - if (MemVT.getStoreSize() < 4 && Align < 4) { + if (MemVT.getStoreSize() < 4 && Alignment < 4) { // TODO: Handle align < 4 and size >= 4 (can happen with packed structs). int64_t AlignDownOffset = alignDown(Offset, 4); int64_t OffsetDiff = Offset - AlignDownOffset; @@ -1502,9 +1613,9 @@ SDValue SITargetLowering::lowerKernargMemParameter( } SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); - SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align, + SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment, MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + MachineMemOperand::MOInvariant); SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); @@ -1565,8 +1676,9 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, AMDGPUFunctionArgInfo::PreloadedValue PVID) const { const ArgDescriptor *Reg; const TargetRegisterClass *RC; + LLT Ty; - std::tie(Reg, RC) = MFI.getPreloadedValue(PVID); + std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT); } @@ -1666,7 +1778,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u, unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs); if (RegIdx == ArgVGPRs.size()) { // Spill to stack required. - int64_t Offset = CCInfo.AllocateStack(4, 4); + int64_t Offset = CCInfo.AllocateStack(4, Align(4)); return ArgDescriptor::createStack(Offset, Mask); } @@ -1706,10 +1818,11 @@ static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) { return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); } -void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) const { +/// Allocate implicit function VGPR arguments at the end of allocated user +/// arguments. 
+void SITargetLowering::allocateSpecialInputVGPRs( + CCState &CCInfo, MachineFunction &MF, + const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { const unsigned Mask = 0x3ff; ArgDescriptor Arg; @@ -1727,6 +1840,20 @@ void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo, Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg)); } +/// Allocate implicit function VGPR arguments in fixed registers. +void SITargetLowering::allocateSpecialInputVGPRsFixed( + CCState &CCInfo, MachineFunction &MF, + const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { + Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31); + if (!Reg) + report_fatal_error("failed to allocated VGPR for implicit arguments"); + + const unsigned Mask = 0x3ff; + Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); + Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10)); + Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20)); +} + void SITargetLowering::allocateSpecialInputSGPRs( CCState &CCInfo, MachineFunction &MF, @@ -1742,8 +1869,10 @@ void SITargetLowering::allocateSpecialInputSGPRs( if (Info.hasQueuePtr()) ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo); - if (Info.hasKernargSegmentPtr()) - ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo); + // Implicit arg ptr takes the place of the kernarg segment pointer. This is a + // constant offset from the kernarg segment. + if (Info.hasImplicitArgPtr()) + ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo); if (Info.hasDispatchID()) ArgInfo.DispatchID = allocateSGPR64Input(CCInfo); @@ -1758,9 +1887,6 @@ void SITargetLowering::allocateSpecialInputSGPRs( if (Info.hasWorkGroupIDZ()) ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo); - - if (Info.hasImplicitArgPtr()) - ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo); } // Allocate special inputs passed in user SGPRs. @@ -1916,67 +2042,45 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, Info.setScratchRSrcReg(ReservedBufferReg); } - // hasFP should be accurate for kernels even before the frame is finalized. - if (ST.getFrameLowering()->hasFP(MF)) { - MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); - // Try to use s32 as the SP, but move it if it would interfere with input - // arguments. This won't work with calls though. - // - // FIXME: Move SP to avoid any possible inputs, or find a way to spill input - // registers. - if (!MRI.isLiveIn(AMDGPU::SGPR32)) { - Info.setStackPtrOffsetReg(AMDGPU::SGPR32); - } else { - assert(AMDGPU::isShader(MF.getFunction().getCallingConv())); + // For entry functions we have to set up the stack pointer if we use it, + // whereas non-entry functions get this "for free". This means there is no + // intrinsic advantage to using S32 over S34 in cases where we do not have + // calls but do need a frame pointer (i.e. if we are requested to have one + // because frame pointer elimination is disabled). To keep things simple we + // only ever use S32 as the call ABI stack pointer, and so using it does not + // imply we need a separate frame pointer. + // + // Try to use s32 as the SP, but move it if it would interfere with input + // arguments. This won't work with calls though. + // + // FIXME: Move SP to avoid any possible inputs, or find a way to spill input + // registers. 
+ if (!MRI.isLiveIn(AMDGPU::SGPR32)) { + Info.setStackPtrOffsetReg(AMDGPU::SGPR32); + } else { + assert(AMDGPU::isShader(MF.getFunction().getCallingConv())); - if (MFI.hasCalls()) - report_fatal_error("call in graphics shader with too many input SGPRs"); + if (MFI.hasCalls()) + report_fatal_error("call in graphics shader with too many input SGPRs"); - for (unsigned Reg : AMDGPU::SGPR_32RegClass) { - if (!MRI.isLiveIn(Reg)) { - Info.setStackPtrOffsetReg(Reg); - break; - } + for (unsigned Reg : AMDGPU::SGPR_32RegClass) { + if (!MRI.isLiveIn(Reg)) { + Info.setStackPtrOffsetReg(Reg); + break; } - - if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG) - report_fatal_error("failed to find register for SP"); } - if (MFI.hasCalls()) { - Info.setScratchWaveOffsetReg(AMDGPU::SGPR33); - Info.setFrameOffsetReg(AMDGPU::SGPR33); - } else { - unsigned ReservedOffsetReg = - TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); - Info.setScratchWaveOffsetReg(ReservedOffsetReg); - Info.setFrameOffsetReg(ReservedOffsetReg); - } - } else if (RequiresStackAccess) { - assert(!MFI.hasCalls()); - // We know there are accesses and they will be done relative to SP, so just - // pin it to the input. - // - // FIXME: Should not do this if inline asm is reading/writing these - // registers. - Register PreloadedSP = Info.getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - - Info.setStackPtrOffsetReg(PreloadedSP); - Info.setScratchWaveOffsetReg(PreloadedSP); - Info.setFrameOffsetReg(PreloadedSP); - } else { - assert(!MFI.hasCalls()); + if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG) + report_fatal_error("failed to find register for SP"); + } - // There may not be stack access at all. There may still be spills, or - // access of a constant pointer (in which cases an extra copy will be - // emitted in the prolog). - unsigned ReservedOffsetReg - = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); - Info.setStackPtrOffsetReg(ReservedOffsetReg); - Info.setScratchWaveOffsetReg(ReservedOffsetReg); - Info.setFrameOffsetReg(ReservedOffsetReg); + // hasFP should be accurate for entry functions even before the frame is + // finalized, because it does not rely on the known stack size, only + // properties like whether variable sized objects are present. + if (ST.getFrameLowering()->hasFP(MF)) { + Info.setFrameOffsetReg(AMDGPU::SGPR33); } } @@ -2110,6 +2214,10 @@ SDValue SITargetLowering::LowerFormalArguments( if (IsEntryFunc) { allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); + } else { + // For the fixed ABI, pass workitem IDs in the last argument register. + if (AMDGPUTargetMachine::EnableFixedFunctionABI) + allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } if (IsKernel) { @@ -2126,9 +2234,9 @@ SDValue SITargetLowering::LowerFormalArguments( // // FIXME: Alignment of explicit arguments totally broken with non-0 explicit // kern arg offset. 
- const unsigned KernelArgBaseAlign = 16; + const Align KernelArgBaseAlign = Align(16); - for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { const ISD::InputArg &Arg = Ins[i]; if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) { InVals.push_back(DAG.getUNDEF(Arg.VT)); @@ -2143,10 +2251,11 @@ SDValue SITargetLowering::LowerFormalArguments( EVT MemVT = VA.getLocVT(); const uint64_t Offset = VA.getLocMemOffset(); - unsigned Align = MinAlign(KernelArgBaseAlign, Offset); + Align Alignment = commonAlignment(KernelArgBaseAlign, Offset); - SDValue Arg = lowerKernargMemParameter( - DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]); + SDValue Arg = + lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, Alignment, + Ins[i].Flags.isSExt(), &Ins[i]); Chains.push_back(Arg.getValue(1)); auto *ParamTy = @@ -2221,7 +2330,7 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (!IsEntryFunc) { + if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) { // Special inputs come after user arguments. allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); } @@ -2231,8 +2340,6 @@ SDValue SITargetLowering::LowerFormalArguments( allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); - CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); - CCInfo.AllocateReg(Info->getFrameOffsetReg()); allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } @@ -2442,50 +2549,51 @@ void SITargetLowering::passSpecialInputs( SDValue Chain) const { // If we don't have a call site, this was a call inserted by // legalization. These can never use special inputs. - if (!CLI.CS) + if (!CLI.CB) return; - const Function *CalleeFunc = CLI.CS.getCalledFunction(); - assert(CalleeFunc); - SelectionDAG &DAG = CLI.DAG; const SDLoc &DL = CLI.DL; const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); - - auto &ArgUsageInfo = - DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); - const AMDGPUFunctionArgInfo &CalleeArgInfo - = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); - const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo(); + const AMDGPUFunctionArgInfo *CalleeArgInfo + = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; + if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) { + auto &ArgUsageInfo = + DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); + CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); + } + // TODO: Unify with private memory register handling. This is complicated by // the fact that at least in kernels, the input argument is not necessarily // in the same location as the input. 
AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = { AMDGPUFunctionArgInfo::DISPATCH_PTR, AMDGPUFunctionArgInfo::QUEUE_PTR, - AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR, + AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, AMDGPUFunctionArgInfo::DISPATCH_ID, AMDGPUFunctionArgInfo::WORKGROUP_ID_X, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, - AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, - AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z }; for (auto InputID : InputRegs) { const ArgDescriptor *OutgoingArg; const TargetRegisterClass *ArgRC; + LLT ArgTy; - std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID); + std::tie(OutgoingArg, ArgRC, ArgTy) = + CalleeArgInfo->getPreloadedValue(InputID); if (!OutgoingArg) continue; const ArgDescriptor *IncomingArg; const TargetRegisterClass *IncomingArgRC; - std::tie(IncomingArg, IncomingArgRC) - = CallerArgInfo.getPreloadedValue(InputID); + LLT Ty; + std::tie(IncomingArg, IncomingArgRC, Ty) = + CallerArgInfo.getPreloadedValue(InputID); assert(IncomingArgRC == ArgRC); // All special arguments are ints for now. @@ -2503,8 +2611,11 @@ void SITargetLowering::passSpecialInputs( if (OutgoingArg->isRegister()) { RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); + if (!CCInfo.AllocateReg(OutgoingArg->getRegister())) + report_fatal_error("failed to allocate implicit input argument"); } else { - unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4); + unsigned SpecialArgOffset = + CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4)); SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset); MemOpChains.push_back(ArgStore); @@ -2515,33 +2626,34 @@ void SITargetLowering::passSpecialInputs( // packed. const ArgDescriptor *OutgoingArg; const TargetRegisterClass *ArgRC; + LLT Ty; - std::tie(OutgoingArg, ArgRC) = - CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X); + std::tie(OutgoingArg, ArgRC, Ty) = + CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X); if (!OutgoingArg) - std::tie(OutgoingArg, ArgRC) = - CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y); + std::tie(OutgoingArg, ArgRC, Ty) = + CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y); if (!OutgoingArg) - std::tie(OutgoingArg, ArgRC) = - CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z); + std::tie(OutgoingArg, ArgRC, Ty) = + CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z); if (!OutgoingArg) return; - const ArgDescriptor *IncomingArgX - = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first; - const ArgDescriptor *IncomingArgY - = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first; - const ArgDescriptor *IncomingArgZ - = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first; + const ArgDescriptor *IncomingArgX = std::get<0>( + CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X)); + const ArgDescriptor *IncomingArgY = std::get<0>( + CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y)); + const ArgDescriptor *IncomingArgZ = std::get<0>( + CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z)); SDValue InputReg; SDLoc SL; // If incoming ids are not packed we need to pack them. 
- if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX) + if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX); - if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) { + if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) { SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY); Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y, DAG.getShiftAmountConstant(10, MVT::i32, SL)); @@ -2549,7 +2661,7 @@ void SITargetLowering::passSpecialInputs( DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y; } - if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) { + if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) { SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ); Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z, DAG.getShiftAmountConstant(20, MVT::i32, SL)); @@ -2569,8 +2681,9 @@ void SITargetLowering::passSpecialInputs( if (OutgoingArg->isRegister()) { RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); + CCInfo.AllocateReg(OutgoingArg->getRegister()); } else { - unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4); + unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4)); SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset); MemOpChains.push_back(ArgStore); @@ -2703,10 +2816,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, "unsupported call to variadic function "); } - if (!CLI.CS.getInstruction()) + if (!CLI.CB) report_fatal_error("unsupported libcall legalization"); - if (!CLI.CS.getCalledFunction()) { + if (!AMDGPUTargetMachine::EnableFixedFunctionABI && + !CLI.CB->getCalledFunction()) { return lowerUnhandledCall(CLI, InVals, "unsupported indirect call to function "); } @@ -2726,7 +2840,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, if (IsTailCall) { IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); - if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) { + if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) { report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); } @@ -2743,12 +2857,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallVector<SDValue, 8> MemOpChains; // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); + if (AMDGPUTargetMachine::EnableFixedFunctionABI) { + // With a fixed ABI, allocate fixed registers before user arguments. + passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); + } + CCInfo.AnalyzeCallOperands(Outs, AssignFn); // Get a count of how many bytes are to be pushed on the stack. @@ -2767,7 +2888,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // arguments to begin at SP+0. Completely unused for non-tail calls. int32_t FPDiff = 0; MachineFrameInfo &MFI = MF.getFrameInfo(); - SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass @@ -2784,7 +2904,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getTokenFactor(DL, CopyFromChains); } - SmallVector<SDValue, 8> MemOpChains; MVT PtrVT = MVT::i32; // Walk the register/memloc assignments, inserting copies/loads. @@ -2837,7 +2956,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // FIXME: We can have better than the minimum byval required alignment. Alignment = Flags.isByVal() - ? MaybeAlign(Flags.getByValAlign()) + ? Flags.getNonZeroByValAlign() : commonAlignment(Subtarget->getStackAlignment(), Offset); Offset = Offset + FPDiff; @@ -2864,11 +2983,12 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, if (Outs[i].Flags.isByVal()) { SDValue SizeNode = DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32); - SDValue Cpy = DAG.getMemcpy( - Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), - /*isVol = */ false, /*AlwaysInline = */ true, - /*isTailCall = */ false, DstInfo, - MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS)); + SDValue Cpy = + DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode, + Outs[i].Flags.getNonZeroByValAlign(), + /*isVol = */ false, /*AlwaysInline = */ true, + /*isTailCall = */ false, DstInfo, + MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS)); MemOpChains.push_back(Cpy); } else { @@ -2879,8 +2999,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } } - // Copy special input registers after user input arguments. - passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); + if (!AMDGPUTargetMachine::EnableFixedFunctionABI) { + // Copy special input registers after user input arguments. + passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); + } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); @@ -2927,9 +3049,12 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(Callee); // Add a redundant copy of the callee global which will not be legalized, as // we need direct access to the callee later. - GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee); - const GlobalValue *GV = GSD->getGlobal(); - Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64)); + if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = GSD->getGlobal(); + Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64)); + } else { + Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); + } if (IsTailCall) { // Each tail call may have to adjust the stack by a different amount, so @@ -2985,6 +3110,71 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, IsThisReturn ? OutVals[0] : SDValue()); } +// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC, +// except for applying the wave size scale to the increment amount. +SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl( + SDValue Op, SelectionDAG &DAG) const { + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + SDLoc dl(Op); + EVT VT = Op.getValueType(); + SDValue Tmp1 = Op; + SDValue Tmp2 = Op.getValue(1); + SDValue Tmp3 = Op.getOperand(2); + SDValue Chain = Tmp1.getOperand(0); + + Register SPReg = Info->getStackPtrOffsetReg(); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. 
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); + + SDValue Size = Tmp2.getOperand(1); + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); + Chain = SP.getValue(1); + MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const TargetFrameLowering *TFL = ST.getFrameLowering(); + unsigned Opc = + TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ? + ISD::ADD : ISD::SUB; + + SDValue ScaledSize = DAG.getNode( + ISD::SHL, dl, VT, Size, + DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32)); + + Align StackAlign = TFL->getStackAlign(); + Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value + if (Alignment && *Alignment > StackAlign) { + Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, + DAG.getConstant(-(uint64_t)Alignment->value() + << ST.getWavefrontSizeLog2(), + dl, VT)); + } + + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain + Tmp2 = DAG.getCALLSEQ_END( + Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + + return DAG.getMergeValues({Tmp1, Tmp2}, dl); +} + +SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + // We only handle constant sizes here to allow non-entry block, static sized + // allocas. A truly dynamic value is more difficult to support because we + // don't know if the size value is uniform or not. If the size isn't uniform, + // we would need to do a wave reduction to get the maximum size to know how + // much to increment the uniform stack pointer. + SDValue Size = Op.getOperand(1); + if (isa<ConstantSDNode>(Size)) + return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion. + + return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG); +} + Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = StringSwitch<Register>(RegName) @@ -3310,9 +3500,15 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx, InitResultReg, DstReg, PhiReg, TmpExec, Offset, UseGPRIdxMode, IsIndirectSrc); - - MachineBasicBlock::iterator First = RemainderBB->begin(); - BuildMI(*RemainderBB, First, DL, TII->get(MovExecOpc), Exec) + MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(LoopBB); + ++MBBI; + MF->insert(MBBI, LandingPad); + LoopBB->removeSuccessor(RemainderBB); + LandingPad->addSuccessor(RemainderBB); + LoopBB->addSuccessor(LandingPad); + MachineBasicBlock::iterator First = LandingPad->begin(); + BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec) .addReg(SaveExec); return InsPt; @@ -3331,7 +3527,7 @@ computeIndirectRegAndOffset(const SIRegisterInfo &TRI, if (Offset >= NumElts || Offset < 0) return std::make_pair(AMDGPU::sub0, Offset); - return std::make_pair(AMDGPU::sub0 + Offset, 0); + return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0); } // Return true if the index is an SGPR and was set. 
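(Illustrative note, not part of this change: the stack pointer here is a byte offset into scratch shared by the whole wave, so the per-lane allocation size is scaled by the wavefront size before adjusting SP. On a wave64 subtarget, where getWavefrontSizeLog2() == 6, a static alloca of 16 bytes per lane yields

    ScaledSize = 16 << 6 = 1024 bytes,

and a requested alignment of 8 is applied in the same scaled units, i.e. by masking with -(8 << 6) = -512.)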
@@ -3465,24 +3661,6 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, return LoopBB; } -static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI, - const TargetRegisterClass *VecRC) { - switch (TRI.getRegSizeInBits(*VecRC)) { - case 32: // 4 bytes - return AMDGPU::V_MOVRELD_B32_V1; - case 64: // 8 bytes - return AMDGPU::V_MOVRELD_B32_V2; - case 128: // 16 bytes - return AMDGPU::V_MOVRELD_B32_V4; - case 256: // 32 bytes - return AMDGPU::V_MOVRELD_B32_V8; - case 512: // 64 bytes - return AMDGPU::V_MOVRELD_B32_V16; - default: - llvm_unreachable("unsupported size for MOVRELD pseudos"); - } -} - static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST) { @@ -3522,28 +3700,18 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, return &MBB; } + const MCInstrDesc &MovRelDesc + = TII->getIndirectRegWritePseudo(TRI.getRegSizeInBits(*VecRC), 32, false); + if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) { MachineBasicBlock::iterator I(&MI); const DebugLoc &DL = MI.getDebugLoc(); - - if (UseGPRIdxMode) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect)) - .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst - .add(*Val) - .addReg(Dst, RegState::ImplicitDefine) - .addReg(SrcVec->getReg(), RegState::Implicit) - .addReg(AMDGPU::M0, RegState::Implicit); - + BuildMI(MBB, I, DL, MovRelDesc, Dst) + .addReg(SrcVec->getReg()) + .add(*Val) + .addImm(SubReg); + if (UseGPRIdxMode) BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); - } else { - const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); - - BuildMI(MBB, I, DL, MovRelDesc) - .addReg(Dst, RegState::Define) - .addReg(SrcVec->getReg()) - .add(*Val) - .addImm(SubReg - AMDGPU::sub0); - } MI.eraseFromParent(); return &MBB; @@ -3560,26 +3728,14 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, Offset, UseGPRIdxMode, false); MachineBasicBlock *LoopBB = InsPt->getParent(); - if (UseGPRIdxMode) { - BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect)) - .addReg(PhiReg, RegState::Undef, SubReg) // vdst - .add(*Val) // src0 - .addReg(Dst, RegState::ImplicitDefine) - .addReg(PhiReg, RegState::Implicit) - .addReg(AMDGPU::M0, RegState::Implicit); + BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst) + .addReg(PhiReg) + .add(*Val) + .addImm(AMDGPU::sub0); + if (UseGPRIdxMode) BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); - } else { - const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); - - BuildMI(*LoopBB, InsPt, DL, MovRelDesc) - .addReg(Dst, RegState::Define) - .addReg(PhiReg) - .add(*Val) - .addImm(SubReg - AMDGPU::sub0); - } MI.eraseFromParent(); - return LoopBB; } @@ -3590,17 +3746,27 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineFunction *MF = BB->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - if (TII->isMIMG(MI)) { - if (MI.memoperands_empty() && MI.mayLoadOrStore()) { - report_fatal_error("missing mem operand from MIMG instruction"); - } - // Add a memoperand for mimg instructions so that they aren't assumed to - // be ordered memory instuctions. 
+ switch (MI.getOpcode()) { + case AMDGPU::S_UADDO_PSEUDO: + case AMDGPU::S_USUBO_PSEUDO: { + const DebugLoc &DL = MI.getDebugLoc(); + MachineOperand &Dest0 = MI.getOperand(0); + MachineOperand &Dest1 = MI.getOperand(1); + MachineOperand &Src0 = MI.getOperand(2); + MachineOperand &Src1 = MI.getOperand(3); + + unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO) + ? AMDGPU::S_ADD_I32 + : AMDGPU::S_SUB_I32; + BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1); + + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg()) + .addImm(1) + .addImm(0); + MI.eraseFromParent(); return BB; } - - switch (MI.getOpcode()) { case AMDGPU::S_ADD_U64_PSEUDO: case AMDGPU::S_SUB_U64_PSEUDO: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); @@ -3616,35 +3782,150 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, - Src0, BoolRC, AMDGPU::sub0, - &AMDGPU::SReg_32RegClass); - MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, - Src0, BoolRC, AMDGPU::sub1, - &AMDGPU::SReg_32RegClass); + MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); + MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); - MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, - Src1, BoolRC, AMDGPU::sub0, - &AMDGPU::SReg_32RegClass); - MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, - Src1, BoolRC, AMDGPU::sub1, - &AMDGPU::SReg_32RegClass); + MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); + MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; unsigned HiOpc = IsAdd ? 
AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) - .add(Src0Sub0) - .add(Src1Sub0); - BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) - .add(Src0Sub1) - .add(Src1Sub1); + BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0); + BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1); BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::V_ADD_U64_PSEUDO: + case AMDGPU::V_SUB_U64_PSEUDO: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + + bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO); + + const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + Register CarryReg = MRI.createVirtualRegister(CarryRC); + Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); + + MachineOperand &Dest = MI.getOperand(0); + MachineOperand &Src0 = MI.getOperand(1); + MachineOperand &Src1 = MI.getOperand(2); + + const TargetRegisterClass *Src0RC = Src0.isReg() + ? MRI.getRegClass(Src0.getReg()) + : &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *Src1RC = Src1.isReg() + ? MRI.getRegClass(Src1.getReg()) + : &AMDGPU::VReg_64RegClass; + + const TargetRegisterClass *Src0SubRC = + TRI->getSubRegClass(Src0RC, AMDGPU::sub0); + const TargetRegisterClass *Src1SubRC = + TRI->getSubRegClass(Src1RC, AMDGPU::sub1); + + MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); + MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); + + MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); + MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); + + unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; + MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) + .addReg(CarryReg, RegState::Define) + .add(SrcReg0Sub0) + .add(SrcReg1Sub0) + .addImm(0); // clamp bit + + unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; + MachineInstr *HiHalf = + BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) + .addReg(DeadCarryReg, RegState::Define | RegState::Dead) + .add(SrcReg0Sub1) + .add(SrcReg1Sub1) + .addReg(CarryReg, RegState::Kill) + .addImm(0); // clamp bit + + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + TII->legalizeOperands(*LoHalf); + TII->legalizeOperands(*HiHalf); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::S_ADD_CO_PSEUDO: + case AMDGPU::S_SUB_CO_PSEUDO: { + // This pseudo has a chance to be selected + // only from uniform add/subcarry node. All the VGPR operands + // therefore assumed to be splat vectors. 
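// (Illustrative note, not part of this change: because the operands of these
// pseudos are uniform, a VGPR operand holds the same value in every lane, so
// the expansion below may copy lane 0 into an SGPR with V_READFIRSTLANE_B32
// and then use the scalar S_ADDC_U32 / S_SUBB_U32 forms, which take their
// carry-in from SCC as set by the preceding compare of the carry operand.)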
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + MachineBasicBlock::iterator MII = MI; + const DebugLoc &DL = MI.getDebugLoc(); + MachineOperand &Dest = MI.getOperand(0); + MachineOperand &CarryDest = MI.getOperand(1); + MachineOperand &Src0 = MI.getOperand(2); + MachineOperand &Src1 = MI.getOperand(3); + MachineOperand &Src2 = MI.getOperand(4); + unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) + ? AMDGPU::S_ADDC_U32 + : AMDGPU::S_SUBB_U32; + if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) { + Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0) + .addReg(Src0.getReg()); + Src0.setReg(RegOp0); + } + if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) { + Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1) + .addReg(Src1.getReg()); + Src1.setReg(RegOp1); + } + Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + if (TRI->isVectorRegister(MRI, Src2.getReg())) { + BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2) + .addReg(Src2.getReg()); + Src2.setReg(RegOp2); + } + + if (TRI->getRegSizeInBits(*MRI.getRegClass(Src2.getReg())) == 64) { + BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64)) + .addReg(Src2.getReg()) + .addImm(0); + } else { + BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMPK_LG_U32)) + .addReg(Src2.getReg()) + .addImm(0); + } + + BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1); + + BuildMI(*BB, MII, DL, TII->get(AMDGPU::COPY), CarryDest.getReg()) + .addReg(AMDGPU::SCC); MI.eraseFromParent(); return BB; } @@ -3741,12 +4022,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::SI_INDIRECT_SRC_V4: case AMDGPU::SI_INDIRECT_SRC_V8: case AMDGPU::SI_INDIRECT_SRC_V16: + case AMDGPU::SI_INDIRECT_SRC_V32: return emitIndirectSrc(MI, *BB, *getSubtarget()); case AMDGPU::SI_INDIRECT_DST_V1: case AMDGPU::SI_INDIRECT_DST_V2: case AMDGPU::SI_INDIRECT_DST_V4: case AMDGPU::SI_INDIRECT_DST_V8: case AMDGPU::SI_INDIRECT_DST_V16: + case AMDGPU::SI_INDIRECT_DST_V32: return emitIndirectDst(MI, *BB, *getSubtarget()); case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: case AMDGPU::SI_KILL_I1_PSEUDO: @@ -3870,6 +4153,75 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( } return emitGWSMemViolTestLoop(MI, BB); + case AMDGPU::S_SETREG_B32: { + if (!getSubtarget()->hasDenormModeInst()) + return BB; + + // Try to optimize cases that only set the denormal mode or rounding mode. + // + // If the s_setreg_b32 fully sets all of the bits in the rounding mode or + // denormal mode to a constant, we can use s_round_mode or s_denorm_mode + // instead. + // + // FIXME: This could be predicates on the immediate, but tablegen doesn't + // allow you to have a no side effect instruction in the output of a + // sideeffecting pattern. + + // TODO: Should also emit a no side effects pseudo if only FP bits are + // touched, even if not all of them or to a variable. 
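// (Illustrative note, not part of this change: in hwreg(HW_REG_MODE, offset,
// width) terms, the cases handled below are
//   hwreg(HW_REG_MODE, 0, 4) - round mode only  -> s_round_mode
//   hwreg(HW_REG_MODE, 4, 4) - denorm mode only -> s_denorm_mode
//   hwreg(HW_REG_MODE, 0, 8) - both nibbles     -> s_round_mode + s_denorm_mode
// and the rewrite only fires when the value written is a known immediate.)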
+ unsigned ID, Offset, Width; + AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width); + if (ID != AMDGPU::Hwreg::ID_MODE) + return BB; + + const unsigned WidthMask = maskTrailingOnes<unsigned>(Width); + const unsigned SetMask = WidthMask << Offset; + unsigned SetDenormOp = 0; + unsigned SetRoundOp = 0; + + // The dedicated instructions can only set the whole denorm or round mode at + // once, not a subset of bits in either. + if (Width == 8 && (SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK | + AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) { + // If this fully sets both the round and denorm mode, emit the two + // dedicated instructions for these. + assert(Offset == 0); + SetRoundOp = AMDGPU::S_ROUND_MODE; + SetDenormOp = AMDGPU::S_DENORM_MODE; + } else if (Width == 4) { + if ((SetMask & AMDGPU::Hwreg::FP_ROUND_MASK) == SetMask) { + SetRoundOp = AMDGPU::S_ROUND_MODE; + assert(Offset == 0); + } else if ((SetMask & AMDGPU::Hwreg::FP_DENORM_MASK) == SetMask) { + SetDenormOp = AMDGPU::S_DENORM_MODE; + assert(Offset == 4); + } + } + + if (SetRoundOp || SetDenormOp) { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg()); + if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) { + unsigned ImmVal = Def->getOperand(1).getImm(); + if (SetRoundOp) { + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp)) + .addImm(ImmVal & 0xf); + + // If we also have the denorm mode, get just the denorm mode bits. + ImmVal >>= 4; + } + + if (SetDenormOp) { + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp)) + .addImm(ImmVal & 0xf); + } + + MI.eraseFromParent(); + } + } + + return BB; + } default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } @@ -3925,10 +4277,13 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: { - // This is as fast on some subtargets. However, we always have full rate f32 - // mad available which returns the same result as the separate operations - // which we should prefer over fma. We can't use this if we want to support - // denormals, so only report this in these cases. + // If mad is not available this depends only on if f32 fma is full rate. + if (!Subtarget->hasMadMacF32Insts()) + return Subtarget->hasFastFMAF32(); + + // Otherwise f32 mad is always full rate and returns the same result as + // the separate operations so should be preferred over fma. + // However does not support denomals. if (hasFP32Denormals(MF)) return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); @@ -3946,13 +4301,14 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, return false; } -bool SITargetLowering::isFMADLegalForFAddFSub(const SelectionDAG &DAG, - const SDNode *N) const { +bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG, + const SDNode *N) const { // TODO: Check future ftz flag // v_mad_f32/v_mac_f32 do not support denormals. 
EVT VT = N->getValueType(0); if (VT == MVT::f32) - return !hasFP32Denormals(DAG.getMachineFunction()); + return Subtarget->hasMadMacF32Insts() && + !hasFP32Denormals(DAG.getMachineFunction()); if (VT == MVT::f16) { return Subtarget->hasMadF16() && !hasFP64FP16Denormals(DAG.getMachineFunction()); @@ -3971,7 +4327,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4f16); + assert(VT == MVT::v4f16 || VT == MVT::v4i16); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4080,6 +4436,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FABS: case ISD::FNEG: case ISD::FCANONICALIZE: + case ISD::BSWAP: return splitUnaryVectorOp(Op, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: @@ -4101,6 +4458,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: return splitBinaryVectorOp(Op, DAG); + case ISD::SMULO: + case ISD::UMULO: + return lowerXMULO(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: + return LowerDYNAMIC_STACKALLOC(Op, DAG); } return SDValue(); } @@ -4204,9 +4566,8 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); const auto *CD = cast<ConstantSDNode>(N->getOperand(3)); - int CondCode = CD->getSExtValue(); - if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE || - CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE) + unsigned CondCode = CD->getZExtValue(); + if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode))) return DAG.getUNDEF(VT); ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); @@ -4241,11 +4602,9 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, EVT VT = N->getValueType(0); const auto *CD = cast<ConstantSDNode>(N->getOperand(3)); - int CondCode = CD->getSExtValue(); - if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE || - CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) { + unsigned CondCode = CD->getZExtValue(); + if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode))) return DAG.getUNDEF(VT); - } SDValue Src0 = N->getOperand(1); SDValue Src1 = N->getOperand(2); @@ -4268,6 +4627,43 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, return DAG.getZExtOrTrunc(SetCC, SL, VT); } +static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, + SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + SDValue Src = N->getOperand(1); + SDLoc SL(N); + + if (Src.getOpcode() == ISD::SETCC) { + // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...) 
+ return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0), + Src.getOperand(1), Src.getOperand(2)); + } + if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) { + // (ballot 0) -> 0 + if (Arg->isNullValue()) + return DAG.getConstant(0, SL, VT); + + // (ballot 1) -> EXEC/EXEC_LO + if (Arg->isOne()) { + Register Exec; + if (VT.getScalarSizeInBits() == 32) + Exec = AMDGPU::EXEC_LO; + else if (VT.getScalarSizeInBits() == 64) + Exec = AMDGPU::EXEC; + else + return SDValue(); + + return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT); + } + } + + // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0) + // ISD::SETNE) + return DAG.getNode( + AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32), + DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE)); +} + void SITargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { @@ -4440,9 +4836,7 @@ bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { // FIXME: Either avoid relying on address space here or change the default // address space for functions to avoid the explicit check. return (GV->getValueType()->isFunctionTy() || - GV->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || - GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || - GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && + !isNonGlobalAddrSpace(GV->getAddressSpace())) && !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); } @@ -4451,6 +4845,14 @@ bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const { return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV); } +bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const { + if (!GV->hasExternalLinkage()) + return true; + + const auto OS = getTargetMachine().getTargetTriple().getOS(); + return OS == Triple::AMDHSA || OS == Triple::AMDPAL; +} + /// This transforms the control flow intrinsics to get the branch destination as /// last parameter, also switches branch target with BR if the need arise SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, @@ -4470,16 +4872,10 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, } else { // Get the target from BR if we don't negate the condition BR = findUser(BRCOND, ISD::BR); + assert(BR && "brcond missing unconditional branch user"); Target = BR->getOperand(1); } - // FIXME: This changes the types of the intrinsics instead of introducing new - // nodes with the correct types. - // e.g. llvm.amdgcn.loop - - // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3 - // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088> - unsigned CFNode = isCFIntrinsic(Intr); if (CFNode == 0) { // This is a uniform branch so we don't need to legalize. @@ -4524,7 +4920,6 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, }; SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); - BR = NewBR.getNode(); } SDValue Chain = SDValue(Result, Result->getNumValues() - 1); @@ -4577,13 +4972,14 @@ SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); } -SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG, +SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, EVT VT) const { return Op.getValueType().bitsLE(VT) ? 
DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) : - DAG.getNode(ISD::FTRUNC, DL, VT, Op); + DAG.getNode(ISD::FP_ROUND, DL, VT, Op, + DAG.getTargetConstant(0, DL, MVT::i32)); } SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { @@ -4609,7 +5005,7 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); bool IsIEEEMode = Info->getMode().IEEE; - // FIXME: Assert during eslection that this is only selected for + // FIXME: Assert during selection that this is only selected for // ieee_mode. Currently a combine can produce the ieee version for non-ieee // mode functions, but this happens to be OK since it's only done in cases // where there is known no sNaN. @@ -4621,6 +5017,42 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, return Op; } +SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + bool isSigned = Op.getOpcode() == ISD::SMULO; + + if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) { + const APInt &C = RHSC->getAPIntValue(); + // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X } + if (C.isPowerOf2()) { + // smulo(x, signed_min) is same as umulo(x, signed_min). + bool UseArithShift = isSigned && !C.isMinSignedValue(); + SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32); + SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt); + SDValue Overflow = DAG.getSetCC(SL, MVT::i1, + DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, + SL, VT, Result, ShiftAmt), + LHS, ISD::SETNE); + return DAG.getMergeValues({ Result, Overflow }, SL); + } + } + + SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS); + SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, + SL, VT, LHS, RHS); + + SDValue Sign = isSigned + ? DAG.getNode(ISD::SRA, SL, VT, Result, + DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32)) + : DAG.getConstant(0, SL, VT); + SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE); + + return DAG.getMergeValues({ Result, Overflow }, SL); +} + SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Chain = Op.getOperand(0); @@ -4694,7 +5126,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - unsigned UserSGPR = Info->getQueuePtrUserSGPR(); + Register UserSGPR = Info->getQueuePtrUserSGPR(); assert(UserSGPR != AMDGPU::NoRegister); SDValue QueuePtr = CreateLiveInRegister( @@ -4765,6 +5197,10 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, } } + if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT && + Src.getValueType() == MVT::i64) + return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); + // global <-> flat are no-ops and never emitted. 
const MachineFunction &MF = DAG.getMachineFunction(); @@ -5036,8 +5472,9 @@ SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, - const SDLoc &DL, unsigned Offset, EVT PtrVT, + const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags = SIInstrInfo::MO_NONE) { + assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is // lowered to the following code sequence: // @@ -5086,9 +5523,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GSD->getGlobal(); if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && - (!GV->hasExternalLinkage() || - getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || - getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) || + shouldUseLDSConstAddress(GV)) || GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS || GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); @@ -5114,11 +5549,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); const DataLayout &DataLayout = DAG.getDataLayout(); - unsigned Align = DataLayout.getABITypeAlignment(PtrTy); + Align Alignment = DataLayout.getABITypeAlign(PtrTy); MachinePointerInfo PtrInfo = MachinePointerInfo::getGOT(DAG.getMachineFunction()); - return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align, + return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment, MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); } @@ -5144,8 +5579,8 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, MVT VT, unsigned Offset) const { SDLoc SL(Op); - SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL, - DAG.getEntryNode(), Offset, 4, false); + SDValue Param = lowerKernargMemParameter( + DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false); // The local size values will have the hi 16-bits as zero. return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, DAG.getValueType(VT)); @@ -5181,6 +5616,9 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, } else if (Elts.size() == 2) { Type = MVT::v2f32; NumElts = 2; + } else if (Elts.size() == 3) { + Type = MVT::v3f32; + NumElts = 3; } else if (Elts.size() <= 4) { Type = MVT::v4f32; NumElts = 4; @@ -5230,6 +5668,24 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, return Value == 0; } +static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, + SDValue Src, int ExtraElts) { + EVT SrcVT = Src.getValueType(); + + SmallVector<SDValue, 8> Elts; + + if (SrcVT.isVector()) + DAG.ExtractVectorElements(Src, Elts); + else + Elts.push_back(Src); + + SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType()); + while (ExtraElts--) + Elts.push_back(Undef); + + return DAG.getBuildVector(CastVT, DL, Elts); +} + // Re-construct the required return value for a image load intrinsic. 
// This is more complicated due to the optional use TexFailCtrl which means the required // return type is an aggregate @@ -5241,76 +5697,56 @@ static SDValue constructRetValue(SelectionDAG &DAG, const SDLoc &DL, LLVMContext &Context) { // Determine the required return type. This is the same regardless of IsTexFail flag EVT ReqRetVT = ResultTypes[0]; - EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT; int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; - EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT; - EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts) - : AdjEltVT - : ReqRetVT; - - // Extract data part of the result - // Bitcast the result to the same type as the required return type - int NumElts; - if (IsD16 && !Unpacked) - NumElts = NumVDataDwords << 1; - else - NumElts = NumVDataDwords; + int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ? + ReqRetNumElts : (ReqRetNumElts + 1) / 2; - EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts) - : AdjEltVT; + int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ? + DMaskPop : (DMaskPop + 1) / 2; - // Special case for v6f16. Rather than add support for this, use v3i32 to - // extract the data elements - bool V6F16Special = false; - if (NumElts == 6) { - CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2); - DMaskPop >>= 1; - ReqRetNumElts >>= 1; - V6F16Special = true; - AdjVT = MVT::v2i32; - } + MVT DataDwordVT = NumDataDwords == 1 ? + MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords); - SDValue N = SDValue(Result, 0); - SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N); + MVT MaskPopVT = MaskPopDwords == 1 ? + MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords); - // Iterate over the result - SmallVector<SDValue, 4> BVElts; + SDValue Data(Result, 0); + SDValue TexFail; - if (CastVT.isVector()) { - DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop); - } else { - BVElts.push_back(CastRes); - } - int ExtraElts = ReqRetNumElts - DMaskPop; - while(ExtraElts--) - BVElts.push_back(DAG.getUNDEF(AdjEltVT)); + if (IsTexFail) { + SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32); + if (MaskPopVT.isVector()) { + Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT, + SDValue(Result, 0), ZeroIdx); + } else { + Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT, + SDValue(Result, 0), ZeroIdx); + } - SDValue PreTFCRes; - if (ReqRetNumElts > 1) { - SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts); - if (IsD16 && Unpacked) - PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked); - else - PreTFCRes = NewVec; - } else { - PreTFCRes = BVElts[0]; + TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + SDValue(Result, 0), + DAG.getConstant(MaskPopDwords, DL, MVT::i32)); } - if (V6F16Special) - PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes); + if (DataDwordVT.isVector()) + Data = padEltsToUndef(DAG, DL, DataDwordVT, Data, + NumDataDwords - MaskPopDwords); - if (!IsTexFail) { - if (Result->getNumValues() > 1) - return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL); - else - return PreTFCRes; - } + if (IsD16) + Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked); + + if (!ReqRetVT.isVector()) + Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data); + + Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data); - // Extract the TexFail result and insert into aggregate return - SmallVector<SDValue, 1> TFCElt; 
- DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1); - SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]); - return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL); + if (TexFail) + return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL); + + if (Result->getNumValues() == 1) + return Data; + + return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL); } static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, @@ -5331,6 +5767,35 @@ static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, return Value == 0; } +static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op, + MVT PackVectorVT, + SmallVectorImpl<SDValue> &PackedAddrs, + unsigned DimIdx, unsigned EndIdx, + unsigned NumGradients) { + SDLoc DL(Op); + for (unsigned I = DimIdx; I < EndIdx; I++) { + SDValue Addr = Op.getOperand(I); + + // Gradients are packed with undef for each coordinate. + // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this: + // 1D: undef,dx/dh; undef,dx/dv + // 2D: dy/dh,dx/dh; dy/dv,dx/dv + // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv + if (((I + 1) >= EndIdx) || + ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 || + I == DimIdx + NumGradients - 1))) { + if (Addr.getValueType() != MVT::i16) + Addr = DAG.getBitcast(MVT::i16, Addr); + Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr); + } else { + Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)}); + I++; + } + Addr = DAG.getBitcast(MVT::f32, Addr); + PackedAddrs.push_back(Addr); + } +} + SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, SelectionDAG &DAG) const { @@ -5350,6 +5815,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end()); SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end()); bool IsD16 = false; + bool IsG16 = false; bool IsA16 = false; SDValue VData; int NumVDataDwords; @@ -5456,41 +5922,67 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } - // Check for 16 bit addresses and pack if true. + // Push back extra arguments. + for (unsigned I = 0; I < BaseOpcode->NumExtraArgs; I++) + VAddrs.push_back(Op.getOperand(AddrIdx + I)); + + // Check for 16 bit addresses or derivatives and pack if true. unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; + unsigned CoordIdx = DimIdx + NumGradients; + unsigned CoordsEnd = AddrIdx + NumMIVAddrs; + MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType(); - const MVT VAddrScalarVT = VAddrVT.getScalarType(); - if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) && - ST->hasFeature(AMDGPU::FeatureR128A16)) { - IsA16 = true; - const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; - for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) { - SDValue AddrLo, AddrHi; - // Push back extra arguments. - if (i < DimIdx) { - AddrLo = Op.getOperand(i); - } else { - AddrLo = Op.getOperand(i); - // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, - // in 1D, derivatives dx/dh and dx/dv are packed with undef. 
- if (((i + 1) >= (AddrIdx + NumMIVAddrs)) || - ((NumGradients / 2) % 2 == 1 && - (i == DimIdx + (NumGradients / 2) - 1 || - i == DimIdx + NumGradients - 1))) { - AddrHi = DAG.getUNDEF(MVT::f16); - } else { - AddrHi = Op.getOperand(i + 1); - i++; - } - AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT, - {AddrLo, AddrHi}); - AddrLo = DAG.getBitcast(MVT::i32, AddrLo); + MVT VAddrScalarVT = VAddrVT.getScalarType(); + MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; + IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; + + VAddrVT = Op.getOperand(CoordIdx).getSimpleValueType(); + VAddrScalarVT = VAddrVT.getScalarType(); + IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; + if (IsA16 || IsG16) { + if (IsA16) { + if (!ST->hasA16()) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " + "support 16 bit addresses\n"); + return Op; + } + if (!IsG16) { + LLVM_DEBUG( + dbgs() << "Failed to lower image intrinsic: 16 bit addresses " + "need 16 bit derivatives but got 32 bit derivatives\n"); + return Op; } - VAddrs.push_back(AddrLo); + } else if (!ST->hasG16()) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " + "support 16 bit derivatives\n"); + return Op; + } + + if (BaseOpcode->Gradients && !IsA16) { + if (!ST->hasG16()) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " + "support 16 bit derivatives\n"); + return Op; + } + // Activate g16 + const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = + AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); + IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 + } + + // Don't compress addresses for G16 + const int PackEndIdx = IsA16 ? CoordsEnd : CoordIdx; + packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs, DimIdx, + PackEndIdx, NumGradients); + + if (!IsA16) { + // Add uncompressed address + for (unsigned I = CoordIdx; I < CoordsEnd; I++) + VAddrs.push_back(Op.getOperand(I)); } } else { - for (unsigned i = 0; i < NumMIVAddrs; ++i) - VAddrs.push_back(Op.getOperand(AddrIdx + i)); + for (unsigned I = DimIdx; I < CoordsEnd; I++) + VAddrs.push_back(Op.getOperand(I)); } // If the register allocator cannot place the address registers contiguously @@ -5557,8 +6049,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } EVT NewVT = NumVDataDwords > 1 ? - EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords) - : MVT::f32; + EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords) + : MVT::i32; ResultTypes[0] = NewVT; if (ResultTypes.size() == 3) { @@ -5603,10 +6095,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op, Ops.push_back(DLC); Ops.push_back(GLC); Ops.push_back(SLC); - Ops.push_back(IsA16 && // a16 or r128 + Ops.push_back(IsA16 && // r128, a16 for gfx9 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); - Ops.push_back(TFE); // tfe - Ops.push_back(LWE); // lwe + if (IsGFX10) + Ops.push_back(IsA16 ? True : False); + Ops.push_back(TFE); + Ops.push_back(LWE); if (!IsGFX10) Ops.push_back(DimInfo->DA ? 
True : False); if (BaseOpcode->HasD16) @@ -5655,26 +6149,25 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, - SDValue Offset, SDValue GLC, SDValue DLC, + SDValue Offset, SDValue CachePolicy, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const DataLayout &DataLayout = DAG.getDataLayout(); - unsigned Align = - DataLayout.getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); + Align Alignment = + DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext())); MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo(), MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, - VT.getStoreSize(), Align); + VT.getStoreSize(), Alignment); if (!Offset->isDivergent()) { SDValue Ops[] = { Rsrc, Offset, // Offset - GLC, - DLC, + CachePolicy }; // Widen vec3 load to vec4. @@ -5684,9 +6177,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, auto WidenedOp = DAG.getMemIntrinsicNode( AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT, MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize())); - auto Subvector = DAG.getNode( - ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp, - DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout()))); + auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp, + DAG.getVectorIdxConstant(0, DL)); return Subvector; } @@ -5705,11 +6197,10 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, if (NumElts == 8 || NumElts == 16) { NumLoads = NumElts / 4; - LoadVT = MVT::v4i32; + LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4); } SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue}); - unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue(); SDValue Ops[] = { DAG.getEntryNode(), // Chain Rsrc, // rsrc @@ -5717,13 +6208,14 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, {}, // voffset {}, // soffset {}, // offset - DAG.getTargetConstant(CachePolicy, DL, MVT::i32), // cachepolicy + CachePolicy, // cachepolicy DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; // Use the alignment to ensure that the required offsets will fit into the // immediate offsets. - setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4); + setBufferOffsets(Offset, DAG, &Ops[3], + NumLoads > 1 ? Align(16 * NumLoads) : Align(4)); uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue(); for (unsigned i = 0; i < NumLoads; ++i) { @@ -5732,7 +6224,7 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, LoadVT, MMO, DAG)); } - if (VT == MVT::v8i32 || VT == MVT::v16i32) + if (NumElts == 8 || NumElts == 16) return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads); return Loads[0]; @@ -5777,6 +6269,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); } case Intrinsic::amdgcn_kernarg_segment_ptr: { + if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) { + // This only makes sense to call in a kernel, so just lower to null. 
+ return DAG.getConstant(0, DL, VT); + } + return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); } @@ -5790,8 +6287,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_rsq_legacy: if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return emitRemovedIntrinsicError(DAG, DL, VT); - - return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + return SDValue(); case Intrinsic::amdgcn_rcp_legacy: if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return emitRemovedIntrinsicError(DAG, DL, VT); @@ -5815,37 +6311,43 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_X, 4, false); + SI::KernelInputOffsets::NGROUPS_X, Align(4), + false); case Intrinsic::r600_read_ngroups_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Y, 4, false); + SI::KernelInputOffsets::NGROUPS_Y, Align(4), + false); case Intrinsic::r600_read_ngroups_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Z, 4, false); + SI::KernelInputOffsets::NGROUPS_Z, Align(4), + false); case Intrinsic::r600_read_global_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false); + SI::KernelInputOffsets::GLOBAL_SIZE_X, + Align(4), false); case Intrinsic::r600_read_global_size_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false); + SI::KernelInputOffsets::GLOBAL_SIZE_Y, + Align(4), false); case Intrinsic::r600_read_global_size_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false); + SI::KernelInputOffsets::GLOBAL_SIZE_Z, + Align(4), false); case Intrinsic::r600_read_local_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); @@ -5865,29 +6367,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::amdgcn_workgroup_id_x: - case Intrinsic::r600_read_tgid_x: return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_X); case Intrinsic::amdgcn_workgroup_id_y: - case Intrinsic::r600_read_tgid_y: return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); case Intrinsic::amdgcn_workgroup_id_z: - case Intrinsic::r600_read_tgid_z: return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); case Intrinsic::amdgcn_workitem_id_x: - case Intrinsic::r600_read_tidig_x: return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDX); case Intrinsic::amdgcn_workitem_id_y: - case Intrinsic::r600_read_tidig_y: return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDY); case 
Intrinsic::amdgcn_workitem_id_z: - case Intrinsic::r600_read_tidig_z: return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDZ); @@ -5901,53 +6397,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr, IsGFX10 ? &DLC : nullptr)) return Op; - return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), GLC, DLC, + return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), DAG); } case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); - case Intrinsic::amdgcn_interp_p1_f16: { - SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0, - Op.getOperand(5), SDValue()); - if (getSubtarget()->getLDSBankCount() == 16) { - // 16 bank LDS - - // FIXME: This implicitly will insert a second CopyToReg to M0. - SDValue S = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32, - DAG.getTargetConstant(Intrinsic::amdgcn_interp_mov, DL, MVT::i32), - DAG.getConstant(2, DL, MVT::i32), // P0 - Op.getOperand(2), // Attrchan - Op.getOperand(3), // Attr - Op.getOperand(5)); // m0 - - SDValue Ops[] = { - Op.getOperand(1), // Src0 - Op.getOperand(2), // Attrchan - Op.getOperand(3), // Attr - DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers - S, // Src2 - holds two f16 values selected by high - DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers - Op.getOperand(4), // high - DAG.getTargetConstant(0, DL, MVT::i1), // $clamp - DAG.getTargetConstant(0, DL, MVT::i32) // $omod - }; - return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops); - } else { - // 32 bank LDS - SDValue Ops[] = { - Op.getOperand(1), // Src0 - Op.getOperand(2), // Attrchan - Op.getOperand(3), // Attr - DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers - Op.getOperand(4), // high - DAG.getTargetConstant(0, DL, MVT::i1), // $clamp - DAG.getTargetConstant(0, DL, MVT::i32), // $omod - ToM0.getValue(1) - }; - return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops); - } - } case Intrinsic::amdgcn_sin: return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); @@ -5988,9 +6442,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::amdgcn_trig_preop: - return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::amdgcn_div_scale: { const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3)); @@ -6020,6 +6471,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_fcmp: { return lowerFCMPIntrinsic(*this, Op.getNode(), DAG); } + case Intrinsic::amdgcn_ballot: + return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG); case Intrinsic::amdgcn_fmed3: return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -6098,6 +6551,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(1, SL, MVT::i32)); return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ); } + case Intrinsic::amdgcn_alignbit: + return DAG.getNode(ISD::FSHR, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_reloc_constant: { + Module *M = const_cast<Module *>(MF.getFunction().getParent()); + const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); + auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); + 
auto RelocSymbol = cast<GlobalVariable>( + M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext()))); + SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0, + SIInstrInfo::MO_ABS32_LO); + return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -6131,6 +6597,28 @@ static unsigned getBufferOffsetForMMO(SDValue VOffset, cast<ConstantSDNode>(Offset)->getSExtValue(); } +static unsigned getDSShaderTypeValue(const MachineFunction &MF) { + switch (MF.getFunction().getCallingConv()) { + case CallingConv::AMDGPU_PS: + return 1; + case CallingConv::AMDGPU_VS: + return 2; + case CallingConv::AMDGPU_GS: + return 3; + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_LS: + case CallingConv::AMDGPU_ES: + report_fatal_error("ds_ordered_count unsupported for this calling conv"); + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_KERNEL: + case CallingConv::C: + case CallingConv::Fast: + default: + // Assume other calling conventions are various compute callable functions + return 0; + } +} + SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); @@ -6146,8 +6634,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, unsigned IndexOperand = M->getConstantOperandVal(7); unsigned WaveRelease = M->getConstantOperandVal(8); unsigned WaveDone = M->getConstantOperandVal(9); - unsigned ShaderType; - unsigned Instruction; unsigned OrderedCountIndex = IndexOperand & 0x3f; IndexOperand &= ~0x3f; @@ -6166,36 +6652,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (IndexOperand) report_fatal_error("ds_ordered_count: bad index operand"); - switch (IntrID) { - case Intrinsic::amdgcn_ds_ordered_add: - Instruction = 0; - break; - case Intrinsic::amdgcn_ds_ordered_swap: - Instruction = 1; - break; - } - if (WaveDone && !WaveRelease) report_fatal_error("ds_ordered_count: wave_done requires wave_release"); - switch (DAG.getMachineFunction().getFunction().getCallingConv()) { - case CallingConv::AMDGPU_CS: - case CallingConv::AMDGPU_KERNEL: - ShaderType = 0; - break; - case CallingConv::AMDGPU_PS: - ShaderType = 1; - break; - case CallingConv::AMDGPU_VS: - ShaderType = 2; - break; - case CallingConv::AMDGPU_GS: - ShaderType = 3; - break; - default: - report_fatal_error("ds_ordered_count unsupported for this calling conv"); - } - + unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 
0 : 1; + unsigned ShaderType = getDSShaderTypeValue(DAG.getMachineFunction()); unsigned Offset0 = OrderedCountIndex << 2; unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | (Instruction << 4); @@ -6425,6 +6886,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_swap: case Intrinsic::amdgcn_buffer_atomic_add: case Intrinsic::amdgcn_buffer_atomic_sub: + case Intrinsic::amdgcn_buffer_atomic_csub: case Intrinsic::amdgcn_buffer_atomic_smin: case Intrinsic::amdgcn_buffer_atomic_umin: case Intrinsic::amdgcn_buffer_atomic_smax: @@ -6467,6 +6929,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_sub: Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB; break; + case Intrinsic::amdgcn_buffer_atomic_csub: + Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB; + break; case Intrinsic::amdgcn_buffer_atomic_smin: Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN; break; @@ -6715,6 +7180,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } + case Intrinsic::amdgcn_global_atomic_csub: { + MemSDNode *M = cast<MemSDNode>(Op); + SDValue Ops[] = { + M->getOperand(0), // Chain + M->getOperand(2), // Ptr + M->getOperand(3) // Value + }; + + return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_LOAD_CSUB, SDLoc(Op), + M->getVTList(), Ops, M->getMemoryVT(), + M->getMemOperand()); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = @@ -6750,9 +7227,8 @@ SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops, WidenedMemVT, MMO); if (WidenedVT != VT) { - auto Extract = DAG.getNode( - ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp, - DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout()))); + auto Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp, + DAG.getVectorIdxConstant(0, DL)); NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL); } return NewOp; @@ -6792,52 +7268,29 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); switch (IntrinsicID) { - case Intrinsic::amdgcn_exp: { - const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2)); - const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3)); - const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8)); - const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9)); - - const SDValue Ops[] = { - Chain, - DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt - DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en - Op.getOperand(4), // src0 - Op.getOperand(5), // src1 - Op.getOperand(6), // src2 - Op.getOperand(7), // src3 - DAG.getTargetConstant(0, DL, MVT::i1), // compr - DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1) - }; - - unsigned Opc = Done->isNullValue() ? - AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; - return DAG.getNode(Opc, DL, Op->getVTList(), Ops); - } case Intrinsic::amdgcn_exp_compr: { - const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2)); - const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3)); SDValue Src0 = Op.getOperand(4); SDValue Src1 = Op.getOperand(5); - const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6)); - const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7)); + // Hack around illegal type on SI by directly selecting it. 
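// --- Editorial illustration, not part of this patch: the ds_ordered_count
// lowering above packs its control bits into two small offset fields. The same
// encoding written as plain arithmetic (field layout taken from the code
// above; names are hypothetical):
struct DSOrderedOffsets { unsigned Offset0, Offset1; };
static DSOrderedOffsets encodeDSOrdered(unsigned OrderedCountIndex,
                                        bool WaveRelease, bool WaveDone,
                                        unsigned ShaderType, bool IsSwap) {
  DSOrderedOffsets R;
  R.Offset0 = OrderedCountIndex << 2;           // dword index -> byte offset
  R.Offset1 = (WaveRelease ? 1u : 0u)           // bit 0: wave_release
              | ((WaveDone ? 1u : 0u) << 1)     // bit 1: wave_done
              | (ShaderType << 2)               // bits 2..3: shader type value
              | ((IsSwap ? 1u : 0u) << 4);      // bit 4: add (0) vs. swap (1)
  return R;
}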
+ if (isTypeLegal(Src0.getValueType())) + return SDValue(); + const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6)); SDValue Undef = DAG.getUNDEF(MVT::f32); const SDValue Ops[] = { - Chain, - DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt - DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en - DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), - DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), + Op.getOperand(2), // tgt + DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0 + DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1 Undef, // src2 Undef, // src3 + Op.getOperand(7), // vm DAG.getTargetConstant(1, DL, MVT::i1), // compr - DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1) + Op.getOperand(3), // en + Op.getOperand(0) // Chain }; - unsigned Opc = Done->isNullValue() ? - AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; - return DAG.getNode(Opc, DL, Op->getVTList(), Ops); + unsigned Opc = Done->isNullValue() ? AMDGPU::EXP : AMDGPU::EXP_DONE; + return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0); } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { @@ -7183,13 +7636,14 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( // three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets. unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset, - SelectionDAG &DAG, SDValue *Offsets, - unsigned Align) const { + SelectionDAG &DAG, SDValue *Offsets, + Align Alignment) const { SDLoc DL(CombinedOffset); if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) { uint32_t Imm = C->getZExtValue(); uint32_t SOffset, ImmOffset; - if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) { + if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, + Alignment)) { Offsets[0] = DAG.getConstant(0, DL, MVT::i32); Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); @@ -7202,7 +7656,7 @@ unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset, uint32_t SOffset, ImmOffset; int Offset = cast<ConstantSDNode>(N1)->getSExtValue(); if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, - Subtarget, Align)) { + Subtarget, Alignment)) { Offsets[0] = N0; Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); @@ -7413,7 +7867,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // If there is a possibilty that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. - if (AS == AMDGPUAS::FLAT_ADDRESS) + if (AS == AMDGPUAS::FLAT_ADDRESS && + !Subtarget->hasMultiDwordFlatScratchAddressing()) AS = MFI->hasFlatScratchInit() ? 
AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; @@ -7438,7 +7893,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || AS == AMDGPUAS::GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && - !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) && + Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) && Alignment >= 4 && NumElements < 32) { if (MemVT.isPow2VectorType()) return SDValue(); @@ -7547,55 +8002,54 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, SDValue RHS = Op.getOperand(1); EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal(); - if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) + bool AllowInaccurateRcp = DAG.getTarget().Options.UnsafeFPMath || + Flags.hasApproximateFuncs(); + + // Without !fpmath accuracy information, we can't do more because we don't + // know exactly whether rcp is accurate enough to meet !fpmath requirement. + if (!AllowInaccurateRcp) return SDValue(); if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { - if (Unsafe || VT == MVT::f32 || VT == MVT::f16) { - if (CLHS->isExactlyValue(1.0)) { - // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to - // the CI documentation has a worst case error of 1 ulp. - // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to - // use it as long as we aren't trying to use denormals. - // - // v_rcp_f16 and v_rsq_f16 DO support denormals. - - // 1.0 / sqrt(x) -> rsq(x) - - // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP - // error seems really high at 2^29 ULP. - if (RHS.getOpcode() == ISD::FSQRT) - return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); - - // 1.0 / x -> rcp(x) - return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - } + if (CLHS->isExactlyValue(1.0)) { + // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to + // the CI documentation has a worst case error of 1 ulp. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to + // use it as long as we aren't trying to use denormals. + // + // v_rcp_f16 and v_rsq_f16 DO support denormals. - // Same as for 1.0, but expand the sign out of the constant. - if (CLHS->isExactlyValue(-1.0)) { - // -1.0 / x -> rcp (fneg x) - SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); - return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS); - } + // 1.0 / sqrt(x) -> rsq(x) + + // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP + // error seems really high at 2^29 ULP. + if (RHS.getOpcode() == ISD::FSQRT) + return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); + + // 1.0 / x -> rcp(x) + return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); } - } - if (Unsafe) { - // Turn into multiply by the reciprocal. - // x / y -> x * (1.0 / y) - SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags); + // Same as for 1.0, but expand the sign out of the constant. + if (CLHS->isExactlyValue(-1.0)) { + // -1.0 / x -> rcp (fneg x) + SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS); + } } - return SDValue(); + // Turn into multiply by the reciprocal. 
+ // x / y -> x * (1.0 / y) + SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags); } static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, - EVT VT, SDValue A, SDValue B, SDValue GlueChain) { + EVT VT, SDValue A, SDValue B, SDValue GlueChain, + SDNodeFlags Flags) { if (GlueChain->getNumValues() <= 1) { - return DAG.getNode(Opcode, SL, VT, A, B); + return DAG.getNode(Opcode, SL, VT, A, B, Flags); } assert(GlueChain->getNumValues() == 3); @@ -7608,15 +8062,16 @@ static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, break; } - return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, - GlueChain.getValue(2)); + return DAG.getNode(Opcode, SL, VTList, + {GlueChain.getValue(1), A, B, GlueChain.getValue(2)}, + Flags); } static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, - SDValue GlueChain) { + SDValue GlueChain, SDNodeFlags Flags) { if (GlueChain->getNumValues() <= 1) { - return DAG.getNode(Opcode, SL, VT, A, B, C); + return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags); } assert(GlueChain->getNumValues() == 3); @@ -7629,8 +8084,9 @@ static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, break; } - return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C, - GlueChain.getValue(2)); + return DAG.getNode(Opcode, SL, VTList, + {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)}, + Flags); } SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { @@ -7704,6 +8160,13 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) return FastLowered; + // The selection matcher assumes anything with a chain selecting to a + // mayRaiseFPException machine instruction. Since we're introducing a chain + // here, we need to explicitly report nofpexcept for the regular fdiv + // lowering. + SDNodeFlags Flags = Op->getFlags(); + Flags.setNoFPExcept(true); + SDLoc SL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -7713,95 +8176,100 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, - RHS, RHS, LHS); + {RHS, RHS, LHS}, Flags); SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, - LHS, RHS, LHS); + {LHS, RHS, LHS}, Flags); // Denominator is scaled to not be denormal, so using rcp is ok. SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, - DenominatorScaled); + DenominatorScaled, Flags); SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, - DenominatorScaled); + DenominatorScaled, Flags); const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); - const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16); + const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32); const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction()); if (!HasFP32Denormals) { + // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV + // lowering. The chain dependence is insufficient, and we need glue. We do + // not need the glue variants in a strictfp function. 
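// --- Editorial illustration, not part of this patch: the f32 FDIV lowering in
// this hunk chains DIV_SCALE, RCP, several FMAs, DIV_FMAS and DIV_FIXUP. The
// numerical core is a Newton-Raphson style refinement of an approximate
// reciprocal; ignoring the operand scaling, denormal-mode toggling and
// exception bookkeeping, and with LHS/RHS standing for the already-scaled
// numerator and denominator, the FMA sequence computes:
#include <cmath>
static float refineScaledDivide(float LHS, float RHS, float ApproxRcp) {
  float NegRHS = -RHS;                                // NegDivScale0
  float Fma0 = std::fma(NegRHS, ApproxRcp, 1.0f);     // 1 - RHS * rcp
  float Fma1 = std::fma(Fma0, ApproxRcp, ApproxRcp);  // refined reciprocal
  float Mul  = LHS * Fma1;                            // quotient estimate
  float Fma2 = std::fma(NegRHS, Mul, LHS);            // remainder
  float Fma3 = std::fma(Fma2, Fma1, Mul);             // refined quotient
  float Fma4 = std::fma(NegRHS, Fma3, LHS);           // final remainder
  return std::fma(Fma4, Fma1, Fma3);                  // DIV_FMAS-style step
}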
+ SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue EnableDenorm; + SDNode *EnableDenorm; if (Subtarget->hasDenormModeInst()) { const SDValue EnableDenormValue = getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget); EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, - DAG.getEntryNode(), EnableDenormValue); + DAG.getEntryNode(), EnableDenormValue).getNode(); } else { const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32); - EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, - DAG.getEntryNode(), EnableDenormValue, - BitField); + EnableDenorm = + DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs, + {EnableDenormValue, BitField, DAG.getEntryNode()}); } SDValue Ops[3] = { NegDivScale0, - EnableDenorm.getValue(0), - EnableDenorm.getValue(1) + SDValue(EnableDenorm, 0), + SDValue(EnableDenorm, 1) }; NegDivScale0 = DAG.getMergeValues(Ops, SL); } SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, - ApproxRcp, One, NegDivScale0); + ApproxRcp, One, NegDivScale0, Flags); SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, - ApproxRcp, Fma0); + ApproxRcp, Fma0, Flags); SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, - Fma1, Fma1); + Fma1, Fma1, Flags); SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, - NumeratorScaled, Mul); + NumeratorScaled, Mul, Flags); - SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2); + SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, + Fma2, Fma1, Mul, Fma2, Flags); SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, - NumeratorScaled, Fma3); + NumeratorScaled, Fma3, Flags); if (!HasFP32Denormals) { - SDValue DisableDenorm; + SDNode *DisableDenorm; if (Subtarget->hasDenormModeInst()) { const SDValue DisableDenormValue = getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget); DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1), DisableDenormValue, - Fma4.getValue(2)); + Fma4.getValue(2)).getNode(); } else { const SDValue DisableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); - DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, - Fma4.getValue(1), DisableDenormValue, - BitField, Fma4.getValue(2)); + DisableDenorm = DAG.getMachineNode( + AMDGPU::S_SETREG_B32, SL, MVT::Other, + {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)}); } SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, - DisableDenorm, DAG.getRoot()); + SDValue(DisableDenorm, 0), DAG.getRoot()); DAG.setRoot(OutputChain); } SDValue Scale = NumeratorScaled.getValue(1); SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, - Fma4, Fma1, Fma3, Scale); + {Fma4, Fma1, Fma3, Scale}, Flags); - return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags); } SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { @@ -7916,7 +8384,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // If there is a possibilty that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. 
- if (AS == AMDGPUAS::FLAT_ADDRESS) + if (AS == AMDGPUAS::FLAT_ADDRESS && + !Subtarget->hasMultiDwordFlatScratchAddressing()) AS = MFI->hasFlatScratchInit() ? AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; @@ -7976,22 +8445,24 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { SDValue Arg = Op.getOperand(0); SDValue TrigVal; - // TODO: Should this propagate fast-math-flags? + // Propagate fast-math flags so that the multiply we introduce can be folded + // if Arg is already the result of a multiply by constant. + auto Flags = Op->getFlags(); - SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT); + SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT); if (Subtarget->hasTrigReducedRange()) { - SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi); - TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal); + SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); + TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags); } else { - TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi); + TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); } switch (Op.getOpcode()) { case ISD::FCOS: - return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal); + return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags); case ISD::FSIN: - return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal); + return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags); default: llvm_unreachable("Wrong trig opcode"); } @@ -8032,7 +8503,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); EVT ScalarVT = VT.getScalarType(); - if (ScalarVT != MVT::f32) + if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -8047,8 +8518,14 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, // about in practice. if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) { if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { - SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); + SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src); DCI.AddToWorklist(Cvt.getNode()); + + // For the f16 case, fold to a cast to f32 and then cast back to f16. 
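// --- Editorial illustration, not part of this patch: LowerTrig above scales
// the argument by 0.5/pi because the hardware SIN/COS units take the angle as
// a fraction of a full turn; on subtargets with a reduced trig range the
// fractional part is taken first (FRACT), which does not change the result
// since sine and cosine have period 2*pi. A host-side model for f32, with
// hypothetical names:
#include <cmath>
static float modelSinHwLowering(float Arg, bool HasTrigReducedRange) {
  float Turns = Arg * 0.15915494309189535f;      // Arg * (0.5 / pi)
  if (HasTrigReducedRange)
    Turns -= std::floor(Turns);                  // AMDGPUISD::FRACT
  return std::sin(Turns * 6.283185307179586f);   // SIN_HW: sin(2*pi * Turns)
}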
+ if (ScalarVT != MVT::f32) { + Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt, + DAG.getTargetConstant(0, DL, MVT::i32)); + } return Cvt; } } @@ -8525,7 +9002,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, } } - if (VT != MVT::i64) + if (VT != MVT::i64 || DCI.isBeforeLegalizeOps()) return SDValue(); // TODO: This could be a generic combine with a predicate for extracting the @@ -8735,6 +9212,11 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N, N->getFlags()); } + if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) { + return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, + N0.getOperand(0), N->getFlags()); + } + return AMDGPUTargetLowering::performRcpCombine(N, DCI); } @@ -8776,9 +9258,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, case AMDGPUISD::RSQ: case AMDGPUISD::RSQ_CLAMP: case AMDGPUISD::RCP_LEGACY: - case AMDGPUISD::RSQ_LEGACY: case AMDGPUISD::RCP_IFLAG: - case AMDGPUISD::TRIG_PREOP: case AMDGPUISD::DIV_SCALE: case AMDGPUISD::DIV_FMAS: case AMDGPUISD::DIV_FIXUP: @@ -8881,6 +9361,12 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, case Intrinsic::amdgcn_cubeid: case Intrinsic::amdgcn_frexp_mant: case Intrinsic::amdgcn_fdot2: + case Intrinsic::amdgcn_rcp: + case Intrinsic::amdgcn_rsq: + case Intrinsic::amdgcn_rsq_clamp: + case Intrinsic::amdgcn_rcp_legacy: + case Intrinsic::amdgcn_rsq_legacy: + case Intrinsic::amdgcn_trig_preop: return true; default: break; @@ -9099,8 +9585,7 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, return SDValue(); // Ordered >= (although NaN inputs should have folded away by now). - APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF()); - if (Cmp == APFloat::cmpGreaterThan) + if (K0->getValueAPF() > K1->getValueAPF()) return SDValue(); const MachineFunction &MF = DAG.getMachineFunction(); @@ -9275,6 +9760,50 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, return SDValue(); } +// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be +// expanded into a set of cmp/select instructions. +bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize, + unsigned NumElem, + bool IsDivergentIdx) { + if (UseDivergentRegisterIndexing) + return false; + + unsigned VecSize = EltSize * NumElem; + + // Sub-dword vectors of size 2 dword or less have better implementation. + if (VecSize <= 64 && EltSize < 32) + return false; + + // Always expand the rest of sub-dword instructions, otherwise it will be + // lowered via memory. + if (EltSize < 32) + return true; + + // Always do this if var-idx is divergent, otherwise it will become a loop. + if (IsDivergentIdx) + return true; + + // Large vectors would yield too many compares and v_cndmask_b32 instructions. 
+ unsigned NumInsts = NumElem /* Number of compares */ + + ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */; + return NumInsts <= 16; +} + +static bool shouldExpandVectorDynExt(SDNode *N) { + SDValue Idx = N->getOperand(N->getNumOperands() - 1); + if (isa<ConstantSDNode>(Idx)) + return false; + + SDValue Vec = N->getOperand(0); + EVT VecVT = Vec.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + unsigned EltSize = EltVT.getSizeInBits(); + unsigned NumElem = VecVT.getVectorNumElements(); + + return SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, + Idx->isDivergent()); +} + SDValue SITargetLowering::performExtractVectorEltCombine( SDNode *N, DAGCombinerInfo &DCI) const { SDValue Vec = N->getOperand(0); @@ -9336,18 +9865,12 @@ SDValue SITargetLowering::performExtractVectorEltCombine( unsigned EltSize = EltVT.getSizeInBits(); // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx) - // This elminates non-constant index and subsequent movrel or scratch access. - // Sub-dword vectors of size 2 dword or less have better implementation. - // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32 - // instructions. - if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) && - !isa<ConstantSDNode>(N->getOperand(1))) { + if (::shouldExpandVectorDynExt(N)) { SDLoc SL(N); SDValue Idx = N->getOperand(1); - EVT IdxVT = Idx.getValueType(); SDValue V; for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { - SDValue IC = DAG.getConstant(I, SL, IdxVT); + SDValue IC = DAG.getVectorIdxConstant(I, SL); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC); if (I == 0) V = Elt; @@ -9402,17 +9925,10 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N, SDValue Idx = N->getOperand(2); EVT VecVT = Vec.getValueType(); EVT EltVT = VecVT.getVectorElementType(); - unsigned VecSize = VecVT.getSizeInBits(); - unsigned EltSize = EltVT.getSizeInBits(); // INSERT_VECTOR_ELT (<n x e>, var-idx) // => BUILD_VECTOR n x select (e, const-idx) - // This elminates non-constant index and subsequent movrel or scratch access. - // Sub-dword vectors of size 2 dword or less have better implementation. - // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32 - // instructions. - if (isa<ConstantSDNode>(Idx) || - VecSize > 256 || (VecSize <= 64 && EltSize < 32)) + if (!::shouldExpandVectorDynExt(N)) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -9919,39 +10435,50 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; SDValue Src = N->getOperand(0); - SDValue Srl = N->getOperand(0); - if (Srl.getOpcode() == ISD::ZERO_EXTEND) - Srl = Srl.getOperand(0); + SDValue Shift = N->getOperand(0); - // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero. - if (Srl.getOpcode() == ISD::SRL) { + // TODO: Extend type shouldn't matter (assuming legal types). 
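// --- Editorial illustration, not part of this patch: when
// shouldExpandVectorDynExt above accepts a variable-index access, the
// extract-vector-elt combine rebuilds it as one compare-and-select per element
// instead of a movrel or scratch access. The scalar shape being costed,
// sketched for a four-element vector in plain C++:
static float extractViaSelects(const float Vec[4], unsigned Idx) {
  float V = Vec[0];
  for (unsigned I = 1; I < 4; ++I)
    V = (Idx == I) ? Vec[I] : V;   // one v_cmp + v_cndmask_b32 per element
  return V;
}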
+ if (Shift.getOpcode() == ISD::ZERO_EXTEND) + Shift = Shift.getOperand(0); + + if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) { + // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x + // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x - // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x - - if (const ConstantSDNode *C = - dyn_cast<ConstantSDNode>(Srl.getOperand(1))) { - Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)), - EVT(MVT::i32)); + // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x + if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) { + Shift = DAG.getZExtOrTrunc(Shift.getOperand(0), + SDLoc(Shift.getOperand(0)), MVT::i32); + + unsigned ShiftOffset = 8 * Offset; + if (Shift.getOpcode() == ISD::SHL) + ShiftOffset -= C->getZExtValue(); + else + ShiftOffset += C->getZExtValue(); - unsigned SrcOffset = C->getZExtValue() + 8 * Offset; - if (SrcOffset < 32 && SrcOffset % 8 == 0) { - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL, - MVT::f32, Srl); + if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) { + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL, + MVT::f32, Shift); } } } - APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); - - KnownBits Known; - TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), - !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) { - DCI.CommitTargetLoweringOpt(TLO); + APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); + if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) { + // We simplified Src. If this node is not dead, visit it again so it is + // folded properly. + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); } + // Handle (or x, (srl y, 8)) pattern when known bits are zero. + if (SDValue DemandedSrc = + TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG)) + return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc); + return SDValue(); } @@ -9964,16 +10491,13 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, const MachineFunction &MF = DCI.DAG.getMachineFunction(); const APFloat &F = CSrc->getValueAPF(); APFloat Zero = APFloat::getZero(F.getSemantics()); - APFloat::cmpResult Cmp0 = F.compare(Zero); - if (Cmp0 == APFloat::cmpLessThan || - (Cmp0 == APFloat::cmpUnordered && - MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) { + if (F < Zero || + (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) { return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); } APFloat One(F.getSemantics(), "1.0"); - APFloat::cmpResult Cmp1 = F.compare(One); - if (Cmp1 == APFloat::cmpGreaterThan) + if (F > One) return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); return SDValue(CSrc, 0); @@ -10061,10 +10585,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::FRACT: case AMDGPUISD::RSQ: case AMDGPUISD::RCP_LEGACY: - case AMDGPUISD::RSQ_LEGACY: case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::RSQ_CLAMP: case AMDGPUISD::LDEXP: { + // FIXME: This is probably wrong. 
If src is an sNaN, it won't be quieted SDValue Src = N->getOperand(0); if (Src.isUndef()) return Src; @@ -10406,24 +10930,6 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, Ops.push_back(ImpDef.getValue(1)); return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } - case AMDGPU::V_PERMLANE16_B32: - case AMDGPU::V_PERMLANEX16_B32: { - ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0)); - ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2)); - if (!FI->getZExtValue() && !BC->getZExtValue()) - break; - SDValue VDstIn = Node->getOperand(6); - if (VDstIn.isMachineOpcode() - && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) - break; - MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, - SDLoc(Node), MVT::i32); - SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1), - SDValue(BC, 0), Node->getOperand(3), - Node->getOperand(4), Node->getOperand(5), - SDValue(ImpDef, 0), Node->getOperand(7) }; - return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); - } default: break; } @@ -10592,89 +11098,50 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, MVT VT) const { const TargetRegisterClass *RC = nullptr; if (Constraint.size() == 1) { + const unsigned BitWidth = VT.getSizeInBits(); switch (Constraint[0]) { default: return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); case 's': case 'r': - switch (VT.getSizeInBits()) { - default: - return std::make_pair(0U, nullptr); - case 32: + switch (BitWidth) { case 16: RC = &AMDGPU::SReg_32RegClass; break; case 64: RC = &AMDGPU::SGPR_64RegClass; break; - case 96: - RC = &AMDGPU::SReg_96RegClass; - break; - case 128: - RC = &AMDGPU::SGPR_128RegClass; - break; - case 160: - RC = &AMDGPU::SReg_160RegClass; - break; - case 256: - RC = &AMDGPU::SReg_256RegClass; - break; - case 512: - RC = &AMDGPU::SReg_512RegClass; + default: + RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth); + if (!RC) + return std::make_pair(0U, nullptr); break; } break; case 'v': - switch (VT.getSizeInBits()) { - default: - return std::make_pair(0U, nullptr); - case 32: + switch (BitWidth) { case 16: RC = &AMDGPU::VGPR_32RegClass; break; - case 64: - RC = &AMDGPU::VReg_64RegClass; - break; - case 96: - RC = &AMDGPU::VReg_96RegClass; - break; - case 128: - RC = &AMDGPU::VReg_128RegClass; - break; - case 160: - RC = &AMDGPU::VReg_160RegClass; - break; - case 256: - RC = &AMDGPU::VReg_256RegClass; - break; - case 512: - RC = &AMDGPU::VReg_512RegClass; + default: + RC = SIRegisterInfo::getVGPRClassForBitWidth(BitWidth); + if (!RC) + return std::make_pair(0U, nullptr); break; } break; case 'a': if (!Subtarget->hasMAIInsts()) break; - switch (VT.getSizeInBits()) { - default: - return std::make_pair(0U, nullptr); - case 32: + switch (BitWidth) { case 16: RC = &AMDGPU::AGPR_32RegClass; break; - case 64: - RC = &AMDGPU::AReg_64RegClass; - break; - case 128: - RC = &AMDGPU::AReg_128RegClass; - break; - case 512: - RC = &AMDGPU::AReg_512RegClass; + default: + RC = SIRegisterInfo::getAGPRClassForBitWidth(BitWidth); + if (!RC) + return std::make_pair(0U, nullptr); break; - case 1024: - RC = &AMDGPU::AReg_1024RegClass; - // v32 types are not legal but we support them here. 
- return std::make_pair(0U, RC); } break; } @@ -10701,9 +11168,29 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(RC->getRegister(Idx), RC); } } + + // FIXME: Returns VS_32 for physical SGPR constraints return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } +static bool isImmConstraint(StringRef Constraint) { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 'I': + case 'J': + case 'A': + case 'B': + case 'C': + return true; + } + } else if (Constraint == "DA" || + Constraint == "DB") { + return true; + } + return false; +} + SITargetLowering::ConstraintType SITargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { @@ -10715,9 +11202,115 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { return C_RegisterClass; } } + if (isImmConstraint(Constraint)) { + return C_Other; + } return TargetLowering::getConstraintType(Constraint); } +static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) { + if (!AMDGPU::isInlinableIntLiteral(Val)) { + Val = Val & maskTrailingOnes<uint64_t>(Size); + } + return Val; +} + +void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const { + if (isImmConstraint(Constraint)) { + uint64_t Val; + if (getAsmOperandConstVal(Op, Val) && + checkAsmConstraintVal(Op, Constraint, Val)) { + Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits()); + Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64)); + } + } else { + TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); + } +} + +bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const { + unsigned Size = Op.getScalarValueSizeInBits(); + if (Size > 64) + return false; + + if (Size == 16 && !Subtarget->has16BitInsts()) + return false; + + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + Val = C->getSExtValue(); + return true; + } + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) { + Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); + return true; + } + if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) { + if (Size != 16 || Op.getNumOperands() != 2) + return false; + if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef()) + return false; + if (ConstantSDNode *C = V->getConstantSplatNode()) { + Val = C->getSExtValue(); + return true; + } + if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) { + Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); + return true; + } + } + + return false; +} + +bool SITargetLowering::checkAsmConstraintVal(SDValue Op, + const std::string &Constraint, + uint64_t Val) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'I': + return AMDGPU::isInlinableIntLiteral(Val); + case 'J': + return isInt<16>(Val); + case 'A': + return checkAsmConstraintValA(Op, Val); + case 'B': + return isInt<32>(Val); + case 'C': + return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) || + AMDGPU::isInlinableIntLiteral(Val); + default: + break; + } + } else if (Constraint.size() == 2) { + if (Constraint == "DA") { + int64_t HiBits = static_cast<int32_t>(Val >> 32); + int64_t LoBits = static_cast<int32_t>(Val); + return checkAsmConstraintValA(Op, HiBits, 32) && + checkAsmConstraintValA(Op, LoBits, 32); + } + if (Constraint == "DB") { + return true; + } + } + llvm_unreachable("Invalid asm constraint"); +} + +bool 
SITargetLowering::checkAsmConstraintValA(SDValue Op, + uint64_t Val, + unsigned MaxSize) const { + unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize); + bool HasInv2Pi = Subtarget->hasInv2PiInlineImm(); + if ((Size == 16 && AMDGPU::isInlinableLiteral16(Val, HasInv2Pi)) || + (Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) || + (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) { + return true; + } + return false; +} + // Figure out which registers should be reserved for stack access. Only after // the function is legalized do we know all of the non-spill stack objects or if // calls are present. @@ -10745,11 +11338,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { if (Info->getFrameOffsetReg() != AMDGPU::FP_REG) MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); - if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) { - MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, - Info->getScratchWaveOffsetReg()); - } - Info->limitOccupancy(MF); if (ST.isWave32() && !MF.empty()) { @@ -10772,15 +11360,18 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { } TargetLoweringBase::finalizeLowering(MF); + + // Allocate a VGPR for future SGPR Spill if + // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used + // FIXME: We won't need this hack if we split SGPR allocation from VGPR + if (VGPRReserveforSGPRSpill && !Info->VGPRReservedForSGPRSpill && + !Info->isEntryFunction() && MF.getFrameInfo().hasStackObjects()) + Info->reserveVGPRforSGPRSpills(MF); } -void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, - KnownBits &Known, - const APInt &DemandedElts, - const SelectionDAG &DAG, - unsigned Depth) const { - TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts, - DAG, Depth); +void SITargetLowering::computeKnownBitsForFrameIndex( + const int FI, KnownBits &Known, const MachineFunction &MF) const { + TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF); // Set the high bits to zero based on the maximum allowed scratch size per // wave. We can't use vaddr in MUBUF instructions if we don't know the address @@ -10788,6 +11379,27 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex()); } +Align SITargetLowering::computeKnownAlignForTargetInstr( + GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI, + unsigned Depth) const { + const MachineInstr *MI = MRI.getVRegDef(R); + switch (MI->getOpcode()) { + case AMDGPU::G_INTRINSIC: + case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { + // FIXME: Can this move to generic code? What about the case where the call + // site specifies a lower alignment? 
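[Editor's note: the integer immediate constraints handled a little earlier in this hunk ('I', 'J', 'B', 'C' and the two-letter 'DA'/'DB' forms) can be mirrored with plain integer arithmetic. The standalone sketch below assumes the AMDGPU inlinable integer literal range of [-16, 64] and ignores the floating-point inline constants that checkAsmConstraintValA additionally accepts; fitsConstraint and isInlinableInt are illustrative names, not code from this patch.]

// Standalone sketch of the integer-immediate constraint checks.
// Assumption: inlinable integer literals are the range [-16, 64]; FP inline
// constants (handled by checkAsmConstraintValA in the real code) are omitted.
#include <cstdint>
#include <iostream>
#include <string>

static bool isInlinableInt(int64_t V) { return V >= -16 && V <= 64; }

static bool fitsConstraint(const std::string &C, int64_t V) {
  if (C == "I") return isInlinableInt(V);                  // inline constant
  if (C == "J") return V >= INT16_MIN && V <= INT16_MAX;   // 16-bit signed
  if (C == "B") return V >= INT32_MIN && V <= INT32_MAX;   // 32-bit signed
  if (C == "C") return (uint64_t)V <= UINT32_MAX || isInlinableInt(V);
  if (C == "DA")          // approximation: both 32-bit halves inlinable
    return isInlinableInt((int32_t)(V >> 32)) && isInlinableInt((int32_t)V);
  if (C == "DB") return true;  // any 64-bit value, split into two halves
  return false;
}

int main() {
  std::cout << fitsConstraint("I", 64) << ' '      // 1
            << fitsConstraint("I", 65) << ' '      // 0
            << fitsConstraint("J", -32768) << '\n'; // 1
}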
+ Intrinsic::ID IID = MI->getIntrinsicID(); + LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext(); + AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID); + if (MaybeAlign RetAlign = Attrs.getRetAlignment()) + return *RetAlign; + return Align(1); + } + default: + return Align(1); + } +} + Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML); const Align CacheLineAlign = Align(64); @@ -10879,30 +11491,19 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, case ISD::CopyFromReg: { const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1)); - const MachineFunction * MF = FLI->MF; - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); - const MachineRegisterInfo &MRI = MF->getRegInfo(); - const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); - unsigned Reg = R->getReg(); - if (Register::isPhysicalRegister(Reg)) - return !TRI.isSGPRReg(MRI, Reg); - - if (MRI.isLiveIn(Reg)) { - // workitem.id.x workitem.id.y workitem.id.z - // Any VGPR formal argument is also considered divergent - if (!TRI.isSGPRReg(MRI, Reg)) - return true; - // Formal arguments of non-entry functions - // are conservatively considered divergent - else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) - return true; - return false; - } - const Value *V = FLI->getValueFromVirtualReg(Reg); - if (V) + const MachineRegisterInfo &MRI = FLI->MF->getRegInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + Register Reg = R->getReg(); + + // FIXME: Why does this need to consider isLiveIn? + if (Reg.isPhysical() || MRI.isLiveIn(Reg)) + return !TRI->isSGPRReg(MRI, Reg); + + if (const Value *V = FLI->getValueFromVirtualReg(R->getReg())) return KDA->isDivergent(V); + assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N)); - return !TRI.isSGPRReg(MRI, Reg); + return !TRI->isSGPRReg(MRI, Reg); } break; case ISD::LOAD: { @@ -11004,7 +11605,19 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { return RC; } -static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) { +// FIXME: This is a workaround for DivergenceAnalysis not understanding always +// uniform values (as produced by the mask results of control flow intrinsics) +// used outside of divergent blocks. The phi users need to also be treated as +// always uniform. +static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited, + unsigned WaveSize) { + // FIXME: We asssume we never cast the mask results of a control flow + // intrinsic. + // Early exit if the type won't be consistent as a compile time hack. 
+ IntegerType *IT = dyn_cast<IntegerType>(V->getType()); + if (!IT || IT->getBitWidth() != WaveSize) + return false; + if (!isa<Instruction>(V)) return false; if (!Visited.insert(V).second) @@ -11036,7 +11649,7 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) { } } } else { - Result = hasCFUser(U, Visited); + Result = hasCFUser(U, Visited, WaveSize); } if (Result) break; @@ -11046,36 +11659,16 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) { bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, const Value *V) const { - if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { - switch (Intrinsic->getIntrinsicID()) { - default: - return false; - case Intrinsic::amdgcn_if_break: - return true; - } - } - if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) { - if (const IntrinsicInst *Intrinsic = - dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) { - switch (Intrinsic->getIntrinsicID()) { - default: - return false; - case Intrinsic::amdgcn_if: - case Intrinsic::amdgcn_else: { - ArrayRef<unsigned> Indices = ExtValue->getIndices(); - if (Indices.size() == 1 && Indices[0] == 1) { - return true; - } - } - } - } - } if (const CallInst *CI = dyn_cast<CallInst>(V)) { - if (isa<InlineAsm>(CI->getCalledValue())) { + if (CI->isInlineAsm()) { + // FIXME: This cannot give a correct answer. This should only trigger in + // the case where inline asm returns mixed SGPR and VGPR results, used + // outside the defining block. We don't have a specific result to + // consider, so this assumes if any value is SGPR, the overall register + // also needs to be SGPR. const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo(); - ImmutableCallSite CS(CI); TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints( - MF.getDataLayout(), Subtarget->getRegisterInfo(), CS); + MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI); for (auto &TC : TargetConstraints) { if (TC.Type == InlineAsm::isOutput) { ComputeConstraintToUse(TC, SDValue()); @@ -11095,5 +11688,20 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, } } SmallPtrSet<const Value *, 16> Visited; - return hasCFUser(V, Visited); + return hasCFUser(V, Visited, Subtarget->getWavefrontSize()); +} + +std::pair<int, MVT> +SITargetLowering::getTypeLegalizationCost(const DataLayout &DL, + Type *Ty) const { + auto Cost = TargetLoweringBase::getTypeLegalizationCost(DL, Ty); + auto Size = DL.getTypeSizeInBits(Ty); + // Maximum load or store can handle 8 dwords for scalar and 4 for + // vector ALU. Let's assume anything above 8 dwords is expensive + // even if legal. 
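[Editor's note: to make the "8 dwords" heuristic in the comment above concrete, here is a minimal standalone sketch of the cost scaling applied by the new getTypeLegalizationCost override: types up to 256 bits keep the base cost factor, and wider types get one cost unit per 256-bit chunk. scaledCostFactor and its default base factor are placeholder names for illustration, not this patch's code.]

// Illustrative sketch: scale a legalization cost factor for wide types
// (8 dwords = 256 bits is treated as the break-even point).
#include <iostream>

static int scaledCostFactor(unsigned SizeInBits, int BaseFactor = 1) {
  if (SizeInBits <= 256)
    return BaseFactor;               // up to 8 dwords: unchanged
  return (SizeInBits + 255) / 256;   // otherwise: one unit per 256-bit chunk
}

int main() {
  std::cout << scaledCostFactor(128) << ' '    // 1
            << scaledCostFactor(256) << ' '    // 1
            << scaledCostFactor(512) << ' '    // 2
            << scaledCostFactor(1024) << '\n'; // 4
}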
+ if (Size <= 256) + return Cost; + + Cost.first = (Size + 255) / 256; + return Cost; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index d59495b052a4f..f4c0764640575 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -42,7 +42,8 @@ private: SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const; SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, - uint64_t Offset, unsigned Align, bool Signed, + uint64_t Offset, Align Alignment, + bool Signed, const ISD::InputArg *Arg = nullptr) const; SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, @@ -60,7 +61,7 @@ private: SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, SelectionDAG &DAG) const; SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset, - SDValue GLC, SDValue DLC, SelectionDAG &DAG) const; + SDValue CachePolicy, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; @@ -107,7 +108,7 @@ private: /// Converts \p Op, which must be of floating point type, to the /// floating point type \p VT, by either extending or truncating it. - SDValue getFPExtOrFPTrunc(SelectionDAG &DAG, + SDValue getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, EVT VT) const; @@ -119,6 +120,7 @@ private: /// Custom lowering for ISD::FP_ROUND for MVT::f16. SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const; SDValue getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const; @@ -199,6 +201,15 @@ public: /// global value \p GV, false otherwise. bool shouldEmitPCReloc(const GlobalValue *GV) const; + /// \returns true if this should use a literal constant for an LDS address, + /// and not emit a relocation for an LDS global. + bool shouldUseLDSConstAddress(const GlobalValue *GV) const; + + /// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be + /// expanded into a set of cmp/select instructions. + static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, + bool IsDivergentIdx); + private: // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the // three offsets (voffset, soffset and instoffset) into the SDValue[3] array @@ -206,7 +217,7 @@ private: /// \returns 0 If there is a non-constant offset or if the offset is 0. /// Otherwise returns the constant offset. 
unsigned setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, - SDValue *Offsets, unsigned Align = 4) const; + SDValue *Offsets, Align Alignment = Align(4)) const; // Handle 8 bit and 16 bit buffer loads SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL, @@ -253,15 +264,18 @@ public: MachineMemOperand::Flags Flags = MachineMemOperand::MONone, bool *IsFast = nullptr) const override; - EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, - bool MemcpyStrSrc, + EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override; bool isMemOpUniform(const SDNode *N) const; bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const; + static bool isNonGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS || + AS == AMDGPUAS::PRIVATE_ADDRESS; + } + + // FIXME: Missing constant_32bit static bool isFlatGlobalAddrSpace(unsigned AS) { return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS || @@ -330,6 +344,9 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; + SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; @@ -351,8 +368,7 @@ public: MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override; bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; - bool isFMADLegalForFAddFSub(const SelectionDAG &DAG, - const SDNode *N) const override; + bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override; SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const; @@ -377,17 +393,29 @@ public: getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; ConstraintType getConstraintType(StringRef Constraint) const override; + void LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const override; + bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const; + bool checkAsmConstraintVal(SDValue Op, + const std::string &Constraint, + uint64_t Val) const; + bool checkAsmConstraintValA(SDValue Op, + uint64_t Val, + unsigned MaxSize = 64) const; SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const; void finalizeLowering(MachineFunction &MF) const override; - void computeKnownBitsForFrameIndex(const SDValue Op, + void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, - const APInt &DemandedElts, - const SelectionDAG &DAG, - unsigned Depth = 0) const override; + const MachineFunction &MF) const override; + Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, + const MachineRegisterInfo &MRI, + unsigned Depth = 0) const override; bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override; @@ -432,6 +460,13 @@ public: MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const; + void allocateSpecialInputVGPRsFixed(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; + + std::pair<int, MVT> getTypeLegalizationCost(const DataLayout &DL, + Type *Ty) const; }; } 
// End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp new file mode 100644 index 0000000000000..35c49ae8c0dd1 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp @@ -0,0 +1,203 @@ +//===- SIInsertHardClauses.cpp - Insert Hard Clauses ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Insert s_clause instructions to form hard clauses. +/// +/// Clausing load instructions can give cache coherency benefits. Before gfx10, +/// the hardware automatically detected "soft clauses", which were sequences of +/// memory instructions of the same type. In gfx10 this detection was removed, +/// and the s_clause instruction was introduced to explicitly mark "hard +/// clauses". +/// +/// It's the scheduler's job to form the clauses by putting similar memory +/// instructions next to each other. Our job is just to insert an s_clause +/// instruction to mark the start of each clause. +/// +/// Note that hard clauses are very similar to, but logically distinct from, the +/// groups of instructions that have to be restartable when XNACK is enabled. +/// The rules are slightly different in each case. For example an s_nop +/// instruction breaks a restartable group, but can appear in the middle of a +/// hard clause. (Before gfx10 there wasn't a distinction, and both were called +/// "soft clauses" or just "clauses".) +/// +/// The SIFormMemoryClauses pass and GCNHazardRecognizer deal with restartable +/// groups, not hard clauses. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/SmallVector.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-insert-hard-clauses" + +namespace { + +enum HardClauseType { + // Texture, buffer, global or scratch memory instructions. + HARDCLAUSE_VMEM, + // Flat (not global or scratch) memory instructions. + HARDCLAUSE_FLAT, + // Instructions that access LDS. + HARDCLAUSE_LDS, + // Scalar memory instructions. + HARDCLAUSE_SMEM, + // VALU instructions. + HARDCLAUSE_VALU, + LAST_REAL_HARDCLAUSE_TYPE = HARDCLAUSE_VALU, + + // Internal instructions, which are allowed in the middle of a hard clause, + // except for s_waitcnt. + HARDCLAUSE_INTERNAL, + // Instructions that are not allowed in a hard clause: SALU, export, branch, + // message, GDS, s_waitcnt and anything else not mentioned above. + HARDCLAUSE_ILLEGAL, +}; + +HardClauseType getHardClauseType(const MachineInstr &MI) { + // On current architectures we only get a benefit from clausing loads. + if (MI.mayLoad()) { + if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) + return HARDCLAUSE_VMEM; + if (SIInstrInfo::isFLAT(MI)) + return HARDCLAUSE_FLAT; + // TODO: LDS + if (SIInstrInfo::isSMRD(MI)) + return HARDCLAUSE_SMEM; + } + + // Don't form VALU clauses. It's not clear what benefit they give, if any. + + // In practice s_nop is the only internal instruction we're likely to see. + // It's safe to treat the rest as illegal. 
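[Editor's note: the classification in getHardClauseType relies on the SIInstrInfo predicates, but the shape of the decision can be shown with a self-contained sketch: map each instruction to a clause type and treat only same-type loads as clause candidates. FakeInst and classify are simplified stand-ins for illustration, not the real MachineInstr queries.]

// Simplified stand-in for getHardClauseType: classify by coarse instruction
// kind; only loads of a "real" clause type are candidates for clausing.
#include <iostream>
#include <string>

enum ClauseType { VMEM, FLAT, SMEM, INTERNAL, ILLEGAL };

struct FakeInst {          // stand-in for MachineInstr, illustration only
  bool IsLoad;
  std::string Kind;        // "vmem", "flat", "smem", "s_nop", ...
};

static ClauseType classify(const FakeInst &I) {
  if (I.IsLoad) {
    if (I.Kind == "vmem") return VMEM;
    if (I.Kind == "flat") return FLAT;
    if (I.Kind == "smem") return SMEM;
  }
  if (I.Kind == "s_nop") return INTERNAL;  // allowed inside a clause
  return ILLEGAL;                          // SALU, export, branch, waitcnt, ...
}

int main() {
  std::cout << classify({true, "vmem"}) << ' '    // 0 (VMEM)
            << classify({false, "s_nop"}) << ' '  // 3 (INTERNAL)
            << classify({false, "salu"}) << '\n'; // 4 (ILLEGAL)
}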
+ if (MI.getOpcode() == AMDGPU::S_NOP) + return HARDCLAUSE_INTERNAL; + return HARDCLAUSE_ILLEGAL; +} + +class SIInsertHardClauses : public MachineFunctionPass { +public: + static char ID; + + SIInsertHardClauses() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + // Track information about a clause as we discover it. + struct ClauseInfo { + // The type of all (non-internal) instructions in the clause. + HardClauseType Type = HARDCLAUSE_ILLEGAL; + // The first (necessarily non-internal) instruction in the clause. + MachineInstr *First = nullptr; + // The last non-internal instruction in the clause. + MachineInstr *Last = nullptr; + // The length of the clause including any internal instructions in the + // middle or after the end of the clause. + unsigned Length = 0; + // The base operands of *Last. + SmallVector<const MachineOperand *, 4> BaseOps; + }; + + bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) { + // Get the size of the clause excluding any internal instructions at the + // end. + unsigned Size = + std::distance(CI.First->getIterator(), CI.Last->getIterator()) + 1; + if (Size < 2) + return false; + assert(Size <= 64 && "Hard clause is too long!"); + + auto &MBB = *CI.First->getParent(); + auto ClauseMI = + BuildMI(MBB, *CI.First, DebugLoc(), SII->get(AMDGPU::S_CLAUSE)) + .addImm(Size - 1); + finalizeBundle(MBB, ClauseMI->getIterator(), + std::next(CI.Last->getIterator())); + return true; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(MF.getFunction())) + return false; + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (!ST.hasHardClauses()) + return false; + + const SIInstrInfo *SII = ST.getInstrInfo(); + const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + + bool Changed = false; + for (auto &MBB : MF) { + ClauseInfo CI; + for (auto &MI : MBB) { + HardClauseType Type = getHardClauseType(MI); + + int64_t Dummy1; + bool Dummy2; + unsigned Dummy3; + SmallVector<const MachineOperand *, 4> BaseOps; + if (Type <= LAST_REAL_HARDCLAUSE_TYPE) { + if (!SII->getMemOperandsWithOffsetWidth(MI, BaseOps, Dummy1, Dummy2, + Dummy3, TRI)) { + // We failed to get the base operands, so we'll never clause this + // instruction with any other, so pretend it's illegal. + Type = HARDCLAUSE_ILLEGAL; + } + } + + if (CI.Length == 64 || + (CI.Length && Type != HARDCLAUSE_INTERNAL && + (Type != CI.Type || + // Note that we lie to shouldClusterMemOps about the size of the + // cluster. When shouldClusterMemOps is called from the machine + // scheduler it limits the size of the cluster to avoid increasing + // register pressure too much, but this pass runs after register + // allocation so there is no need for that kind of limit. + !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2, 2)))) { + // Finish the current clause. + Changed |= emitClause(CI, SII); + CI = ClauseInfo(); + } + + if (CI.Length) { + // Extend the current clause. + ++CI.Length; + if (Type != HARDCLAUSE_INTERNAL) { + CI.Last = &MI; + CI.BaseOps = std::move(BaseOps); + } + } else if (Type <= LAST_REAL_HARDCLAUSE_TYPE) { + // Start a new clause. + CI = ClauseInfo{Type, &MI, &MI, 1, std::move(BaseOps)}; + } + } + + // Finish the last clause in the basic block if any. 
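[Editor's note: emitClause above encodes the clause length as Size - 1 in the s_clause immediate, drops trailing internal instructions from that count, emits nothing for clauses shorter than two instructions, and asserts a 64-instruction cap. A small sketch of that bookkeeping, using plain indices in place of MachineInstr iterators; clauseImmediate is an illustrative name, not this patch's code.]

// Sketch of the clause-size bookkeeping: trailing internal instructions are
// not counted, a clause needs at least two instructions, s_clause takes Size-1.
#include <cassert>
#include <iostream>

// firstIdx/lastIdx: positions of the first and last *non-internal*
// instructions of the clause within the block (stand-ins for iterators).
static int clauseImmediate(int firstIdx, int lastIdx) {
  int Size = lastIdx - firstIdx + 1;  // excludes trailing internal instructions
  if (Size < 2)
    return -1;                        // too short, no s_clause emitted
  assert(Size <= 64 && "hard clause is too long");
  return Size - 1;                    // immediate operand of s_clause
}

int main() {
  std::cout << clauseImmediate(10, 10) << ' '   // -1: single instruction
            << clauseImmediate(10, 13) << '\n'; //  3: clause of 4 instructions
}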
+ if (CI.Length) + Changed |= emitClause(CI, SII); + } + + return Changed; + } +}; + +} // namespace + +char SIInsertHardClauses::ID = 0; + +char &llvm::SIInsertHardClausesID = SIInsertHardClauses::ID; + +INITIALIZE_PASS(SIInsertHardClauses, DEBUG_TYPE, "SI Insert Hard Clauses", + false, false) diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp index 80c044ec00cb3..052db5f6ea718 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -18,9 +18,11 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -28,6 +30,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DebugLoc.h" +#include "llvm/InitializePasses.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -52,21 +55,22 @@ private: const SIRegisterInfo *TRI = nullptr; const SIInstrInfo *TII = nullptr; unsigned SkipThreshold = 0; + MachineDominatorTree *MDT = nullptr; + + MachineBasicBlock *EarlyExitBlock = nullptr; bool shouldSkip(const MachineBasicBlock &From, const MachineBasicBlock &To) const; - bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB); - - void kill(MachineInstr &MI); + bool dominatesAllReachable(MachineBasicBlock &MBB); + void createEarlyExitBlock(MachineBasicBlock &MBB); + void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + DebugLoc DL); - MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; + bool kill(MachineInstr &MI); bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); - bool optimizeVccBranch(MachineInstr &MI) const; - public: static char ID; @@ -79,6 +83,8 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -87,8 +93,11 @@ public: char SIInsertSkips::ID = 0; -INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE, - "SI insert s_cbranch_execz instructions", false, false) +INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE, + "SI insert s_cbranch_execz instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE, + "SI insert s_cbranch_execz instructions", false, false) char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID; @@ -146,42 +155,110 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, return false; } -bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) { - MachineBasicBlock &MBB = *MI.getParent(); - MachineFunction *MF = MBB.getParent(); - - if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS || - !shouldSkip(MBB, MBB.getParent()->back())) - return false; +/// Check whether \p MBB dominates all blocks that are reachable from it. 
+bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) { + for (MachineBasicBlock *Other : depth_first(&MBB)) { + if (!MDT->dominates(&MBB, Other)) + return false; + } + return true; +} - MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator()); +static void generatePsEndPgm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + const SIInstrInfo *TII) { + // Generate "null export; s_endpgm". + BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE)) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addImm(1) // vm + .addImm(0) // compr + .addImm(0); // en + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0); +} - const DebugLoc &DL = MI.getDebugLoc(); +void SIInsertSkips::createEarlyExitBlock(MachineBasicBlock &MBB) { + MachineFunction *MF = MBB.getParent(); + DebugLoc DL; - // If the exec mask is non-zero, skip the next two instructions - BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addMBB(&NextBB); + assert(!EarlyExitBlock); + EarlyExitBlock = MF->CreateMachineBasicBlock(); + MF->insert(MF->end(), EarlyExitBlock); - MachineBasicBlock::iterator Insert = SkipBB->begin(); + generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII); +} - // Exec mask is zero: Export to NULL target... - BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE)) - .addImm(0x09) // V_008DFC_SQ_EXP_NULL - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addImm(1) // vm - .addImm(0) // compr - .addImm(0); // en +/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given +/// iterator. Only applies to pixel shaders. +void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL) { + MachineFunction *MF = MBB.getParent(); + assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS); + + // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a + // basic block that has no further successors (e.g., there was an + // `unreachable` there in IR). This can happen with original source of the + // form: + // + // if (uniform_condition) { + // write_to_memory(); + // discard; + // } + // + // In this case, we write the "null_export; s_endpgm" skip code in the + // already-existing basic block. + auto NextBBI = std::next(MBB.getIterator()); + bool NoSuccessor = I == MBB.end() && + llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end(); + + if (NoSuccessor) { + generatePsEndPgm(MBB, I, DL, TII); + } else { + if (!EarlyExitBlock) { + createEarlyExitBlock(MBB); + // Update next block pointer to reflect any new blocks + NextBBI = std::next(MBB.getIterator()); + } - // ... and terminate wavefront. - BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0); + auto BranchMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) + .addMBB(EarlyExitBlock); + + // Split the block if the branch will not come at the end. 
+ auto Next = std::next(BranchMI->getIterator()); + if (Next != MBB.end() && !Next->isTerminator()) { + MachineBasicBlock *SplitBB = + MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + MF->insert(NextBBI, SplitBB); + SplitBB->splice(SplitBB->begin(), &MBB, I, MBB.end()); + SplitBB->transferSuccessorsAndUpdatePHIs(&MBB); + // FIXME: the expectation is that this will be used near the beginning + // of a block so just assume all registers are still live. + for (auto LiveIn : MBB.liveins()) + SplitBB->addLiveIn(LiveIn); + MBB.addSuccessor(SplitBB); + + // Update dominator tree + using DomTreeT = DomTreeBase<MachineBasicBlock>; + SmallVector<DomTreeT::UpdateType, 16> DTUpdates; + for (MachineBasicBlock *Succ : SplitBB->successors()) { + DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ}); + DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ}); + } + DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB}); + MDT->getBase().applyUpdates(DTUpdates); + } - return true; + MBB.addSuccessor(EarlyExitBlock); + MDT->getBase().insertEdge(&MBB, EarlyExitBlock); + } } -void SIInsertSkips::kill(MachineInstr &MI) { +/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions. +/// Return true unless the terminator is a no-op. +bool SIInsertSkips::kill(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); @@ -268,7 +345,7 @@ void SIInsertSkips::kill(MachineInstr &MI) { I.addImm(0); // omod } - break; + return true; } case AMDGPU::SI_KILL_I1_TERMINATOR: { const MachineFunction *MF = MI.getParent()->getParent(); @@ -283,11 +360,13 @@ void SIInsertSkips::kill(MachineInstr &MI) { int64_t Imm = Op.getImm(); assert(Imm == 0 || Imm == -1); - if (Imm == KillVal) + if (Imm == KillVal) { BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec) .addImm(0); - break; + return true; + } + return false; } unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64; @@ -296,27 +375,13 @@ void SIInsertSkips::kill(MachineInstr &MI) { BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec) .addReg(Exec) .add(Op); - break; + return true; } default: llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR"); } } -MachineBasicBlock *SIInsertSkips::insertSkipBlock( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - MachineFunction *MF = MBB.getParent(); - - MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock(); - MachineFunction::iterator MBBI(MBB); - ++MBBI; - - MF->insert(MBBI, SkipBB); - MBB.addSuccessor(SkipBB); - - return SkipBB; -} - // Returns true if a branch over the block was inserted. bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB) { @@ -334,143 +399,24 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, return true; } -bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const { - // Match: - // sreg = -1 - // vcc = S_AND_B64 exec, sreg - // S_CBRANCH_VCC[N]Z - // => - // S_CBRANCH_EXEC[N]Z - bool Changed = false; - MachineBasicBlock &MBB = *MI.getParent(); - const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>(); - const bool IsWave32 = ST.isWave32(); - const unsigned CondReg = TRI->getVCC(); - const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - const unsigned And = IsWave32 ? 
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - - MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(), - E = MBB.rend(); - bool ReadsCond = false; - unsigned Threshold = 5; - for (++A ; A != E ; ++A) { - if (!--Threshold) - return false; - if (A->modifiesRegister(ExecReg, TRI)) - return false; - if (A->modifiesRegister(CondReg, TRI)) { - if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And) - return false; - break; - } - ReadsCond |= A->readsRegister(CondReg, TRI); - } - if (A == E) - return false; - - MachineOperand &Op1 = A->getOperand(1); - MachineOperand &Op2 = A->getOperand(2); - if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) { - TII->commuteInstruction(*A); - Changed = true; - } - if (Op1.getReg() != ExecReg) - return Changed; - if (Op2.isImm() && Op2.getImm() != -1) - return Changed; - - unsigned SReg = AMDGPU::NoRegister; - if (Op2.isReg()) { - SReg = Op2.getReg(); - auto M = std::next(A); - bool ReadsSreg = false; - for ( ; M != E ; ++M) { - if (M->definesRegister(SReg, TRI)) - break; - if (M->modifiesRegister(SReg, TRI)) - return Changed; - ReadsSreg |= M->readsRegister(SReg, TRI); - } - if (M == E || - !M->isMoveImmediate() || - !M->getOperand(1).isImm() || - M->getOperand(1).getImm() != -1) - return Changed; - // First if sreg is only used in and instruction fold the immediate - // into that and. - if (!ReadsSreg && Op2.isKill()) { - A->getOperand(2).ChangeToImmediate(-1); - M->eraseFromParent(); - } - } - - if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) && - MI.killsRegister(CondReg, TRI)) - A->eraseFromParent(); - - bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ; - if (SReg == ExecReg) { - if (IsVCCZ) { - MI.eraseFromParent(); - return true; - } - MI.setDesc(TII->get(AMDGPU::S_BRANCH)); - } else { - MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ - : AMDGPU::S_CBRANCH_EXECNZ)); - } - - MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI)); - MI.addImplicitDefUseOperands(*MBB.getParent()); - - return true; -} - bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); SkipThreshold = SkipThresholdFlag; - bool HaveKill = false; + SmallVector<MachineInstr *, 4> KillInstrs; bool MadeChange = false; - // Track depth of exec mask, divergent branches. - SmallVector<MachineBasicBlock *, 16> ExecBranchStack; - - MachineFunction::iterator NextBB; - - MachineBasicBlock *EmptyMBBAtEnd = nullptr; - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; BI = NextBB) { - NextBB = std::next(BI); - MachineBasicBlock &MBB = *BI; - bool HaveSkipBlock = false; - - if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) { - // Reached convergence point for last divergent branch. - ExecBranchStack.pop_back(); - } - - if (HaveKill && ExecBranchStack.empty()) { - HaveKill = false; - - // TODO: Insert skip if exec is 0? 
- } - + for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator I, Next; for (I = MBB.begin(); I != MBB.end(); I = Next) { Next = std::next(I); - MachineInstr &MI = *I; switch (MI.getOpcode()) { - case AMDGPU::S_CBRANCH_EXECZ: - ExecBranchStack.push_back(MI.getOperand(0).getMBB()); - break; case AMDGPU::SI_MASK_BRANCH: - ExecBranchStack.push_back(MI.getOperand(0).getMBB()); MadeChange |= skipMaskBranch(MI, MBB); break; @@ -478,64 +424,60 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { // Optimize out branches to the next block. // FIXME: Shouldn't this be handled by BranchFolding? if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) { + assert(&MI == &MBB.back()); MI.eraseFromParent(); - } else if (HaveSkipBlock) { - // Remove the given unconditional branch when a skip block has been - // inserted after the current one and let skip the two instructions - // performing the kill if the exec mask is non-zero. - MI.eraseFromParent(); + MadeChange = true; } break; case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: - case AMDGPU::SI_KILL_I1_TERMINATOR: + case AMDGPU::SI_KILL_I1_TERMINATOR: { MadeChange = true; - kill(MI); - - if (ExecBranchStack.empty()) { - if (NextBB != BE && skipIfDead(MI, *NextBB)) { - HaveSkipBlock = true; - NextBB = std::next(BI); - BE = MF.end(); - } + bool CanKill = kill(MI); + + // Check if we can add an early "if exec=0 { end shader }". + // + // Note that we _always_ do this if it is correct, even if the kill + // happens fairly late in the shader, because the null export should + // generally still be cheaper than normal export(s). + // + // TODO: The dominatesAllReachable check is conservative: if the + // dominance is only missing due to _uniform_ branches, we could + // in fact insert the early-exit as well. + if (CanKill && + MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS && + dominatesAllReachable(MBB)) { + // Mark the instruction for kill-if-dead insertion. We delay this + // change because it modifies the CFG. + KillInstrs.push_back(&MI); } else { - HaveKill = true; + MI.eraseFromParent(); } - - MI.eraseFromParent(); break; + } - case AMDGPU::SI_RETURN_TO_EPILOG: - // FIXME: Should move somewhere else - assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid()); - - // Graphics shaders returning non-void shouldn't contain S_ENDPGM, - // because external bytecode will be appended at the end. - if (BI != --MF.end() || I != MBB.getFirstTerminator()) { - // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at - // the end and jump there. 
- if (!EmptyMBBAtEnd) { - EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); - MF.insert(MF.end(), EmptyMBBAtEnd); - } - - MBB.addSuccessor(EmptyMBBAtEnd); - BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) - .addMBB(EmptyMBBAtEnd); - I->eraseFromParent(); + case AMDGPU::SI_KILL_CLEANUP: + if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS && + dominatesAllReachable(MBB)) { + KillInstrs.push_back(&MI); + } else { + MI.eraseFromParent(); } break; - case AMDGPU::S_CBRANCH_VCCZ: - case AMDGPU::S_CBRANCH_VCCNZ: - MadeChange |= optimizeVccBranch(MI); - break; - default: break; } } } + for (MachineInstr *Kill : KillInstrs) { + skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()), + Kill->getDebugLoc()); + Kill->eraseFromParent(); + } + KillInstrs.clear(); + EarlyExitBlock = nullptr; + return MadeChange; } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ef662d55cb0a9..2a157eb20ab47 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -32,6 +32,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -57,7 +58,6 @@ #include <cstring> #include <memory> #include <utility> -#include <vector> using namespace llvm; @@ -109,15 +109,13 @@ iterator_range<enum_iterator<InstCounterType>> inst_counter_types() { enum_iterator<InstCounterType>(NUM_INST_CNTS)); } -using RegInterval = std::pair<signed, signed>; +using RegInterval = std::pair<int, int>; struct { - uint32_t VmcntMax; - uint32_t ExpcntMax; - uint32_t LgkmcntMax; - uint32_t VscntMax; - int32_t NumVGPRsMax; - int32_t NumSGPRsMax; + unsigned VmcntMax; + unsigned ExpcntMax; + unsigned LgkmcntMax; + unsigned VscntMax; } HardwareLimits; struct { @@ -143,7 +141,7 @@ enum WaitEventType { NUM_WAIT_EVENTS, }; -static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = { +static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = { (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | (1 << SQ_MESSAGE), @@ -166,6 +164,28 @@ enum RegisterMapping { NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. }; +// Enumerate different types of result-returning VMEM operations. Although +// s_waitcnt orders them all with a single vmcnt counter, in the absence of +// s_waitcnt only instructions of the same VmemType are guaranteed to write +// their results in order -- so there is no need to insert an s_waitcnt between +// two instructions of the same type that write the same vgpr. +enum VmemType { + // BUF instructions and MIMG instructions without a sampler. + VMEM_NOSAMPLER, + // MIMG instructions with a sampler. + VMEM_SAMPLER, +}; + +VmemType getVmemType(const MachineInstr &Inst) { + assert(SIInstrInfo::isVMEM(Inst)); + if (!SIInstrInfo::isMIMG(Inst)) + return VMEM_NOSAMPLER; + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode()); + return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler + ? VMEM_SAMPLER + : VMEM_NOSAMPLER; +} + void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { switch (T) { case VM_CNT: @@ -195,12 +215,9 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { // "s_waitcnt 0" before use. 
class WaitcntBrackets { public: - WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) { - for (auto T : inst_counter_types()) - memset(VgprScores[T], 0, sizeof(VgprScores[T])); - } + WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {} - static uint32_t getWaitCountMax(InstCounterType T) { + static unsigned getWaitCountMax(InstCounterType T) { switch (T) { case VM_CNT: return HardwareLimits.VmcntMax; @@ -216,17 +233,13 @@ public: return 0; } - uint32_t getScoreLB(InstCounterType T) const { + unsigned getScoreLB(InstCounterType T) const { assert(T < NUM_INST_CNTS); - if (T >= NUM_INST_CNTS) - return 0; return ScoreLBs[T]; } - uint32_t getScoreUB(InstCounterType T) const { + unsigned getScoreUB(InstCounterType T) const { assert(T < NUM_INST_CNTS); - if (T >= NUM_INST_CNTS) - return 0; return ScoreUBs[T]; } @@ -242,7 +255,7 @@ public: return EXP_CNT; } - uint32_t getRegScore(int GprNo, InstCounterType T) { + unsigned getRegScore(int GprNo, InstCounterType T) { if (GprNo < NUM_ALL_VGPRS) { return VgprScores[T][GprNo]; } @@ -250,30 +263,16 @@ public: return SgprScores[GprNo - NUM_ALL_VGPRS]; } - void clear() { - memset(ScoreLBs, 0, sizeof(ScoreLBs)); - memset(ScoreUBs, 0, sizeof(ScoreUBs)); - PendingEvents = 0; - memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents)); - for (auto T : inst_counter_types()) - memset(VgprScores[T], 0, sizeof(VgprScores[T])); - memset(SgprScores, 0, sizeof(SgprScores)); - } - bool merge(const WaitcntBrackets &Other); RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII, const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, unsigned OpNo, - bool Def) const; - - int32_t getMaxVGPR() const { return VgprUB; } - int32_t getMaxSGPR() const { return SgprUB; } + const SIRegisterInfo *TRI, unsigned OpNo) const; bool counterOutOfOrder(InstCounterType T) const; bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const; - void determineWait(InstCounterType T, uint32_t ScoreToWait, + void determineWait(InstCounterType T, unsigned ScoreToWait, AMDGPU::Waitcnt &Wait) const; void applyWaitcnt(const AMDGPU::Waitcnt &Wait); void applyWaitcnt(InstCounterType T, unsigned Count); @@ -286,6 +285,12 @@ public: return PendingEvents & (1 << E); } + bool hasMixedPendingEvents(InstCounterType T) const { + unsigned Events = PendingEvents & WaitEventMaskForInst[T]; + // Return true if more than one bit is set in Events. + return Events & (Events - 1); + } + bool hasPendingFlat() const { return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] && LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) || @@ -298,71 +303,77 @@ public: LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT]; } + // Return true if there might be pending writes to the specified vgpr by VMEM + // instructions with types different from V. 
+ bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const { + assert(GprNo < NUM_ALL_VGPRS); + return VgprVmemTypes[GprNo] & ~(1 << V); + } + + void clearVgprVmemTypes(int GprNo) { + assert(GprNo < NUM_ALL_VGPRS); + VgprVmemTypes[GprNo] = 0; + } + void print(raw_ostream &); void dump() { print(dbgs()); } private: struct MergeInfo { - uint32_t OldLB; - uint32_t OtherLB; - uint32_t MyShift; - uint32_t OtherShift; + unsigned OldLB; + unsigned OtherLB; + unsigned MyShift; + unsigned OtherShift; }; - static bool mergeScore(const MergeInfo &M, uint32_t &Score, - uint32_t OtherScore); + static bool mergeScore(const MergeInfo &M, unsigned &Score, + unsigned OtherScore); - void setScoreLB(InstCounterType T, uint32_t Val) { + void setScoreLB(InstCounterType T, unsigned Val) { assert(T < NUM_INST_CNTS); - if (T >= NUM_INST_CNTS) - return; ScoreLBs[T] = Val; } - void setScoreUB(InstCounterType T, uint32_t Val) { + void setScoreUB(InstCounterType T, unsigned Val) { assert(T < NUM_INST_CNTS); - if (T >= NUM_INST_CNTS) - return; ScoreUBs[T] = Val; if (T == EXP_CNT) { - uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT); + unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT); if (ScoreLBs[T] < UB && UB < ScoreUBs[T]) ScoreLBs[T] = UB; } } - void setRegScore(int GprNo, InstCounterType T, uint32_t Val) { + void setRegScore(int GprNo, InstCounterType T, unsigned Val) { if (GprNo < NUM_ALL_VGPRS) { - if (GprNo > VgprUB) { - VgprUB = GprNo; - } + VgprUB = std::max(VgprUB, GprNo); VgprScores[T][GprNo] = Val; } else { assert(T == LGKM_CNT); - if (GprNo - NUM_ALL_VGPRS > SgprUB) { - SgprUB = GprNo - NUM_ALL_VGPRS; - } + SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS); SgprScores[GprNo - NUM_ALL_VGPRS] = Val; } } void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, - unsigned OpNo, uint32_t Val); + unsigned OpNo, unsigned Val); const GCNSubtarget *ST = nullptr; - uint32_t ScoreLBs[NUM_INST_CNTS] = {0}; - uint32_t ScoreUBs[NUM_INST_CNTS] = {0}; - uint32_t PendingEvents = 0; - bool MixedPendingEvents[NUM_INST_CNTS] = {false}; + unsigned ScoreLBs[NUM_INST_CNTS] = {0}; + unsigned ScoreUBs[NUM_INST_CNTS] = {0}; + unsigned PendingEvents = 0; // Remember the last flat memory operation. - uint32_t LastFlat[NUM_INST_CNTS] = {0}; + unsigned LastFlat[NUM_INST_CNTS] = {0}; // wait_cnt scores for every vgpr. // Keep track of the VgprUB and SgprUB to make merge at join efficient. - int32_t VgprUB = 0; - int32_t SgprUB = 0; - uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS]; + int VgprUB = -1; + int SgprUB = -1; + unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; // Wait cnt scores for every sgpr, only lgkmcnt is relevant. - uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0}; + unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0}; + // Bitmask of the VmemTypes of VMEM instructions that might have a pending + // write to each vgpr. 
+ unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; }; class SIInsertWaitcnts : public MachineFunctionPass { @@ -385,8 +396,7 @@ private: explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {} }; - std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index - DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap; + MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0 // because of amdgpu-waitcnt-forcezero flag @@ -464,10 +474,10 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII, const MachineRegisterInfo *MRI, const SIRegisterInfo *TRI, - unsigned OpNo, bool Def) const { + unsigned OpNo) const { const MachineOperand &Op = MI->getOperand(OpNo); - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) || - (Def && !Op.isDef()) || TRI->isAGPR(*MRI, Op.getReg())) + assert(Op.isReg()); + if (!TRI->isInAllocatableClass(Op.getReg()) || TRI->isAGPR(*MRI, Op.getReg())) return {-1, -1}; // A use via a PW operand does not need a waitcnt. @@ -475,29 +485,27 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, assert(!Op.getSubReg() || !Op.isUndef()); RegInterval Result; - const MachineRegisterInfo &MRIA = *MRI; unsigned Reg = TRI->getEncodingValue(Op.getReg()); - if (TRI->isVGPR(MRIA, Op.getReg())) { + if (TRI->isVGPR(*MRI, Op.getReg())) { assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL); Result.first = Reg - RegisterEncoding.VGPR0; assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); - } else if (TRI->isSGPRReg(MRIA, Op.getReg())) { + } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS; assert(Result.first >= NUM_ALL_VGPRS && Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); } // TODO: Handle TTMP - // else if (TRI->isTTMP(MRIA, Reg.getReg())) ... + // else if (TRI->isTTMP(*MRI, Reg.getReg())) ... 
else return {-1, -1}; - const MachineInstr &MIA = *MI; - const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo); + const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo); unsigned Size = TRI->getRegSizeInBits(*RC); - Result.second = Result.first + (Size / 32); + Result.second = Result.first + ((Size + 16) / 32); return Result; } @@ -506,13 +514,10 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, unsigned OpNo, - uint32_t Val) { - RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false); - LLVM_DEBUG({ - const MachineOperand &Opnd = MI->getOperand(OpNo); - assert(TRI->isVGPR(*MRI, Opnd.getReg())); - }); - for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + unsigned Val) { + RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo); + assert(TRI->isVGPR(*MRI, MI->getOperand(OpNo).getReg())); + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { setRegScore(RegNo, EXP_CNT, Val); } } @@ -521,19 +526,14 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &Inst) { - const MachineRegisterInfo &MRIA = *MRI; InstCounterType T = eventCounter(E); - uint32_t CurrScore = getScoreUB(T) + 1; + unsigned CurrScore = getScoreUB(T) + 1; if (CurrScore == 0) report_fatal_error("InsertWaitcnt score wraparound"); // PendingEvents and ScoreUB need to be update regardless if this event // changes the score of a register or not. // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. - if (!hasPendingEvent(E)) { - if (PendingEvents & WaitEventMaskForInst[T]) - MixedPendingEvents[T] = true; - PendingEvents |= 1 << E; - } + PendingEvents |= 1 << E; setScoreUB(T, CurrScore); if (T == EXP_CNT) { @@ -574,7 +574,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { const MachineOperand &Op = Inst.getOperand(I); - if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) { + if (Op.isReg() && !Op.isDef() && TRI->isVGPR(*MRI, Op.getReg())) { setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); } } @@ -622,7 +622,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { MachineOperand &DefMO = Inst.getOperand(I); if (DefMO.isReg() && DefMO.isDef() && - TRI->isVGPR(MRIA, DefMO.getReg())) { + TRI->isVGPR(*MRI, DefMO.getReg())) { setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT, CurrScore); } @@ -630,7 +630,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { MachineOperand &MO = Inst.getOperand(I); - if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) { + if (MO.isReg() && !MO.isDef() && TRI->isVGPR(*MRI, MO.getReg())) { setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); } } @@ -641,8 +641,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) { MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data); unsigned OpNo;//TODO: find the OpNo for this operand; - RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false); - for (signed RegNo = Interval.first; RegNo < Interval.second; + RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo); + for (int RegNo = 
Interval.first; RegNo < Interval.second; ++RegNo) { setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore); } @@ -650,10 +650,20 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } else { // Match the score to the destination registers. for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { - RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true); - if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS) + auto &Op = Inst.getOperand(I); + if (!Op.isReg() || !Op.isDef()) continue; - for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I); + if (T == VM_CNT) { + if (Interval.first >= NUM_ALL_VGPRS) + continue; + if (SIInstrInfo::isVMEM(Inst)) { + VmemType V = getVmemType(Inst); + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) + VgprVmemTypes[RegNo] |= 1 << V; + } + } + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { setRegScore(RegNo, T, CurrScore); } } @@ -666,8 +676,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, void WaitcntBrackets::print(raw_ostream &OS) { OS << '\n'; for (auto T : inst_counter_types()) { - uint32_t LB = getScoreLB(T); - uint32_t UB = getScoreUB(T); + unsigned LB = getScoreLB(T); + unsigned UB = getScoreUB(T); switch (T) { case VM_CNT: @@ -689,11 +699,11 @@ void WaitcntBrackets::print(raw_ostream &OS) { if (LB < UB) { // Print vgpr scores. - for (int J = 0; J <= getMaxVGPR(); J++) { - uint32_t RegScore = getRegScore(J, T); + for (int J = 0; J <= VgprUB; J++) { + unsigned RegScore = getRegScore(J, T); if (RegScore <= LB) continue; - uint32_t RelScore = RegScore - LB - 1; + unsigned RelScore = RegScore - LB - 1; if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) { OS << RelScore << ":v" << J << " "; } else { @@ -702,11 +712,11 @@ void WaitcntBrackets::print(raw_ostream &OS) { } // Also need to print sgpr scores for lgkm_cnt. if (T == LGKM_CNT) { - for (int J = 0; J <= getMaxSGPR(); J++) { - uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); + for (int J = 0; J <= SgprUB; J++) { + unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); if (RegScore <= LB) continue; - uint32_t RelScore = RegScore - LB - 1; + unsigned RelScore = RegScore - LB - 1; OS << RelScore << ":s" << J << " "; } } @@ -727,8 +737,8 @@ bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T, unsigned &Count) const { - const uint32_t LB = getScoreLB(T); - const uint32_t UB = getScoreUB(T); + const unsigned LB = getScoreLB(T); + const unsigned UB = getScoreUB(T); if (Count < UB && UB - Count > LB) return true; @@ -736,12 +746,12 @@ bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T, return false; } -void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait, +void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait, AMDGPU::Waitcnt &Wait) const { // If the score of src_operand falls within the bracket, we need an // s_waitcnt instruction. - const uint32_t LB = getScoreLB(T); - const uint32_t UB = getScoreUB(T); + const unsigned LB = getScoreLB(T); + const unsigned UB = getScoreUB(T); if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { if ((T == VM_CNT || T == LGKM_CNT) && hasPendingFlat() && @@ -758,7 +768,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait, } else { // If a counter has been maxed out avoid overflow by waiting for // MAX(CounterType) - 1 instead. 
- uint32_t NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); + unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); addWait(Wait, T, NeededWait); } } @@ -772,7 +782,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { } void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { - const uint32_t UB = getScoreUB(T); + const unsigned UB = getScoreUB(T); if (Count >= UB) return; if (Count != 0) { @@ -781,7 +791,6 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { setScoreLB(T, std::max(getScoreLB(T), UB - Count)); } else { setScoreLB(T, UB); - MixedPendingEvents[T] = false; PendingEvents &= ~WaitEventMaskForInst[T]; } } @@ -792,7 +801,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { // Scalar memory read always can go out of order. if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS)) return true; - return MixedPendingEvents[T]; + return hasMixedPendingEvents(T); } INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, @@ -954,10 +963,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( int CallAddrOpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); - RegInterval CallAddrOpInterval = ScoreBrackets.getRegInterval( - &MI, TII, MRI, TRI, CallAddrOpIdx, false); + RegInterval CallAddrOpInterval = + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx); - for (signed RegNo = CallAddrOpInterval.first; + for (int RegNo = CallAddrOpInterval.first; RegNo < CallAddrOpInterval.second; ++RegNo) ScoreBrackets.determineWait( LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); @@ -965,10 +974,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( int RtnAddrOpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); if (RtnAddrOpIdx != -1) { - RegInterval RtnAddrOpInterval = ScoreBrackets.getRegInterval( - &MI, TII, MRI, TRI, RtnAddrOpIdx, false); + RegInterval RtnAddrOpInterval = + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx); - for (signed RegNo = RtnAddrOpInterval.first; + for (int RegNo = RtnAddrOpInterval.first; RegNo < RtnAddrOpInterval.second; ++RegNo) ScoreBrackets.determineWait( LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); @@ -982,7 +991,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( // emitted. // If the source operand was defined by a load, add the s_waitcnt // instruction. + // + // Two cases are handled for destination operands: + // 1) If the destination operand was defined by a load, add the s_waitcnt + // instruction to guarantee the right WAW order. + // 2) If a destination operand that was used by a recent export/store ins, + // add s_waitcnt on exp_cnt to guarantee the WAR order. for (const MachineMemOperand *Memop : MI.memoperands()) { + const Value *Ptr = Memop->getValue(); + if (Memop->isStore() && SLoadAddresses.count(Ptr)) { + addWait(Wait, LGKM_CNT, 0); + if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second)) + SLoadAddresses.erase(Ptr); + } unsigned AS = Memop->getAddrSpace(); if (AS != AMDGPUAS::LOCAL_ADDRESS) continue; @@ -990,67 +1011,41 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( // VM_CNT is only relevant to vgpr or LDS. 
ScoreBrackets.determineWait( VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - } - - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - const MachineOperand &Op = MI.getOperand(I); - const MachineRegisterInfo &MRIA = *MRI; - RegInterval Interval = - ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false); - for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (TRI->isVGPR(MRIA, Op.getReg())) { - // VM_CNT is only relevant to vgpr or LDS. - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - } - ScoreBrackets.determineWait( - LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); - } - } - // End of for loop that looks at all source operands to decide vm_wait_cnt - // and lgk_wait_cnt. - - // Two cases are handled for destination operands: - // 1) If the destination operand was defined by a load, add the s_waitcnt - // instruction to guarantee the right WAW order. - // 2) If a destination operand that was used by a recent export/store ins, - // add s_waitcnt on exp_cnt to guarantee the WAR order. - if (MI.mayStore()) { - // FIXME: Should not be relying on memoperands. - for (const MachineMemOperand *Memop : MI.memoperands()) { - const Value *Ptr = Memop->getValue(); - if (SLoadAddresses.count(Ptr)) { - addWait(Wait, LGKM_CNT, 0); - if (PDT->dominates(MI.getParent(), - SLoadAddresses.find(Ptr)->second)) - SLoadAddresses.erase(Ptr); - } - unsigned AS = Memop->getAddrSpace(); - if (AS != AMDGPUAS::LOCAL_ADDRESS) - continue; - unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + if (Memop->isStore()) { ScoreBrackets.determineWait( EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } } + + // Loop over use and def operands. for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - MachineOperand &Def = MI.getOperand(I); - const MachineRegisterInfo &MRIA = *MRI; + MachineOperand &Op = MI.getOperand(I); + if (!Op.isReg()) + continue; RegInterval Interval = - ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true); - for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (TRI->isVGPR(MRIA, Def.getReg())) { - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - ScoreBrackets.determineWait( - EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I); + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + if (TRI->isVGPR(*MRI, Op.getReg())) { + // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the + // previous write and this write are the same type of VMEM + // instruction, in which case they're guaranteed to write their + // results in order anyway. + if (Op.isUse() || !SIInstrInfo::isVMEM(MI) || + ScoreBrackets.hasOtherPendingVmemTypes(RegNo, + getVmemType(MI))) { + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.clearVgprVmemTypes(RegNo); + } + if (Op.isDef()) { + ScoreBrackets.determineWait( + EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); + } } ScoreBrackets.determineWait( LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); } - } // End of for loop that looks at all dest operands. 
+ } } } @@ -1154,7 +1149,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( } LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI << '\n' + << "Old Instr: " << MI << "New Instr: " << *II << '\n'); if (!Wait.hasWait()) @@ -1171,7 +1166,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( Modified = true; LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI << '\n' + << "Old Instr: " << MI << "New Instr: " << *SWaitInst << '\n'); } @@ -1187,7 +1182,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( Modified = true; LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI << '\n' + << "Old Instr: " << MI << "New Instr: " << *SWaitInst << '\n'); } @@ -1303,10 +1298,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } } -bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score, - uint32_t OtherScore) { - uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift; - uint32_t OtherShifted = +bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, + unsigned OtherScore) { + unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift; + unsigned OtherShifted = OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift; Score = std::max(MyShifted, OtherShifted); return OtherShifted > MyShifted; @@ -1320,44 +1315,50 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score, bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { bool StrictDom = false; + VgprUB = std::max(VgprUB, Other.VgprUB); + SgprUB = std::max(SgprUB, Other.SgprUB); + for (auto T : inst_counter_types()) { // Merge event flags for this counter const bool OldOutOfOrder = counterOutOfOrder(T); - const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T]; - const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; + const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; + const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; if (OtherEvents & ~OldEvents) StrictDom = true; - if (Other.MixedPendingEvents[T] || - (OldEvents && OtherEvents && OldEvents != OtherEvents)) - MixedPendingEvents[T] = true; PendingEvents |= OtherEvents; // Merge scores for this counter - const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T]; - const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; + const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T]; + const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; + const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending); + if (NewUB < ScoreLBs[T]) + report_fatal_error("waitcnt score overflow"); + MergeInfo M; M.OldLB = ScoreLBs[T]; M.OtherLB = Other.ScoreLBs[T]; - M.MyShift = OtherPending > MyPending ? 
OtherPending - MyPending : 0; - M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift; + M.MyShift = NewUB - ScoreUBs[T]; + M.OtherShift = NewUB - Other.ScoreUBs[T]; - const uint32_t NewUB = ScoreUBs[T] + M.MyShift; - if (NewUB < ScoreUBs[T]) - report_fatal_error("waitcnt score overflow"); ScoreUBs[T] = NewUB; - ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift); StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]); bool RegStrictDom = false; - for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E; - J++) { + for (int J = 0; J <= VgprUB; J++) { RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); } + if (T == VM_CNT) { + for (int J = 0; J <= VgprUB; J++) { + unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J]; + RegStrictDom |= NewVmemTypes != VgprVmemTypes[J]; + VgprVmemTypes[J] = NewVmemTypes; + } + } + if (T == LGKM_CNT) { - for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1; - J != E; J++) { + for (int J = 0; J <= SgprUB; J++) { RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]); } } @@ -1366,9 +1367,6 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { StrictDom = true; } - VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR()); - SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR()); - return StrictDom; } @@ -1383,6 +1381,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ScoreBrackets.dump(); }); + // Assume VCCZ is correct at basic block boundaries, unless and until we need + // to handle cases where that is not true. + bool VCCZCorrect = true; + // Walk over the instructions. MachineInstr *OldWaitcntInstr = nullptr; @@ -1402,13 +1404,26 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, continue; } - bool VCCZBugWorkAround = false; + // We might need to restore vccz to its correct value for either of two + // different reasons; see ST->hasReadVCCZBug() and + // ST->partialVCCWritesUpdateVCCZ(). + bool RestoreVCCZ = false; if (readsVCCZ(Inst)) { - if (ScoreBrackets.getScoreLB(LGKM_CNT) < - ScoreBrackets.getScoreUB(LGKM_CNT) && - ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { - if (ST->hasReadVCCZBug()) - VCCZBugWorkAround = true; + if (!VCCZCorrect) + RestoreVCCZ = true; + else if (ST->hasReadVCCZBug()) { + // There is a hardware bug on CI/SI where SMRD instruction may corrupt + // vccz bit, so when we detect that an instruction may read from a + // corrupt vccz bit, we need to: + // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD + // operations to complete. + // 2. Restore the correct value of vccz by writing the current value + // of vcc back to vcc. + if (ScoreBrackets.getScoreLB(LGKM_CNT) < + ScoreBrackets.getScoreUB(LGKM_CNT) && + ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { + RestoreVCCZ = true; + } } } @@ -1419,6 +1434,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } } + if (!ST->partialVCCWritesUpdateVCCZ()) { + // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz. + // Writes to vcc will fix it. + if (Inst.definesRegister(AMDGPU::VCC_LO) || + Inst.definesRegister(AMDGPU::VCC_HI)) + VCCZCorrect = false; + else if (Inst.definesRegister(AMDGPU::VCC)) + VCCZCorrect = true; + } + // Generate an s_waitcnt instruction to be placed before // cur_Inst, if needed. 
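The vccz bookkeeping added above amounts to a small state machine. The sketch below models just that decision logic in plain C++; the Quirks struct, the event names and the straight-line event list are invented for illustration and are not the pass's real interfaces.

#include <cstdio>
#include <vector>

// Simplified model of the vccz tracking above; the subtarget quirks are
// reduced to two booleans and the events are invented for illustration.
struct Quirks {
  bool HasReadVCCZBug;             // SMEM completion may corrupt vccz (SI/CI)
  bool PartialVCCWritesUpdateVCCZ; // false up to gfx9: vcc_lo/vcc_hi writes miss vccz
};

enum Event { ReadVCCZ, WriteVCCLoOrHi, WriteFullVCC, SMEMIssued };

int main() {
  Quirks Q{true, false};
  bool VCCZCorrect = true;  // assumed correct at block entry
  bool SMEMPending = false; // stands in for outstanding SMEM in lgkm_cnt

  std::vector<Event> Block = {WriteVCCLoOrHi, SMEMIssued, ReadVCCZ, WriteFullVCC, ReadVCCZ};
  for (Event E : Block) {
    switch (E) {
    case SMEMIssued:
      SMEMPending = true;
      break;
    case WriteVCCLoOrHi:
      if (!Q.PartialVCCWritesUpdateVCCZ)
        VCCZCorrect = false; // partial write left vccz stale
      break;
    case WriteFullVCC:
      if (!Q.PartialVCCWritesUpdateVCCZ)
        VCCZCorrect = true;  // full vcc write refreshes vccz
      break;
    case ReadVCCZ: {
      bool NeedSMEMDrain = Q.HasReadVCCZBug && SMEMPending; // bug: SMEM may corrupt vccz
      bool Restore = !VCCZCorrect || NeedSMEMDrain;
      std::printf("vccz read: %s\n", Restore ? "restore vcc -> vcc first" : "ok");
      if (NeedSMEMDrain)
        SMEMPending = false; // step 1 from the comment: s_waitcnt lgkm(0) drains SMEM
      if (Restore)
        VCCZCorrect = true;  // step 2: writing vcc back to vcc refreshes vccz
      break;
    }
    }
  }
  return 0;
}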
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr); @@ -1444,7 +1469,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // TODO: Remove this work-around after fixing the scheduler and enable the // assert above. - if (VCCZBugWorkAround) { + if (RestoreVCCZ) { // Restore the vccz bit. Any time a value is written to vcc, the vcc // bit is updated, so we can restore the bit by reading the value of // vcc and then writing it back to the register. @@ -1452,6 +1477,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), TRI->getVCC()) .addReg(TRI->getVCC()); + VCCZCorrect = true; Modified = true; } @@ -1479,29 +1505,23 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0; - HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs(); - HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs(); - assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS); - assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS); + unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(); + unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); + assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS); + assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0); - RegisterEncoding.VGPRL = - RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1; + RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1; RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0); - RegisterEncoding.SGPRL = - RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1; + RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1; TrackedWaitcntSet.clear(); - RpotIdxMap.clear(); BlockInfos.clear(); // Keep iterating over the blocks in reverse post order, inserting and // updating s_waitcnt where needed, until a fix point is reached. 
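A minimal stand-alone sketch of the fixed-point scheme described by the comment above, with integers standing in for WaitcntBrackets and a hand-made four-block graph; it mirrors the Dirty/Repeat handling of the loop that follows but is not the pass's code.

#include <algorithm>
#include <cstdio>
#include <vector>

// Blocks are visited in a fixed (reverse post) order, a block's outgoing
// state is merged into each successor, and only a change to a successor at
// or before the current position (a back edge) forces another outer pass.
int main() {
  struct Block { std::vector<int> Succs; int Gen = 0; int In = 0; bool Dirty = true; };
  std::vector<Block> B(4);            // 0 -> 1 -> 2 -> {1, 3}; 2 -> 1 is the back edge
  B[0].Succs = {1};
  B[1].Succs = {2}; B[1].Gen = 2;
  B[2].Succs = {1, 3}; B[2].Gen = 1;

  bool Repeat;
  do {
    Repeat = false;
    for (int I = 0, E = (int)B.size(); I != E; ++I) {
      if (!B[I].Dirty)
        continue;
      B[I].Dirty = false;
      int Out = std::max(B[I].In, B[I].Gen);      // "process" the block
      for (int S : B[I].Succs) {
        int Merged = std::max(B[S].In, Out);      // stands in for WaitcntBrackets::merge
        if (Merged != B[S].In) {
          B[S].In = Merged;
          B[S].Dirty = true;
          if (S <= I)                             // earlier in the order: revisit needed
            Repeat = true;
        }
      }
    }
  } while (Repeat);

  for (int I = 0; I < (int)B.size(); ++I)
    std::printf("block %d incoming state %d\n", I, B[I].In);
  return 0;
}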
- for (MachineBasicBlock *MBB : - ReversePostOrderTraversal<MachineFunction *>(&MF)) { - RpotIdxMap[MBB] = BlockInfos.size(); - BlockInfos.emplace_back(MBB); - } + for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF)) + BlockInfos.insert({MBB, BlockInfo(MBB)}); std::unique_ptr<WaitcntBrackets> Brackets; bool Modified = false; @@ -1509,12 +1529,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { do { Repeat = false; - for (BlockInfo &BI : BlockInfos) { + for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE; + ++BII) { + BlockInfo &BI = BII->second; if (!BI.Dirty) continue; - unsigned Idx = std::distance(&*BlockInfos.begin(), &BI); - if (BI.Incoming) { if (!Brackets) Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming); @@ -1524,7 +1544,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { if (!Brackets) Brackets = std::make_unique<WaitcntBrackets>(ST); else - Brackets->clear(); + *Brackets = WaitcntBrackets(ST); } Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets); @@ -1533,11 +1553,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { if (Brackets->hasPending()) { BlockInfo *MoveBracketsToSucc = nullptr; for (MachineBasicBlock *Succ : BI.MBB->successors()) { - unsigned SuccIdx = RpotIdxMap[Succ]; - BlockInfo &SuccBI = BlockInfos[SuccIdx]; + auto SuccBII = BlockInfos.find(Succ); + BlockInfo &SuccBI = SuccBII->second; if (!SuccBI.Incoming) { SuccBI.Dirty = true; - if (SuccIdx <= Idx) + if (SuccBII <= BII) Repeat = true; if (!MoveBracketsToSucc) { MoveBracketsToSucc = &SuccBI; @@ -1546,7 +1566,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } else if (SuccBI.Incoming->merge(*Brackets)) { SuccBI.Dirty = true; - if (SuccIdx <= Idx) + if (SuccBII <= BII) Repeat = true; } } @@ -1612,13 +1632,15 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { // TODO: Could insert earlier and schedule more liberally with operations // that only use caller preserved registers. MachineBasicBlock &EntryBB = MF.front(); + MachineBasicBlock::iterator I = EntryBB.begin(); + for (MachineBasicBlock::iterator E = EntryBB.end(); + I != E && (I->isPHI() || I->isMetaInstruction()); ++I) + ; + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); if (ST->hasVscnt()) - BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), - TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(0); - BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); Modified = true; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 4dcbe92861f23..428c21c896d50 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -114,6 +114,9 @@ class InstSI <dag outs, dag ins, string asm = "", // FLAT_SCRATCH segment. Must be 0 for non-FLAT instructions. field bit IsNonFlatSeg = 0; + // Reads the mode register, usually for FP environment. 
+ field bit ReadsModeReg = 0; + // This bit indicates that this uses the floating point double precision // rounding mode flags field bit FPDPRounding = 0; @@ -303,7 +306,7 @@ class MIMGe_gfx10 <bits<8> op> : MIMGe { bits<3> dim; bits<2> nsa; bits<1> dlc; - bits<1> a16 = 0; // TODO: this should be an operand + bits<1> a16; let Inst{0} = op{7}; let Inst{2-1} = nsa; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d53950ca44655..9af8ffedce0f3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -63,6 +63,8 @@ using namespace llvm; +#define DEBUG_TYPE "si-instr-info" + #define GET_INSTRINFO_CTOR_DTOR #include "AMDGPUGenInstrInfo.inc" @@ -83,6 +85,12 @@ static cl::opt<unsigned> BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)")); +static cl::opt<bool> Fix16BitCopies( + "amdgpu-fix-16-bit-physreg-copies", + cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), + cl::init(true), + cl::ReallyHidden); + SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), RI(ST), ST(ST) { @@ -136,6 +144,8 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::V_ACCVGPR_READ_B32: + case AMDGPU::V_ACCVGPR_WRITE_B32: // No implicit operands. return MI.getNumOperands() == MI.getDesc().getNumOperands(); default: @@ -258,43 +268,49 @@ static bool isStride64(unsigned Opc) { } } -bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, - const MachineOperand *&BaseOp, - int64_t &Offset, - const TargetRegisterInfo *TRI) const { +bool SIInstrInfo::getMemOperandsWithOffsetWidth( + const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, + int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, + const TargetRegisterInfo *TRI) const { if (!LdSt.mayLoadOrStore()) return false; unsigned Opc = LdSt.getOpcode(); + OffsetIsScalable = false; + const MachineOperand *BaseOp, *OffsetOp; + int DataOpIdx; if (isDS(LdSt)) { - const MachineOperand *OffsetImm = - getNamedOperand(LdSt, AMDGPU::OpName::offset); - if (OffsetImm) { + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); + OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); + if (OffsetOp) { // Normal, single offset LDS instruction. - BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); - // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to - // report that here? - if (!BaseOp || !BaseOp->isReg()) + if (!BaseOp) { + // DS_CONSUME/DS_APPEND use M0 for the base address. + // TODO: find the implicit use operand for M0 and use that as BaseOp? + return false; + } + BaseOps.push_back(BaseOp); + Offset = OffsetOp->getImm(); + // Get appropriate operand, and compute width accordingly. + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + if (DataOpIdx == -1) + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + Width = getOpSize(LdSt, DataOpIdx); + } else { + // The 2 offset instructions use offset0 and offset1 instead. We can treat + // these as a load with a single offset if the 2 offsets are consecutive. + // We will use this for some partially aligned loads. 
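A small sketch of the dual-offset folding described above, assuming only what the comment and the surrounding hunk state: the two offsets are in element units, they must be consecutive, and the byte offset then scales by the element size (the *_ST64 variants further scale the element size by 64, which is left out here).

#include <cstdio>

// ds_read2/ds_write2 style accesses carry two element-indexed offsets; they
// are treated as one access only when the offsets are consecutive.
struct Folded { bool Ok; int ByteOffset; int ByteWidth; };

Folded foldDualOffset(unsigned Offset0, unsigned Offset1, int EltBytes) {
  if (Offset0 + 1 != Offset1)
    return {false, 0, 0};                  // not consecutive: cannot treat as one load
  return {true, EltBytes * (int)Offset0,   // single byte offset of the pair
          2 * EltBytes};                   // width covers data0 and data1
}

int main() {
  Folded A = foldDualOffset(4, 5, 4);      // e.g. a b32 pair at offsets 4 and 5
  Folded B = foldDualOffset(4, 6, 4);      // gap between offsets: not foldable
  std::printf("A: ok=%d offset=%d width=%d\n", A.Ok, A.ByteOffset, A.ByteWidth);
  std::printf("B: ok=%d\n", B.Ok);
  return 0;
}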
+ const MachineOperand *Offset0Op = + getNamedOperand(LdSt, AMDGPU::OpName::offset0); + const MachineOperand *Offset1Op = + getNamedOperand(LdSt, AMDGPU::OpName::offset1); + + unsigned Offset0 = Offset0Op->getImm(); + unsigned Offset1 = Offset1Op->getImm(); + if (Offset0 + 1 != Offset1) return false; - Offset = OffsetImm->getImm(); - - return true; - } - - // The 2 offset instructions use offset0 and offset1 instead. We can treat - // these as a load with a single offset if the 2 offsets are consecutive. We - // will use this for some partially aligned loads. - const MachineOperand *Offset0Imm = - getNamedOperand(LdSt, AMDGPU::OpName::offset0); - const MachineOperand *Offset1Imm = - getNamedOperand(LdSt, AMDGPU::OpName::offset1); - - uint8_t Offset0 = Offset0Imm->getImm(); - uint8_t Offset1 = Offset1Imm->getImm(); - - if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { // Each of these offsets is in element sized units, so we need to convert // to bytes of the individual reads. @@ -310,16 +326,20 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, if (isStride64(Opc)) EltSize *= 64; - BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); - if (!BaseOp->isReg()) - return false; - + BaseOps.push_back(BaseOp); Offset = EltSize * Offset0; - - return true; + // Get appropriate operand(s), and compute width accordingly. + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + if (DataOpIdx == -1) { + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + Width = getOpSize(LdSt, DataOpIdx); + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1); + Width += getOpSize(LdSt, DataOpIdx); + } else { + Width = getOpSize(LdSt, DataOpIdx); + } } - - return false; + return true; } if (isMUBUF(LdSt) || isMTBUF(LdSt)) { @@ -339,59 +359,78 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, const MachineOperand *OffsetImm = getNamedOperand(LdSt, AMDGPU::OpName::offset); - BaseOp = SOffset; + BaseOps.push_back(RSrc); + BaseOps.push_back(SOffset); Offset = OffsetImm->getImm(); - return true; - } - - const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); - if (!AddrReg) - return false; + } else { + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); + if (!BaseOp) // e.g. BUFFER_WBINVL1_VOL + return false; + BaseOps.push_back(BaseOp); - const MachineOperand *OffsetImm = - getNamedOperand(LdSt, AMDGPU::OpName::offset); - BaseOp = AddrReg; - Offset = OffsetImm->getImm(); - if (SOffset) // soffset can be an inline immediate. - Offset += SOffset->getImm(); + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + if (BaseOp) + BaseOps.push_back(BaseOp); - if (!BaseOp->isReg()) - return false; + const MachineOperand *OffsetImm = + getNamedOperand(LdSt, AMDGPU::OpName::offset); + Offset = OffsetImm->getImm(); + if (SOffset) // soffset can be an inline immediate. + Offset += SOffset->getImm(); + } + // Get appropriate operand, and compute width accordingly. + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + if (DataOpIdx == -1) + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); + Width = getOpSize(LdSt, DataOpIdx); + return true; + } + if (isMIMG(LdSt)) { + int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); + BaseOps.push_back(&LdSt.getOperand(SRsrcIdx)); + int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); + if (VAddr0Idx >= 0) { + // GFX10 possible NSA encoding. 
+ for (int I = VAddr0Idx; I < SRsrcIdx; ++I) + BaseOps.push_back(&LdSt.getOperand(I)); + } else { + BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr)); + } + Offset = 0; + // Get appropriate operand, and compute width accordingly. + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); + Width = getOpSize(LdSt, DataOpIdx); return true; } if (isSMRD(LdSt)) { - const MachineOperand *OffsetImm = - getNamedOperand(LdSt, AMDGPU::OpName::offset); - if (!OffsetImm) + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase); + if (!BaseOp) // e.g. S_MEMTIME return false; - - const MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase); - BaseOp = SBaseReg; - Offset = OffsetImm->getImm(); - if (!BaseOp->isReg()) - return false; - + BaseOps.push_back(BaseOp); + OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); + Offset = OffsetOp ? OffsetOp->getImm() : 0; + // Get appropriate operand, and compute width accordingly. + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst); + Width = getOpSize(LdSt, DataOpIdx); return true; } if (isFLAT(LdSt)) { - const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); - if (VAddr) { - // Can't analyze 2 offsets. - if (getNamedOperand(LdSt, AMDGPU::OpName::saddr)) - return false; - - BaseOp = VAddr; - } else { - // scratch instructions have either vaddr or saddr. - BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); - } - + // Instructions have either vaddr or saddr or both. + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + if (BaseOp) + BaseOps.push_back(BaseOp); + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); + if (BaseOp) + BaseOps.push_back(BaseOp); Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); - if (!BaseOp->isReg()) - return false; + // Get appropriate operand, and compute width accordingly. + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + if (DataOpIdx == -1) + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); + Width = getOpSize(LdSt, DataOpIdx); return true; } @@ -399,15 +438,13 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, } static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, - const MachineOperand &BaseOp1, + ArrayRef<const MachineOperand *> BaseOps1, const MachineInstr &MI2, - const MachineOperand &BaseOp2) { - // Support only base operands with base registers. - // Note: this could be extended to support FI operands. - if (!BaseOp1.isReg() || !BaseOp2.isReg()) - return false; - - if (BaseOp1.isIdenticalTo(BaseOp2)) + ArrayRef<const MachineOperand *> BaseOps2) { + // Only examine the first "base" operand of each instruction, on the + // assumption that it represents the real base address of the memory access. + // Other operands are typically offsets or indices from this base address. 
+ if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front())) return true; if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) @@ -433,62 +470,31 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, return Base1 == Base2; } -bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, - const MachineOperand &BaseOp2, - unsigned NumLoads) const { - const MachineInstr &FirstLdSt = *BaseOp1.getParent(); - const MachineInstr &SecondLdSt = *BaseOp2.getParent(); - - if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2)) - return false; - - const MachineOperand *FirstDst = nullptr; - const MachineOperand *SecondDst = nullptr; - - if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || - (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || - (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { - const unsigned MaxGlobalLoadCluster = 6; - if (NumLoads > MaxGlobalLoadCluster) - return false; - - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); - if (!FirstDst) - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); - if (!SecondDst) - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); - } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); - } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); - } - - if (!FirstDst || !SecondDst) +bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, + ArrayRef<const MachineOperand *> BaseOps2, + unsigned NumLoads, + unsigned NumBytes) const { + // If current mem ops pair do not have same base pointer, then they cannot be + // clustered. + assert(!BaseOps1.empty() && !BaseOps2.empty()); + const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent(); + const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent(); + if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) return false; - // Try to limit clustering based on the total number of bytes loaded - // rather than the number of instructions. This is done to help reduce - // register pressure. The method used is somewhat inexact, though, - // because it assumes that all loads in the cluster will load the - // same number of bytes as FirstLdSt. - - // The unit of this value is bytes. - // FIXME: This needs finer tuning. - unsigned LoadClusterThreshold = 16; - - const MachineRegisterInfo &MRI = - FirstLdSt.getParent()->getParent()->getRegInfo(); - - const Register Reg = FirstDst->getReg(); - - const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg) - ? MRI.getRegClass(Reg) - : RI.getPhysRegClass(Reg); - - return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; + // Compute max cluster size based on average number bytes clustered till now, + // and decide based on it, if current mem ops pair can be clustered or not. + assert((NumLoads > 0) && (NumBytes > 0) && (NumBytes >= NumLoads) && + "Invalid NumLoads/NumBytes values"); + unsigned MaxNumLoads; + if (NumBytes <= 4 * NumLoads) { + // Loads are dword or smaller (on average). + MaxNumLoads = 5; + } else { + // Loads are bigger than a dword (on average). + MaxNumLoads = 4; + } + return NumLoads <= MaxNumLoads; } // FIXME: This behaves strangely. 
If, for example, you have 32 load + stores, @@ -516,11 +522,10 @@ bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) { + MCRegister SrcReg, bool KillSrc, + const char *Msg = "illegal SGPR to VGPR copy") { MachineFunction *MF = MBB.getParent(); - DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), - "illegal SGPR to VGPR copy", - DL, DS_Error); + DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error); LLVMContext &C = MF->getFunction().getContext(); C.diagnose(IllegalCopy); @@ -534,6 +539,25 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MCRegister SrcReg, bool KillSrc) const { const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); + // FIXME: This is hack to resolve copies between 16 bit and 32 bit + // registers until all patterns are fixed. + if (Fix16BitCopies && + ((RI.getRegSizeInBits(*RC) == 16) ^ + (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) { + MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg; + MCRegister Super = RI.get32BitRegister(RegToFix); + assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix); + RegToFix = Super; + + if (DestReg == SrcReg) { + // Insert empty bundle since ExpandPostRA expects an instruction here. + BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); + return; + } + + RC = RI.getPhysRegClass(DestReg); + } + if (RC == &AMDGPU::VGPR_32RegClass) { assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || AMDGPU::SReg_32RegClass.contains(SrcReg) || @@ -580,6 +604,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } if (RC == &AMDGPU::SReg_64RegClass) { + if (SrcReg == AMDGPU::SCC) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg) + .addImm(1) + .addImm(0); + return; + } + if (DestReg == AMDGPU::VCC) { if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) @@ -606,10 +637,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } if (DestReg == AMDGPU::SCC) { + // Copying 64-bit or 32-bit sources to SCC barely makes sense, + // but SelectionDAG emits such copies for i1 sources. + // TODO: Use S_BITCMP0_B32 instead and only consider the 0th bit. + if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { + SrcReg = RI.getSubReg(SrcReg, AMDGPU::sub0); + } assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addImm(0); + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0); + return; } @@ -660,7 +699,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Registers in the sequence are allocated contiguously so we can just // use register number to pick one of three round-robin temps. 
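A quick illustration of the round-robin temp choice mentioned above; the temp names are hypothetical, the point is only that contiguous destination register numbers map onto three alternating temporaries, so back-to-back copies in a long sequence do not all funnel through one scratch register.

#include <cstdio>

int main() {
  const char *Temps[3] = {"vTmpA", "vTmpB", "vTmpC"}; // stand-ins for scavenged VGPRs
  for (unsigned DestReg = 100; DestReg < 106; ++DestReg)
    std::printf("copy to a%u goes via %s\n", DestReg, Temps[DestReg % 3]);
  return 0;
}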
unsigned RegNo = DestReg % 3; - unsigned Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); if (!Tmp) report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); RS.setRegUsed(Tmp); @@ -685,6 +724,72 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + if (RI.getRegSizeInBits(*RC) == 16) { + assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || + AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || + AMDGPU::SReg_LO16RegClass.contains(SrcReg) || + AMDGPU::AGPR_LO16RegClass.contains(SrcReg)); + + bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg); + bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg); + bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg); + bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg); + bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) || + AMDGPU::SReg_LO16RegClass.contains(DestReg) || + AMDGPU::AGPR_LO16RegClass.contains(DestReg); + bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || + AMDGPU::SReg_LO16RegClass.contains(SrcReg) || + AMDGPU::AGPR_LO16RegClass.contains(SrcReg); + MCRegister NewDestReg = RI.get32BitRegister(DestReg); + MCRegister NewSrcReg = RI.get32BitRegister(SrcReg); + + if (IsSGPRDst) { + if (!IsSGPRSrc) { + reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); + return; + } + + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg) + .addReg(NewSrcReg, getKillRegState(KillSrc)); + return; + } + + if (IsAGPRDst || IsAGPRSrc) { + if (!DstLow || !SrcLow) { + reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, + "Cannot use hi16 subreg with an AGPR!"); + } + + copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc); + return; + } + + if (IsSGPRSrc && !ST.hasSDWAScalar()) { + if (!DstLow || !SrcLow) { + reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, + "Cannot use hi16 subreg on VI!"); + } + + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg) + .addReg(NewSrcReg, getKillRegState(KillSrc)); + return; + } + + auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg) + .addImm(0) // src0_modifiers + .addReg(NewSrcReg) + .addImm(0) // clamp + .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0 + : AMDGPU::SDWA::SdwaSel::WORD_1) + .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) + .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0 + : AMDGPU::SDWA::SdwaSel::WORD_1) + .addReg(NewDestReg, RegState::Implicit | RegState::Undef); + // First implicit operand is $exec. + MIB->tieOperands(0, MIB->getNumOperands() - 1); + return; + } + unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.isSGPRClass(RC)) { @@ -806,7 +911,7 @@ void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, int64_t IdxValue = Idx == 0 ? 
Value : 0; MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, - get(Opcode), RI.getSubReg(DestReg, Idx)); + get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx])); Builder.addImm(IdxValue); } } @@ -818,10 +923,10 @@ SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DstReg, + const DebugLoc &DL, Register DstReg, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, - unsigned FalseReg) const { + Register TrueReg, + Register FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineFunction *MF = MBB.getParent(); const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); @@ -944,10 +1049,10 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, } } -unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, +Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned SrcReg, int Value) const { + Register SrcReg, int Value) const { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) @@ -957,10 +1062,10 @@ unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, return Reg; } -unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, +Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned SrcReg, int Value) const { + Register SrcReg, int Value) const { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) @@ -984,6 +1089,80 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { return AMDGPU::COPY; } +static unsigned getIndirectVGPRWritePseudoOpc(unsigned VecSize) { + if (VecSize <= 32) // 4 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V1; + if (VecSize <= 64) // 8 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V2; + if (VecSize <= 96) // 12 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V3; + if (VecSize <= 128) // 16 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V4; + if (VecSize <= 160) // 20 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V5; + if (VecSize <= 256) // 32 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V8; + if (VecSize <= 512) // 64 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V16; + if (VecSize <= 1024) // 128 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V32; + + llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); +} + +static unsigned getIndirectSGPRWritePseudo32(unsigned VecSize) { + if (VecSize <= 32) // 4 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V1; + if (VecSize <= 64) // 8 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V2; + if (VecSize <= 96) // 12 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V3; + if (VecSize <= 128) // 16 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V4; + if (VecSize <= 160) // 20 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V5; + if (VecSize <= 256) // 32 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V8; + if (VecSize <= 512) // 64 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V16; + if (VecSize <= 1024) // 128 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V32; + + llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); +} + +static unsigned getIndirectSGPRWritePseudo64(unsigned VecSize) { + if (VecSize <= 64) // 8 bytes + return 
AMDGPU::S_INDIRECT_REG_WRITE_B64_V1; + if (VecSize <= 128) // 16 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B64_V2; + if (VecSize <= 256) // 32 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B64_V4; + if (VecSize <= 512) // 64 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B64_V8; + if (VecSize <= 1024) // 128 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B64_V16; + + llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); +} + +const MCInstrDesc &SIInstrInfo::getIndirectRegWritePseudo( + unsigned VecSize, unsigned EltSize, bool IsSGPR) const { + if (IsSGPR) { + switch (EltSize) { + case 32: + return get(getIndirectSGPRWritePseudo32(VecSize)); + case 64: + return get(getIndirectSGPRWritePseudo64(VecSize)); + default: + llvm_unreachable("invalid reg indexing elt size"); + } + } + + assert(EltSize == 32 && "invalid reg indexing elt size"); + return get(getIndirectVGPRWritePseudoOpc(VecSize)); +} + static unsigned getSGPRSpillSaveOpcode(unsigned Size) { switch (Size) { case 4: @@ -996,6 +1175,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_S128_SAVE; case 20: return AMDGPU::SI_SPILL_S160_SAVE; + case 24: + return AMDGPU::SI_SPILL_S192_SAVE; case 32: return AMDGPU::SI_SPILL_S256_SAVE; case 64: @@ -1019,6 +1200,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V128_SAVE; case 20: return AMDGPU::SI_SPILL_V160_SAVE; + case 24: + return AMDGPU::SI_SPILL_V192_SAVE; case 32: return AMDGPU::SI_SPILL_V256_SAVE; case 64: @@ -1049,7 +1232,7 @@ static unsigned getAGPRSpillSaveOpcode(unsigned Size) { void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, + Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { @@ -1058,18 +1241,18 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineFrameInfo &FrameInfo = MF->getFrameInfo(); const DebugLoc &DL = MBB.findDebugLoc(MI); - unsigned Size = FrameInfo.getObjectSize(FrameIndex); - unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); - MachineMemOperand *MMO - = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, - Size, Align); + MachineMemOperand *MMO = MF->getMachineMemOperand( + PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), + FrameInfo.getObjectAlign(FrameIndex)); unsigned SpillSize = TRI->getSpillSize(*RC); if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled"); + assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI && + SrcReg != AMDGPU::EXEC && "exec should not be spilled"); // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling SGPRs. @@ -1079,7 +1262,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // to make sure we are using the correct register class. 
if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); - MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); + MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); } BuildMI(MBB, MI, DL, OpDesc) @@ -1126,6 +1309,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_S128_RESTORE; case 20: return AMDGPU::SI_SPILL_S160_RESTORE; + case 24: + return AMDGPU::SI_SPILL_S192_RESTORE; case 32: return AMDGPU::SI_SPILL_S256_RESTORE; case 64: @@ -1149,6 +1334,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V128_RESTORE; case 20: return AMDGPU::SI_SPILL_V160_RESTORE; + case 24: + return AMDGPU::SI_SPILL_V192_RESTORE; case 32: return AMDGPU::SI_SPILL_V256_RESTORE; case 64: @@ -1179,33 +1366,34 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, + Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); const DebugLoc &DL = MBB.findDebugLoc(MI); - unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); - unsigned Size = FrameInfo.getObjectSize(FrameIndex); unsigned SpillSize = TRI->getSpillSize(*RC); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); MachineMemOperand *MMO = MF->getMachineMemOperand( - PtrInfo, MachineMemOperand::MOLoad, Size, Align); + PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex), + FrameInfo.getObjectAlign(FrameIndex)); if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into"); + assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI && + DestReg != AMDGPU::EXEC && "exec should not be spilled"); // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); - if (Register::isVirtualRegister(DestReg) && SpillSize == 4) { + if (DestReg.isVirtual() && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); - MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); + MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); } if (RI.spillSGPRToVGPR()) @@ -1244,7 +1432,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); unsigned WavefrontSize = ST.getWavefrontSize(); - unsigned TIDReg = MFI->getTIDReg(); + Register TIDReg = MFI->getTIDReg(); if (!MFI->hasCalculatedTID()) { MachineBasicBlock &Entry = MBB.getParent()->front(); MachineBasicBlock::iterator Insert = Entry.front(); @@ -1272,8 +1460,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( RS->enterBasicBlock(Entry); // FIXME: Can we scavenge an SReg_64 and access the subregs? 
- unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + Register STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + Register STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) .addReg(InputPtrReg) .addImm(SI::KernelInputOffsets::NGROUPS_Z); @@ -1482,30 +1670,55 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } - case AMDGPU::V_MOVRELD_B32_V1: - case AMDGPU::V_MOVRELD_B32_V2: - case AMDGPU::V_MOVRELD_B32_V4: - case AMDGPU::V_MOVRELD_B32_V8: - case AMDGPU::V_MOVRELD_B32_V16: { - const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32); + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V1: + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V2: + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V3: + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V4: + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V5: + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V8: + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V16: + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V32: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V1: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V2: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V3: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V4: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V5: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V8: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V16: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V32: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V1: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V2: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V4: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V8: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V16: { + const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); + + unsigned Opc; + if (RI.hasVGPRs(EltRC)) { + Opc = ST.useVGPRIndexMode() ? + AMDGPU::V_MOV_B32_indirect : AMDGPU::V_MOVRELD_B32_e32; + } else { + Opc = RI.getRegSizeInBits(*EltRC) == 64 ? + AMDGPU::S_MOVRELD_B64 : AMDGPU::S_MOVRELD_B32; + } + + const MCInstrDesc &OpDesc = get(Opc); Register VecReg = MI.getOperand(0).getReg(); bool IsUndef = MI.getOperand(1).isUndef(); - unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm(); + unsigned SubReg = MI.getOperand(3).getImm(); assert(VecReg == MI.getOperand(1).getReg()); - MachineInstr *MovRel = - BuildMI(MBB, MI, DL, MovRelDesc) - .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) - .add(MI.getOperand(2)) - .addReg(VecReg, RegState::ImplicitDefine) - .addReg(VecReg, - RegState::Implicit | (IsUndef ? RegState::Undef : 0)); + MachineInstrBuilder MIB = + BuildMI(MBB, MI, DL, OpDesc) + .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) + .add(MI.getOperand(2)) + .addReg(VecReg, RegState::ImplicitDefine) + .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); const int ImpDefIdx = - MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses(); + OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); const int ImpUseIdx = ImpDefIdx + 1; - MovRel->tieOperands(ImpDefIdx, ImpUseIdx); - + MIB->tieOperands(ImpDefIdx, ImpUseIdx); MI.eraseFromParent(); break; } @@ -1549,22 +1762,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } - case TargetOpcode::BUNDLE: { - if (!MI.mayLoad() || MI.hasUnmodeledSideEffects()) - return false; - - // If it is a load it must be a memory clause - for (MachineBasicBlock::instr_iterator I = MI.getIterator(); - I->isBundledWithSucc(); ++I) { - I->unbundleFromSucc(); - for (MachineOperand &MO : I->operands()) - if (MO.isReg()) - MO.setIsInternalRead(false); - } - - MI.eraseFromParent(); - break; - } } return true; } @@ -1662,9 +1859,15 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, RegOp.ChangeToImmediate(NonRegOp.getImm()); else if (NonRegOp.isFI()) RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); - else + else if (NonRegOp.isGlobal()) { + RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(), + NonRegOp.getTargetFlags()); + } else return nullptr; + // Make sure we don't reinterpret a subreg index in the target flags. + RegOp.setTargetFlags(NonRegOp.getTargetFlags()); + NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); NonRegOp.setSubReg(SubReg); @@ -2085,6 +2288,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, // Copy the flags onto the implicit condition register operand. preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); + fixImplicitOperands(*CondBr); if (BytesAdded) *BytesAdded = 4; @@ -2125,8 +2329,8 @@ bool SIInstrInfo::reverseBranchCondition( bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg, - int &CondCycles, + Register DstReg, Register TrueReg, + Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { switch (Cond[0].getImm()) { case VCCNZ: @@ -2165,8 +2369,8 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned DstReg, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg) const { + Register DstReg, ArrayRef<MachineOperand> Cond, + Register TrueReg, Register FalseReg) const { BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); if (Pred == VCCZ || Pred == SCC_FALSE) { Pred = static_cast<BranchPredicate>(-Pred); @@ -2178,14 +2382,17 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, unsigned DstSize = RI.getRegSizeInBits(*DstRC); if (DstSize == 32) { - unsigned SelOp = Pred == SCC_TRUE ? - AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; - - // Instruction's operands are backwards from what is expected. - MachineInstr *Select = - BuildMI(MBB, I, DL, get(SelOp), DstReg) - .addReg(FalseReg) - .addReg(TrueReg); + MachineInstr *Select; + if (Pred == SCC_TRUE) { + Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg) + .addReg(TrueReg) + .addReg(FalseReg); + } else { + // Instruction's operands are backwards from what is expected. 
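The "backwards" remark above comes down to the two select forms taking their sources in opposite orders. A tiny emulation of both conventions, using the ISA semantics d = scc ? s0 : s1 for s_cselect_b32 and d = vcc ? src1 : src0 for v_cndmask_b32.

#include <cassert>
#include <cstdint>
#include <cstdio>

static uint32_t s_cselect(bool SCC, uint32_t S0, uint32_t S1) { return SCC ? S0 : S1; }
static uint32_t v_cndmask(bool VCC, uint32_t Src0, uint32_t Src1) { return VCC ? Src1 : Src0; }

int main() {
  uint32_t TrueVal = 1, FalseVal = 2;
  // Same logical select, but the VALU form must receive the false value first.
  assert(s_cselect(true, TrueVal, FalseVal) == v_cndmask(true, FalseVal, TrueVal));
  assert(s_cselect(false, TrueVal, FalseVal) == v_cndmask(false, FalseVal, TrueVal));
  std::printf("operand orders agree once the VALU sources are swapped\n");
  return 0;
}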
+ Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg) + .addReg(FalseReg) + .addReg(TrueReg); + } preserveCondRegFlags(Select->getOperand(3), Cond[1]); return; @@ -2194,8 +2401,8 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, if (DstSize == 64 && Pred == SCC_TRUE) { MachineInstr *Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) - .addReg(FalseReg) - .addReg(TrueReg); + .addReg(TrueReg) + .addReg(FalseReg); preserveCondRegFlags(Select->getOperand(3), Cond[1]); return; @@ -2239,17 +2446,26 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, I = MIB->getIterator(); - SmallVector<unsigned, 8> Regs; + SmallVector<Register, 8> Regs; for (int Idx = 0; Idx != NElts; ++Idx) { Register DstElt = MRI.createVirtualRegister(EltRC); Regs.push_back(DstElt); unsigned SubIdx = SubIndices[Idx]; - MachineInstr *Select = - BuildMI(MBB, I, DL, get(SelOp), DstElt) - .addReg(FalseReg, 0, SubIdx) - .addReg(TrueReg, 0, SubIdx); + MachineInstr *Select; + if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { + Select = + BuildMI(MBB, I, DL, get(SelOp), DstElt) + .addReg(FalseReg, 0, SubIdx) + .addReg(TrueReg, 0, SubIdx); + } else { + Select = + BuildMI(MBB, I, DL, get(SelOp), DstElt) + .addReg(TrueReg, 0, SubIdx) + .addReg(FalseReg, 0, SubIdx); + } + preserveCondRegFlags(Select->getOperand(3), Cond[1]); fixImplicitOperands(*Select); @@ -2313,7 +2529,7 @@ static void removeModOperands(MachineInstr &MI) { } bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, - unsigned Reg, MachineRegisterInfo *MRI) const { + Register Reg, MachineRegisterInfo *MRI) const { if (!MRI->hasOneNonDBGUse(Reg)) return false; @@ -2339,15 +2555,40 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Opc = UseMI.getOpcode(); if (Opc == AMDGPU::COPY) { - bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); + Register DstReg = UseMI.getOperand(0).getReg(); + bool Is16Bit = getOpSize(UseMI, 0) == 2; + bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); unsigned NewOpc = isVGPRCopy ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; - if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) { - if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32)) + APInt Imm(32, ImmOp->getImm()); + + if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) + Imm = Imm.ashr(16); + + if (RI.isAGPR(*MRI, DstReg)) { + if (!isInlineConstant(Imm)) return false; NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32; } + + if (Is16Bit) { + if (isVGPRCopy) + return false; // Do not clobber vgpr_hi16 + + if (DstReg.isVirtual() && + UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) + return false; + + UseMI.getOperand(0).setSubReg(0); + if (DstReg.isPhysical()) { + DstReg = RI.get32BitRegister(DstReg); + UseMI.getOperand(0).setReg(DstReg); + } + assert(UseMI.getOperand(1).getReg().isVirtual()); + } + UseMI.setDesc(get(NewOpc)); - UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); + UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); + UseMI.getOperand(1).setTargetFlags(0); UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); return true; } @@ -2517,6 +2758,18 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, return false; } +static bool +memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1, + ArrayRef<const MachineOperand *> BaseOps2) { + if (BaseOps1.size() != BaseOps2.size()) + return false; + for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) { + if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I])) + return false; + } + return true; +} + static bool offsetsDoNotOverlap(int WidthA, int OffsetA, int WidthB, int OffsetB) { int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; @@ -2527,26 +2780,26 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA, bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, const MachineInstr &MIb) const { - const MachineOperand *BaseOp0, *BaseOp1; + SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1; int64_t Offset0, Offset1; + unsigned Dummy0, Dummy1; + bool Offset0IsScalable, Offset1IsScalable; + if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable, + Dummy0, &RI) || + !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable, + Dummy1, &RI)) + return false; - if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) && - getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) { - if (!BaseOp0->isIdenticalTo(*BaseOp1)) - return false; + if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1)) + return false; - if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { - // FIXME: Handle ds_read2 / ds_write2. - return false; - } - unsigned Width0 = (*MIa.memoperands_begin())->getSize(); - unsigned Width1 = (*MIb.memoperands_begin())->getSize(); - if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { - return true; - } + if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { + // FIXME: Handle ds_read2 / ds_write2. 
+ return false; } - - return false; + unsigned Width0 = MIa.memoperands().front()->getSize(); + unsigned Width1 = MIb.memoperands().front()->getSize(); + return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1); } bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, @@ -2586,7 +2839,7 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, if (isSMRD(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa); + return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); } if (isFLAT(MIa)) { @@ -2732,16 +2985,30 @@ static bool changesVGPRIndexingMode(const MachineInstr &MI) { bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const { - // XXX - Do we want the SP check in the base implementation? + // Skipping the check for SP writes in the base implementation. The reason it + // was added was apparently due to compile time concerns. + // + // TODO: Do we really want this barrier? It triggers unnecessary hazard nops + // but is probably avoidable. + + // Copied from base implementation. + // Terminators and labels can't be scheduled around. + if (MI.isTerminator() || MI.isPosition()) + return true; + + // INLINEASM_BR can jump to another block + if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) + return true; // Target-independent instructions do not have an implicit-use of EXEC, even // when they operate on VGPRs. Treating EXEC modifications as scheduling // boundaries prevents incorrect movements of such instructions. - return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || - MI.modifiesRegister(AMDGPU::EXEC, &RI) || + + // TODO: Don't treat setreg with known constant that only changes MODE as + // barrier. + return MI.modifiesRegister(AMDGPU::EXEC, &RI) || MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || MI.getOpcode() == AMDGPU::S_SETREG_B32 || - MI.getOpcode() == AMDGPU::S_DENORM_MODE || changesVGPRIndexingMode(MI); } @@ -2755,6 +3022,20 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { Opcode == AMDGPU::DS_GWS_BARRIER; } +bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { + // Skip the full operand and register alias search modifiesRegister + // does. There's only a handful of instructions that touch this, it's only an + // implicit def, and doesn't alias any other registers. + if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) { + for (; ImpDef && *ImpDef; ++ImpDef) { + if (*ImpDef == AMDGPU::MODE) + return true; + } + } + + return false; +} + bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); @@ -2780,6 +3061,10 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const if (MI.isCall() || MI.isInlineAsm()) return true; // conservative assumption + // A mode change is a scalar operation that influences vector instructions. + if (modifiesModeRegister(MI)) + return true; + // These are like SALU instructions in terms of effects, so it's questionable // whether we should return true for those. 
// @@ -2866,10 +3151,26 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, return AMDGPU::isInlinableLiteral64(MO.getImm(), ST.hasInv2PiInlineImm()); case AMDGPU::OPERAND_REG_IMM_INT16: - case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + // We would expect inline immediates to not be concerned with an integer/fp + // distinction. However, in the case of 16-bit integer operations, the + // "floating point" values appear to not work. It seems read the low 16-bits + // of 32-bit immediates, which happens to always work for the integer + // values. + // + // See llvm bugzilla 46302. + // + // TODO: Theoretically we could use op-sel to use the high bits of the + // 32-bit FP values. + return AMDGPU::isInlinableIntLiteral(Imm); + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + // This suffers the same problem as the scalar 16-bit cases. + return AMDGPU::isInlinableIntLiteralV216(Imm); + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { // A few special case instructions have 16-bit operands on subtargets @@ -2883,11 +3184,8 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, return false; } - case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { uint32_t Trunc = static_cast<uint32_t>(Imm); return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); @@ -3056,7 +3354,8 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig) { for (MachineOperand &Use : MI.implicit_operands()) { - if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { + if (Use.isUse() && + (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) { Use.setIsUndef(Orig.isUndef()); Use.setIsKill(Orig.isKill()); return; @@ -3068,7 +3367,8 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, unsigned Op32) const { MachineBasicBlock *MBB = MI.getParent();; MachineInstrBuilder Inst32 = - BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)); + BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) + .setMIFlags(MI.getFlags()); // Add the dst operand if the 32-bit encoding also has an explicit $vdst. // For VOPC instructions, this is replaced by an implicit def of vcc. @@ -3138,7 +3438,7 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, } } -static unsigned findImplicitSGPRRead(const MachineInstr &MI) { +static Register findImplicitSGPRRead(const MachineInstr &MI) { for (const MachineOperand &MO : MI.implicit_operands()) { // We only care about reads. if (MO.isDef()) @@ -3239,6 +3539,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return true; } + if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { + ErrInfo = "missing memory operand from MIMG instruction."; + return false; + } + // Make sure the register classes are correct. 
for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { if (MI.getOperand(i).isFPImm()) { @@ -3446,8 +3751,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) ++ConstantBusCount; - SmallVector<unsigned, 2> SGPRsUsed; - unsigned SGPRUsed = findImplicitSGPRRead(MI); + SmallVector<Register, 2> SGPRsUsed; + Register SGPRUsed = findImplicitSGPRRead(MI); if (SGPRUsed != AMDGPU::NoRegister) { ++ConstantBusCount; SGPRsUsed.push_back(SGPRUsed); @@ -3482,7 +3787,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } if (isVOP3(MI) && LiteralCount) { - if (LiteralCount && !ST.hasVOP3Literal()) { + if (!ST.hasVOP3Literal()) { ErrInfo = "VOP3 instruction uses literal"; return false; } @@ -3665,11 +3970,34 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } + bool IsA16 = false; + if (ST.hasR128A16()) { + const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); + IsA16 = R128A16->getImm() != 0; + } else if (ST.hasGFX10A16()) { + const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); + IsA16 = A16->getImm() != 0; + } + + bool PackDerivatives = IsA16 || BaseOpcode->G16; bool IsNSA = SRsrcIdx - VAddr0Idx > 1; - unsigned AddrWords = BaseOpcode->NumExtraArgs + - (BaseOpcode->Gradients ? Dim->NumGradients : 0) + - (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + - (BaseOpcode->LodOrClampOrMip ? 1 : 0); + + unsigned AddrWords = BaseOpcode->NumExtraArgs; + unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + + (BaseOpcode->LodOrClampOrMip ? 1 : 0); + if (IsA16) + AddrWords += (AddrComponents + 1) / 2; + else + AddrWords += AddrComponents; + + if (BaseOpcode->Gradients) { + if (PackDerivatives) + // There are two gradients per coordinate, we pack them separately. + // For the 3d case, we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv) + AddrWords += (Dim->NumGradients / 2 + 1) / 2 * 2; + else + AddrWords += Dim->NumGradients; + } unsigned VAddrWords; if (IsNSA) { @@ -3681,14 +4009,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, AddrWords = 16; else if (AddrWords > 4) AddrWords = 8; - else if (AddrWords == 3 && VAddrWords == 4) { - // CodeGen uses the V4 variant of instructions for three addresses, - // because the selection DAG does not support non-power-of-two types. + else if (AddrWords == 4) AddrWords = 4; - } + else if (AddrWords == 3) + AddrWords = 3; } if (VAddrWords != AddrWords) { + LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords + << " but got " << VAddrWords << "\n"); ErrInfo = "bad vaddr size"; return false; } @@ -4217,7 +4546,7 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, } } -unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, +Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const { const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); @@ -5002,6 +5331,76 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); Inst.eraseFromParent(); continue; + + // TODO: remove as soon as everything is ready + // to replace VGPR to SGPR copy with V_READFIRSTLANEs. + // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO + // can only be selected from the uniform SDNode. 
+ case AMDGPU::S_ADD_CO_PSEUDO: + case AMDGPU::S_SUB_CO_PSEUDO: { + unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) + ? AMDGPU::V_ADDC_U32_e64 + : AMDGPU::V_SUBB_U32_e64; + const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + + Register CarryInReg = Inst.getOperand(4).getReg(); + if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { + Register NewCarryReg = MRI.createVirtualRegister(CarryRC); + BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) + .addReg(CarryInReg); + } + + Register CarryOutReg = Inst.getOperand(1).getReg(); + + Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( + MRI.getRegClass(Inst.getOperand(0).getReg()))); + MachineInstr *CarryOp = + BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) + .addReg(CarryOutReg, RegState::Define) + .add(Inst.getOperand(2)) + .add(Inst.getOperand(3)) + .addReg(CarryInReg) + .addImm(0); + legalizeOperands(*CarryOp); + MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); + Inst.eraseFromParent(); + } + continue; + case AMDGPU::S_UADDO_PSEUDO: + case AMDGPU::S_USUBO_PSEUDO: { + const DebugLoc &DL = Inst.getDebugLoc(); + MachineOperand &Dest0 = Inst.getOperand(0); + MachineOperand &Dest1 = Inst.getOperand(1); + MachineOperand &Src0 = Inst.getOperand(2); + MachineOperand &Src1 = Inst.getOperand(3); + + unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) + ? AMDGPU::V_ADD_I32_e64 + : AMDGPU::V_SUB_I32_e64; + const TargetRegisterClass *NewRC = + RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); + Register DestReg = MRI.createVirtualRegister(NewRC); + MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) + .addReg(Dest1.getReg(), RegState::Define) + .add(Src0) + .add(Src1) + .addImm(0); // clamp bit + + legalizeOperands(*NewInstr, MDT); + + MRI.replaceRegWith(Dest0.getReg(), DestReg); + addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, + Worklist); + Inst.eraseFromParent(); + } + continue; + + case AMDGPU::S_CSELECT_B32: + case AMDGPU::S_CSELECT_B64: + lowerSelect(Worklist, Inst, MDT); + Inst.eraseFromParent(); + continue; } if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { @@ -5142,6 +5541,78 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, return false; } +void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT) const { + + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst.getDebugLoc(); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + MachineOperand &Cond = Inst.getOperand(3); + + Register SCCSource = Cond.getReg(); + // Find SCC def, and if that is a copy (SCC = COPY reg) then use reg instead. + if (!Cond.isUndef()) { + for (MachineInstr &CandI : + make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)), + Inst.getParent()->rend())) { + if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != + -1) { + if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) { + SCCSource = CandI.getOperand(1).getReg(); + } + break; + } + } + } + + // If this is a trivial select where the condition is effectively not SCC + // (SCCSource is a source of copy to SCC), then the select is semantically + // equivalent to copying SCCSource. 
Hence, there is no need to create + // V_CNDMASK, we can just use that and bail out. + if ((SCCSource != AMDGPU::SCC) && Src0.isImm() && (Src0.getImm() == -1) && + Src1.isImm() && (Src1.getImm() == 0)) { + MRI.replaceRegWith(Dest.getReg(), SCCSource); + return; + } + + const TargetRegisterClass *TC = ST.getWavefrontSize() == 64 + ? &AMDGPU::SReg_64_XEXECRegClass + : &AMDGPU::SReg_32_XM0_XEXECRegClass; + Register CopySCC = MRI.createVirtualRegister(TC); + + if (SCCSource == AMDGPU::SCC) { + // Insert a trivial select instead of creating a copy, because a copy from + // SCC would semantically mean just copying a single bit, but we may need + // the result to be a vector condition mask that needs preserving. + unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64 + : AMDGPU::S_CSELECT_B32; + auto NewSelect = + BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0); + NewSelect->getOperand(3).setIsUndef(Cond.isUndef()); + } else { + BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC).addReg(SCCSource); + } + + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + auto UpdatedInst = + BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg) + .addImm(0) + .add(Src1) // False + .addImm(0) + .add(Src0) // True + .addReg(CopySCC); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + legalizeOperands(*UpdatedInst, MDT); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -5623,7 +6094,7 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, } void SIInstrInfo::addUsersToMoveToVALUWorklist( - unsigned DstReg, + Register DstReg, MachineRegisterInfo &MRI, SetVectorType &Worklist) const { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), @@ -5723,20 +6194,60 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, MachineInstr &SCCDefInst, SetVectorType &Worklist) const { + bool SCCUsedImplicitly = false; + // Ensure that def inst defines SCC, which is still live. assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && !Op.isDead() && Op.getParent() == &SCCDefInst); + SmallVector<MachineInstr *, 4> CopyToDelete; // This assumes that all the users of SCC are in the same block // as the SCC def. for (MachineInstr &MI : // Skip the def inst itself. make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), SCCDefInst.getParent()->end())) { // Check if SCC is used first. - if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) - Worklist.insert(&MI); + if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) { + if (MI.isCopy()) { + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + unsigned DestReg = MI.getOperand(0).getReg(); + + for (auto &User : MRI.use_nodbg_instructions(DestReg)) { + if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) || + (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) { + User.getOperand(4).setReg(RI.getVCC()); + Worklist.insert(&User); + } else if (User.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) { + User.getOperand(5).setReg(RI.getVCC()); + // No need to add to Worklist. + } + } + CopyToDelete.push_back(&MI); + } else { + if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 || + MI.getOpcode() == AMDGPU::S_CSELECT_B64) { + // This is an implicit use of SCC and it is really expected by + // the SCC users to handle. 
+ // We cannot preserve the edge to the user so add the explicit + // copy: SCC = COPY VCC. + // The copy will be cleaned up during the processing of the user + // in lowerSelect. + SCCUsedImplicitly = true; + } + + Worklist.insert(&MI); + } + } // Exit if we find another SCC def. if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) - return; + break; + } + for (auto &Copy : CopyToDelete) + Copy->eraseFromParent(); + + if (SCCUsedImplicitly) { + BuildMI(*SCCDefInst.getParent(), std::next(SCCDefInst.getIterator()), + SCCDefInst.getDebugLoc(), get(AMDGPU::COPY), AMDGPU::SCC) + .addReg(RI.getVCC()); } } @@ -5789,7 +6300,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( } // Find the one SGPR operand we are allowed to use. -unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, +Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const { const MCInstrDesc &Desc = MI.getDesc(); @@ -5802,11 +6313,11 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, // // If the operand's class is an SGPR, we can never move it. - unsigned SGPRReg = findImplicitSGPRRead(MI); + Register SGPRReg = findImplicitSGPRRead(MI); if (SGPRReg != AMDGPU::NoRegister) return SGPRReg; - unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; + Register UsedSGPRs[3] = { AMDGPU::NoRegister }; const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); for (unsigned i = 0; i < 3; ++i) { @@ -5919,10 +6430,9 @@ bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { return isSMRD(Opc); } -bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const { - unsigned Opc = MI.getOpcode(); - - return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); +bool SIInstrInfo::isHighLatencyDef(int Opc) const { + return get(Opc).mayLoad() && + (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc)); } unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, @@ -6198,7 +6708,7 @@ MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned DestReg) const { + Register DestReg) const { if (ST.hasAddNoCarry()) return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); @@ -6608,20 +7118,24 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl( // %0 may even spill. We can't spill $m0 normally (it would require copying to // a numbered SGPR anyway), and since it is in the SReg_32 register class, // TargetInstrInfo::foldMemoryOperand() is going to try. + // A similar issue also exists with spilling and reloading $exec registers. // // To prevent that, constrain the %0 register class here. if (MI.isFullCopy()) { Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); - - if (DstReg == AMDGPU::M0 && SrcReg.isVirtual()) { - MF.getRegInfo().constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); - return nullptr; - } - - if (SrcReg == AMDGPU::M0 && DstReg.isVirtual()) { - MF.getRegInfo().constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass); - return nullptr; + if ((DstReg.isVirtual() || SrcReg.isVirtual()) && + (DstReg.isVirtual() != SrcReg.isVirtual())) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register VirtReg = DstReg.isVirtual() ? 
DstReg : SrcReg; + const TargetRegisterClass *RC = MRI.getRegClass(VirtReg); + if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) { + MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); + return nullptr; + } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) { + MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass); + return nullptr; + } } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index b151a94b0d118..53e2ffba0f656 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -84,6 +84,9 @@ private: bool moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT = nullptr) const; + void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT = nullptr) const; + void lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const; @@ -119,7 +122,7 @@ private: MachineRegisterInfo &MRI, MachineInstr &Inst) const; - void addUsersToMoveToVALUWorklist(unsigned Reg, MachineRegisterInfo &MRI, + void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI, SetVectorType &Worklist) const; void addSCCDefUsersToVALUWorklist(MachineOperand &Op, @@ -132,7 +135,7 @@ private: bool checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, const MachineInstr &MIb) const; - unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const; + Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const; protected: bool swapSourceModifiers(MachineInstr &MI, @@ -181,14 +184,15 @@ public: int64_t &Offset1, int64_t &Offset2) const override; - bool getMemOperandWithOffset(const MachineInstr &LdSt, - const MachineOperand *&BaseOp, - int64_t &Offset, - const TargetRegisterInfo *TRI) const final; + bool getMemOperandsWithOffsetWidth( + const MachineInstr &LdSt, + SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset, + bool &OffsetIsScalable, unsigned &Width, + const TargetRegisterInfo *TRI) const final; - bool shouldClusterMemOps(const MachineOperand &BaseOp1, - const MachineOperand &BaseOp2, - unsigned NumLoads) const override; + bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, + ArrayRef<const MachineOperand *> BaseOps2, + unsigned NumLoads, unsigned NumBytes) const override; bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override; @@ -210,22 +214,22 @@ public: const TargetRegisterClass *getPreferredSelectRegClass( unsigned Size) const; - unsigned insertNE(MachineBasicBlock *MBB, + Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned SrcReg, int Value) const; + Register SrcReg, int Value) const; - unsigned insertEQ(MachineBasicBlock *MBB, + Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned SrcReg, int Value) const; + Register SrcReg, int Value) const; void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, unsigned SrcReg, + MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, unsigned DestReg, + MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; @@ -244,6 +248,9 @@ public: // DstRC, then AMDGPU::COPY is returned. 
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; + const MCInstrDesc &getIndirectRegWritePseudo( + unsigned VecSize, unsigned EltSize, bool IsSGPR) const; + LLVM_READONLY int commuteOpcode(unsigned Opc) const; @@ -293,20 +300,19 @@ public: SmallVectorImpl<MachineOperand> &Cond) const override; bool canInsertSelect(const MachineBasicBlock &MBB, - ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg, - int &CondCycles, + ArrayRef<MachineOperand> Cond, Register DstReg, + Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override; void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned DstReg, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg) const override; + Register DstReg, ArrayRef<MachineOperand> Cond, + Register TrueReg, Register FalseReg) const override; void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned DstReg, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg) const; + Register DstReg, ArrayRef<MachineOperand> Cond, + Register TrueReg, Register FalseReg) const; unsigned getAddressSpaceForPseudoSourceKind( unsigned Kind) const override; @@ -317,7 +323,7 @@ public: bool isFoldableCopy(const MachineInstr &MI) const; - bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg, + bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final; unsigned getMachineCSELookAheadLimit() const override { return 500; } @@ -685,6 +691,9 @@ public: return MO.isReg() && RI.isVGPR(MRI, MO.getReg());}); } + /// Return true if the instruction modifies the mode register.q + static bool modifiesModeRegister(const MachineInstr &MI); + /// Whether we must prevent this instruction from executing with EXEC = 0. bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const; @@ -824,11 +833,7 @@ public: const MachineOperand &MO = MI.getOperand(OpNo); if (MO.isReg()) { if (unsigned SubReg = MO.getSubReg()) { - assert(RI.getRegSizeInBits(*RI.getSubClassWithSubReg( - MI.getParent()->getParent()->getRegInfo(). - getRegClass(MO.getReg()), SubReg)) >= 32 && - "Sub-dword subregs are not supported"); - return RI.getSubRegIndexLaneMask(SubReg).getNumLanes() * 4; + return RI.getSubRegIdxSize(SubReg) / 8; } } return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8; @@ -874,7 +879,7 @@ public: /// be used when it is know that the value in SrcReg is same across all /// threads in the wave. /// \returns The SGPR register that \p SrcReg was copied to. - unsigned readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, + Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const; void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const; @@ -928,7 +933,7 @@ public: uint64_t getScratchRsrcWords23() const; bool isLowLatencyInstruction(const MachineInstr &MI) const; - bool isHighLatencyInstruction(const MachineInstr &MI) const; + bool isHighLatencyDef(int Opc) const override; /// Return the descriptor of the target-specific machine instruction /// that corresponds to the specified pseudo or native opcode. 
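The header hunks above replace the single-base-operand getMemOperandWithOffset hook with getMemOperandsWithOffsetWidth, which reports every base operand plus the access width, and shouldClusterMemOps now receives the base-operand lists and a byte count. A minimal sketch of how a caller could consume the new hook, mirroring checkInstOffsetsDoNotOverlap in the SIInstrInfo.cpp hunk earlier in this diff; the function and variable names below are illustrative only and are not part of the patch.

    #include "SIInstrInfo.h"
    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    // Sketch only: returns true when two memory instructions provably do not
    // overlap, assuming TII/TRI and the two MachineInstrs are in scope.
    static bool accessesAreDisjoint(const SIInstrInfo &TII,
                                    const TargetRegisterInfo *TRI,
                                    const MachineInstr &MIa,
                                    const MachineInstr &MIb) {
      SmallVector<const MachineOperand *, 4> BaseOpsA, BaseOpsB;
      int64_t OffsetA, OffsetB;
      bool ScalableA, ScalableB;
      unsigned WidthA, WidthB;
      // Each call reports all base operands, the offset, and the width in bytes.
      if (!TII.getMemOperandsWithOffsetWidth(MIa, BaseOpsA, OffsetA, ScalableA,
                                             WidthA, TRI) ||
          !TII.getMemOperandsWithOffsetWidth(MIb, BaseOpsB, OffsetB, ScalableB,
                                             WidthB, TRI))
        return false;

      // The bases must be pairwise identical before offsets can be compared.
      if (BaseOpsA.size() != BaseOpsB.size())
        return false;
      for (size_t I = 0, E = BaseOpsA.size(); I != E; ++I)
        if (!BaseOpsA[I]->isIdenticalTo(*BaseOpsB[I]))
          return false;

      // Disjoint [Offset, Offset + Width) byte ranges imply no aliasing.
      int64_t LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int64_t HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      unsigned LowWidth = OffsetA < OffsetB ? WidthA : WidthB;
      return LowOffset + (int64_t)LowWidth <= HighOffset;
    }

Unlike the old single-operand interface, this shape lets instructions with paired base operands (e.g. ds_read2/ds_write2) report both bases instead of bailing out, which is what motivates the SmallVector-based signature.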
@@ -995,7 +1000,7 @@ public: MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned DestReg) const; + Register DestReg) const; MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 85e8d0582dcd1..7aee52f913605 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -7,11 +7,9 @@ //===----------------------------------------------------------------------===// def isWave32 : Predicate<"Subtarget->getWavefrontSize() == 32">, - AssemblerPredicate <"FeatureWavefrontSize32">; + AssemblerPredicate <(all_of FeatureWavefrontSize32)>; def isWave64 : Predicate<"Subtarget->getWavefrontSize() == 64">, - AssemblerPredicate <"FeatureWavefrontSize64">; - -def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; + AssemblerPredicate <(all_of FeatureWavefrontSize64)>; class GCNPredicateControl : PredicateControl { Predicate SIAssemblerPredicate = isGFX6GFX7; @@ -30,6 +28,7 @@ def SIEncodingFamily { int GFX9 = 5; int GFX10 = 6; int SDWA10 = 7; + int GFX10_B = 8; } //===----------------------------------------------------------------------===// @@ -39,8 +38,7 @@ def SIEncodingFamily { def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD", - SDTypeProfile<1, 4, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>, - SDTCisVT<4, i1>]>, + SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>, [SDNPMayLoad, SDNPMemOperand] >; @@ -57,6 +55,10 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; +def SIatomic_csub : SDNode<"AMDGPUISD::ATOMIC_LOAD_CSUB", SDTAtomic2, + [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] +>; + def SDTAtomic2_f32 : SDTypeProfile<1, 2, [ SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1> ]>; @@ -200,6 +202,7 @@ def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">; def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">; def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">; def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; +def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">; def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>; def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>; @@ -267,7 +270,7 @@ def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8", def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE", SDTypeProfile<0 ,1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue] + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue] >; //===----------------------------------------------------------------------===// @@ -308,6 +311,10 @@ class isPackedType<ValueType SrcVT> { // PatFrags for global memory operations //===----------------------------------------------------------------------===// +let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_global").AddrSpaces in { +defm atomic_csub_global : binary_atomic_op<SIatomic_csub>; +} + foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in { @@ -631,6 +638,16 @@ def add_ctpop : PatFrag < (add (ctpop $src0), $src1) >; +foreach 
I = 1-4 in { +def shl#I#_add : PatFrag < + (ops node:$src0, node:$src1), + (add (shl_oneuse $src0, (i32 I)), $src1)> { + // FIXME: Poor substitute for disabling pattern in SelectionDAG + let PredicateCode = [{return false;}]; + let GISelPredicateCode = [{return true;}]; +} +} + multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, SDTypeProfile tc = SDTAtomic2, bit IsInt = 1> { @@ -651,6 +668,7 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; +defm atomic_load_csub : SIAtomicM0Glue2 <"LOAD_CSUB", 1>; defm atomic_inc : SIAtomicM0Glue2 <"INC", 1>; defm atomic_dec : SIAtomicM0Glue2 <"DEC", 1>; defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; @@ -665,7 +683,7 @@ defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>; defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>; defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>; -def as_i1imm : SDNodeXForm<imm, [{ +def as_i1timm : SDNodeXForm<timm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); }]>; @@ -673,6 +691,10 @@ def as_i8imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i8); }]>; +def as_i8timm : SDNodeXForm<timm, [{ + return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); +}]>; + def as_i16imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); }]>; @@ -766,7 +788,7 @@ def NegSubInlineConst32 : ImmLeaf<i32, [{ return Imm < -16 && Imm >= -64; }], NegateImm>; -def NegSubInlineConst16 : ImmLeaf<i16, [{ +def NegSubInlineIntConst16 : ImmLeaf<i16, [{ return Imm < -16 && Imm >= -64; }], NegateImm>; @@ -791,6 +813,26 @@ def NegSubInlineConstV216 : PatLeaf<(build_vector), [{ }], getNegV2I16Imm>; //===----------------------------------------------------------------------===// +// MUBUF/SMEM Patterns +//===----------------------------------------------------------------------===// + +def extract_glc : SDNodeXForm<timm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8); +}]>; + +def extract_slc : SDNodeXForm<timm, [{ + return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8); +}]>; + +def extract_dlc : SDNodeXForm<timm, [{ + return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8); +}]>; + +def extract_swz : SDNodeXForm<timm, [{ + return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8); +}]>; + +//===----------------------------------------------------------------------===// // Custom Operands //===----------------------------------------------------------------------===// @@ -935,7 +977,7 @@ def VOPDstS64orS32 : BoolRC { } // SCSrc_i1 is the operand for pseudo instructions only. -// Boolean immeadiates shall not be exposed to codegen instructions. +// Boolean immediates shall not be exposed to codegen instructions. 
def SCSrc_i1 : RegisterOperand<SReg_1_XEXEC> { let OperandNamespace = "AMDGPU"; let OperandType = "OPERAND_REG_IMM_INT32"; @@ -1067,6 +1109,7 @@ def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>; def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>; def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>; +def GFX10A16 : NamedOperandBit<"GFX10A16", NamedMatchClass<"GFX10A16">>; def D16 : NamedOperandBit<"D16", NamedMatchClass<"D16">>; def LWE : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; @@ -1099,9 +1142,9 @@ def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>; def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>; def abid : NamedOperandU32<"ABID", NamedMatchClass<"ABID">>; -def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>; +def hwreg : NamedOperandU32<"Hwreg", NamedMatchClass<"Hwreg", 0>>; -def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> { +def exp_tgt : NamedOperandU32<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> { } @@ -1274,19 +1317,14 @@ def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; def VOP3NoMods : ComplexPattern<untyped, 1, "SelectVOP3NoMods">; // VOP3Mods, but the input source is known to never be NaN. def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">; -// VOP3Mods, but only allowed for f32 operands. -def VOP3Mods_f32 : ComplexPattern<fAny, 2, "SelectVOP3Mods_f32">; def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">; def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">; -def VOP3PMods0 : ComplexPattern<untyped, 3, "SelectVOP3PMods0">; def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">; -def VOP3OpSel0 : ComplexPattern<untyped, 3, "SelectVOP3OpSel0">; def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">; -def VOP3OpSelMods0 : ComplexPattern<untyped, 3, "SelectVOP3OpSelMods0">; def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">; @@ -1347,6 +1385,7 @@ def HWREG { int FLAT_SCR_HI = 21; int XNACK_MASK = 22; int POPS_PACKER = 25; + int SHADER_CYCLES = 29; } class getHwRegImm<int Reg, int Offset = 0, int Size = 32> { @@ -1380,24 +1419,21 @@ class SIMCInstr <string pseudo, int subtarget> { // EXP classes //===----------------------------------------------------------------------===// -class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon< +class EXP_Helper<bit done> : EXPCommon< (outs), (ins exp_tgt:$tgt, ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3, - exp_vm:$vm, exp_compr:$compr, i8imm:$en), - "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm", - [(node (i8 timm:$tgt), (i8 timm:$en), - f32:$src0, f32:$src1, f32:$src2, f32:$src3, - (i1 timm:$compr), (i1 timm:$vm))]> { + exp_vm:$vm, exp_compr:$compr, i32imm:$en), + "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm", []> { let AsmMatchConverter = "cvtExp"; } // Split EXP instruction into EXP and EXP_DONE so we can set // mayLoad for done=1. 
-multiclass EXP_m<bit done, SDPatternOperator node> { +multiclass EXP_m<bit done> { let mayLoad = done, DisableWQM = 1 in { let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : EXP_Helper<done, node>, + def "" : EXP_Helper<done>, SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>; } @@ -1685,7 +1721,7 @@ class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC, !if (HasClamp, (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, - clampmod:$clamp, + clampmod0:$clamp, op_sel:$op_sel, op_sel_hi:$op_sel_hi, neg_lo:$neg_lo, neg_hi:$neg_hi), (ins Src0Mod:$src0_modifiers, Src0RC:$src0, @@ -1697,7 +1733,7 @@ class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC, (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, Src2Mod:$src2_modifiers, Src2RC:$src2, - clampmod:$clamp, + clampmod0:$clamp, op_sel:$op_sel, op_sel_hi:$op_sel_hi, neg_lo:$neg_lo, neg_hi:$neg_hi), (ins Src0Mod:$src0_modifiers, Src0RC:$src0, @@ -1720,7 +1756,7 @@ class getInsVOP3OpSel <RegisterOperand Src0RC, !if (HasClamp, (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, - clampmod:$clamp, + clampmod0:$clamp, op_sel:$op_sel), (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, @@ -1730,7 +1766,7 @@ class getInsVOP3OpSel <RegisterOperand Src0RC, (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, Src2Mod:$src2_modifiers, Src2RC:$src2, - clampmod:$clamp, + clampmod0:$clamp, op_sel:$op_sel), (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, @@ -2242,6 +2278,7 @@ def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>; +def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], 0, /*EnableClamp=*/1>; def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>; def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; @@ -2455,7 +2492,8 @@ def getMCOpcodeGen : InstrMapping { [!cast<string>(SIEncodingFamily.GFX80)], [!cast<string>(SIEncodingFamily.GFX9)], [!cast<string>(SIEncodingFamily.GFX10)], - [!cast<string>(SIEncodingFamily.SDWA10)]]; + [!cast<string>(SIEncodingFamily.SDWA10)], + [!cast<string>(SIEncodingFamily.GFX10_B)]]; } // Get equivalent SOPK instruction. diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index d84720f820ee3..0c4c9e0e9df2b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1,4 +1,4 @@ -//===-- SIInstructions.td - SI Instruction Defintions ---------------------===// +//===-- SIInstructions.td - SI Instruction Definitions --------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -24,8 +24,38 @@ include "BUFInstructions.td" // EXP Instructions //===----------------------------------------------------------------------===// -defm EXP : EXP_m<0, AMDGPUexport>; -defm EXP_DONE : EXP_m<1, AMDGPUexport_done>; +defm EXP : EXP_m<0>; +defm EXP_DONE : EXP_m<1>; + +class ExpPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat< + (int_amdgcn_exp timm:$tgt, timm:$en, + (vt ExpSrc0:$src0), (vt ExpSrc1:$src1), + (vt ExpSrc2:$src2), (vt ExpSrc3:$src3), + done_val, timm:$vm), + (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1, + ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en) +>; + +class ExpComprPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat< + (int_amdgcn_exp_compr timm:$tgt, timm:$en, + (vt ExpSrc0:$src0), (vt ExpSrc1:$src1), + done_val, timm:$vm), + (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1, + (IMPLICIT_DEF), (IMPLICIT_DEF), timm:$vm, 1, timm:$en) +>; + +// FIXME: The generated DAG matcher seems to have strange behavior +// with a 1-bit literal to match, so use a -1 for checking a true +// 1-bit value. +def : ExpPattern<i32, EXP, 0>; +def : ExpPattern<i32, EXP_DONE, -1>; +def : ExpPattern<f32, EXP, 0>; +def : ExpPattern<f32, EXP_DONE, -1>; + +def : ExpComprPattern<v2i16, EXP, 0>; +def : ExpComprPattern<v2i16, EXP_DONE, -1>; +def : ExpComprPattern<v2f16, EXP, 0>; +def : ExpComprPattern<v2f16, EXP_DONE, -1>; //===----------------------------------------------------------------------===// // VINTRP Instructions @@ -34,9 +64,9 @@ defm EXP_DONE : EXP_m<1, AMDGPUexport_done>; // Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI) def VINTRPDst : VINTRPDstOperand <VGPR_32>; -let Uses = [M0, EXEC] in { +let Uses = [MODE, M0, EXEC] in { -// FIXME: Specify SchedRW for VINTRP insturctions. +// FIXME: Specify SchedRW for VINTRP instructions. 
multiclass V_INTERP_P1_F32_m : VINTRP_m < 0x00000000, @@ -76,10 +106,10 @@ defm V_INTERP_MOV_F32 : VINTRP_m < (outs VINTRPDst:$vdst), (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan), "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan", - [(set f32:$vdst, (int_amdgcn_interp_mov (i32 imm:$vsrc), + [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc), (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; -} // End Uses = [M0, EXEC] +} // End Uses = [MODE, M0, EXEC] //===----------------------------------------------------------------------===// // Pseudo Instructions @@ -136,7 +166,8 @@ def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { - let Defs = [EXEC]; + let Uses = [EXEC]; + let Defs = [EXEC, SCC]; let hasSideEffects = 0; let mayLoad = 0; let mayStore = 0; @@ -162,16 +193,27 @@ def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), let Constraints = "$src = $vdst"; } +let usesCustomInserter = 1, Defs = [VCC, EXEC] in { +def V_ADD_U64_PSEUDO : VPseudoInstSI < + (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), + [(set VReg_64:$vdst, (getDivergentFrag<add>.ret i64:$src0, i64:$src1))] +>; + +def V_SUB_U64_PSEUDO : VPseudoInstSI < + (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), + [(set VReg_64:$vdst, (getDivergentFrag<sub>.ret i64:$src0, i64:$src1))] +>; +} // End usesCustomInserter = 1, Defs = [VCC, EXEC] let usesCustomInserter = 1, Defs = [SCC] in { def S_ADD_U64_PSEUDO : SPseudoInstSI < - (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1), - [(set SReg_64:$vdst, (add i64:$src0, i64:$src1))] + (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1), + [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))] >; def S_SUB_U64_PSEUDO : SPseudoInstSI < - (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1), - [(set SReg_64:$vdst, (sub i64:$src0, i64:$src1))] + (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1), + [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))] >; def S_ADD_U64_CO_PSEUDO : SPseudoInstSI < @@ -181,6 +223,23 @@ def S_ADD_U64_CO_PSEUDO : SPseudoInstSI < def S_SUB_U64_CO_PSEUDO : SPseudoInstSI < (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) >; + +def S_ADD_CO_PSEUDO : SPseudoInstSI < + (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in) +>; + +def S_SUB_CO_PSEUDO : SPseudoInstSI < + (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in) +>; + +def S_UADDO_PSEUDO : SPseudoInstSI < + (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1) +>; + +def S_USUBO_PSEUDO : SPseudoInstSI < + (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1) +>; + } // End usesCustomInserter = 1, Defs = [SCC] let usesCustomInserter = 1 in { @@ -199,6 +258,7 @@ class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI< let hasSideEffects = base_inst.hasSideEffects; let UseNamedOperandTable = base_inst.UseNamedOperandTable; let CodeSize = base_inst.CodeSize; + let SchedRW = base_inst.SchedRW; } let WaveSizePredicate = isWave64 in { @@ -214,13 +274,14 @@ def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>; def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>; } + def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), [(int_amdgcn_wave_barrier)]> { let SchedRW = []; let hasNoSchedulingInfo = 1; let 
hasSideEffects = 1; - let mayLoad = 1; - let mayStore = 1; + let mayLoad = 0; + let mayStore = 0; let isConvergent = 1; let FixedSize = 1; let Size = 0; @@ -318,6 +379,9 @@ multiclass PseudoInstKill <dag ins> { defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>; defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>; +let Defs = [EXEC] in +def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>; + let Defs = [EXEC,VCC] in def SI_ILLEGAL_COPY : SPseudoInstSI < (outs unknown:$dst), (ins unknown:$src), @@ -386,7 +450,7 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI < def : GCNPat < (int_amdgcn_init_exec timm:$src), - (SI_INIT_EXEC_LO (as_i32imm imm:$src))> { + (SI_INIT_EXEC_LO (as_i32timm timm:$src))> { let WaveSizePredicate = isWave32; } @@ -413,8 +477,8 @@ def SI_RETURN : SPseudoInstSI < // Return for returning function calls without output register. // -// This version is only needed so we can fill in the output regiter in -// the custom inserter. +// This version is only needed so we can fill in the output register +// in the custom inserter. def SI_CALL_ISEL : SPseudoInstSI < (outs), (ins SSrc_b64:$src0, unknown:$callee), [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> { @@ -426,6 +490,11 @@ def SI_CALL_ISEL : SPseudoInstSI < let isConvergent = 1; } +def : GCNPat< + (AMDGPUcall i64:$src0, (i64 0)), + (SI_CALL_ISEL $src0, (i64 0)) +>; + // Wrapper around s_swappc_b64 with extra $callee parameter to track // the called function after regalloc. def SI_CALL : SPseudoInstSI < @@ -480,6 +549,8 @@ def ADJCALLSTACKDOWN : SPseudoInstSI< let Defs = [M0, EXEC, SCC], UseNamedOperandTable = 1 in { +// SI_INDIRECT_SRC/DST are only used by legacy SelectionDAG indirect +// addressing implementation. class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI < (outs VGPR_32:$vdst), (ins rc:$src, VS_32:$idx, i32imm:$offset)> { @@ -493,21 +564,81 @@ class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI < let usesCustomInserter = 1; } -// TODO: We can support indirect SGPR access. def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>; def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>; def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>; def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>; def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>; +def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>; def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>; def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>; def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; +def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>; } // End Uses = [EXEC], Defs = [M0, EXEC] + +// This is a pseudo variant of the v_movreld_b32 (or v_mov_b32 +// expecting to be executed with gpr indexing mode enabled) +// instruction in which the vector operand appears only twice, once as +// def and once as use. Using this pseudo avoids problems with the Two +// Address instructions pass. 
+class INDIRECT_REG_WRITE_pseudo<RegisterClass rc, + RegisterOperand val_ty> : PseudoInstSI < + (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> { + let Constraints = "$vsrc = $vdst"; + let Uses = [M0]; +} + +class V_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> : + INDIRECT_REG_WRITE_pseudo<rc, VSrc_b32> { + let VALU = 1; + let VOP1 = 1; + let Uses = [M0, EXEC]; +} + +class S_INDIRECT_REG_WRITE_pseudo<RegisterClass rc, + RegisterOperand val_ty> : + INDIRECT_REG_WRITE_pseudo<rc, val_ty> { + let SALU = 1; + let SOP1 = 1; + let Uses = [M0]; +} + +class S_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> : + S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b32>; +class S_INDIRECT_REG_WRITE_B64_pseudo<RegisterClass rc> : + S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b64>; + + +def V_INDIRECT_REG_WRITE_B32_V1 : V_INDIRECT_REG_WRITE_B32_pseudo<VGPR_32>; +def V_INDIRECT_REG_WRITE_B32_V2 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_64>; +def V_INDIRECT_REG_WRITE_B32_V3 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_96>; +def V_INDIRECT_REG_WRITE_B32_V4 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_128>; +def V_INDIRECT_REG_WRITE_B32_V5 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_160>; +def V_INDIRECT_REG_WRITE_B32_V8 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_256>; +def V_INDIRECT_REG_WRITE_B32_V16 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_512>; +def V_INDIRECT_REG_WRITE_B32_V32 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_1024>; + +def S_INDIRECT_REG_WRITE_B32_V1 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_32>; +def S_INDIRECT_REG_WRITE_B32_V2 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_64>; +def S_INDIRECT_REG_WRITE_B32_V3 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_96>; +def S_INDIRECT_REG_WRITE_B32_V4 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_128>; +def S_INDIRECT_REG_WRITE_B32_V5 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_160>; +def S_INDIRECT_REG_WRITE_B32_V8 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_256>; +def S_INDIRECT_REG_WRITE_B32_V16 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_512>; +def S_INDIRECT_REG_WRITE_B32_V32 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_1024>; + +def S_INDIRECT_REG_WRITE_B64_V1 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_64>; +def S_INDIRECT_REG_WRITE_B64_V2 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_128>; +def S_INDIRECT_REG_WRITE_B64_V4 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_256>; +def S_INDIRECT_REG_WRITE_B64_V8 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_512>; +def S_INDIRECT_REG_WRITE_B64_V16 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_1024>; + + multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in { def _SAVE : PseudoInstSI < @@ -535,6 +666,7 @@ defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>; defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>; defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>; +defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>; defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>; @@ -574,6 +706,7 @@ defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>; defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>; +defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>; defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>; @@ -639,12 +772,6 @@ def : GCNPat< >; def : Pat < - // -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0) - (AMDGPUkill (i32 -1082130432)), - (SI_KILL_I1_PSEUDO (i1 0), 0) ->; 
- -def : Pat < (int_amdgcn_kill i1:$src), (SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0) >; @@ -655,11 +782,6 @@ def : Pat < >; def : Pat < - (AMDGPUkill i32:$src), - (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, 0, 3) // 3 means SETOGE ->; - -def : Pat < (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))), (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond)) >; @@ -693,14 +815,14 @@ def : RsqPat<V_RSQ_F64_e32, f64>; def : GCNPat < (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), - (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) + (V_FRACT_F32_e64 $mods, $x) >; // Convert (x + (-floor(x))) to fract(x) def : GCNPat < (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) + (V_FRACT_F64_e64 $mods, $x) >; } // End OtherPredicates = [UnsafeFPMath] @@ -709,27 +831,27 @@ def : GCNPat < // f16_to_fp patterns def : GCNPat < (f32 (f16_to_fp i32:$src0)), - (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) + (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0) >; def : GCNPat < (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))), - (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) + (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0) >; def : GCNPat < (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))), - (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)), DSTCLAMP.NONE, DSTOMOD.NONE) + (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0))) >; def : GCNPat < (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))), - (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) + (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0) >; def : GCNPat < (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))), - (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) + (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0) >; def : GCNPat < @@ -740,7 +862,7 @@ def : GCNPat < // fp_to_fp16 patterns def : GCNPat < (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), - (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, DSTCLAMP.NONE, DSTOMOD.NONE) + (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0) >; def : GCNPat < @@ -767,20 +889,29 @@ def : GCNPat < // VOP2 Patterns //===----------------------------------------------------------------------===// -multiclass FMADPat <ValueType vt, Instruction inst> { - def : GCNPat < - (vt (fmad (VOP3NoMods vt:$src0), - (VOP3NoMods vt:$src1), - (VOP3NoMods vt:$src2))), +// TODO: Check only no src2 mods? +class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node> + : GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)), + (vt (VOP3NoMods vt:$src1)), + (vt (VOP3NoMods vt:$src2)))), (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) - >; +>; + + +// Prefer mac form when there are no modifiers. 
+let AddedComplexity = 9 in { +def : FMADPat <f32, V_MAC_F32_e64, fmad>; +def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>; + +let SubtargetPredicate = Has16BitInsts in { +def : FMADPat <f16, V_MAC_F16_e64, fmad>; +def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>; } -defm : FMADPat <f16, V_MAC_F16_e64>; -defm : FMADPat <f32, V_MAC_F32_e64>; +} -class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty> +class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr> : GCNPat< (Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)), (Ty (VOP3Mods Ty:$src1, i32:$src1_mod)), @@ -789,24 +920,28 @@ class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty> $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) >; -// FIXME: This should select to V_MAC_F32 -def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>; -def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> { +let SubtargetPredicate = HasMadMacF32Insts in +def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>; +def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> { let SubtargetPredicate = Has16BitInsts; } -multiclass SelectPat <ValueType vt> { - def : GCNPat < - (vt (select i1:$src0, (VOP3Mods_f32 vt:$src1, i32:$src1_mods), - (VOP3Mods_f32 vt:$src2, i32:$src2_mods))), - (V_CNDMASK_B32_e64 $src2_mods, $src2, $src1_mods, $src1, $src0) - >; -} +class VOPSelectModsPat <ValueType vt> : GCNPat < + (vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods), + (VOP3Mods vt:$src2, i32:$src2_mods))), + (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2, + FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0) +>; + +class VOPSelectPat <ValueType vt> : GCNPat < + (vt (select i1:$src0, vt:$src1, vt:$src2)), + (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0) +>; -defm : SelectPat <i16>; -defm : SelectPat <i32>; -defm : SelectPat <f16>; -defm : SelectPat <f32>; +def : VOPSelectModsPat <i32>; +def : VOPSelectModsPat <f32>; +def : VOPSelectPat <f16>; +def : VOPSelectPat <i16>; let AddedComplexity = 1 in { def : GCNPat < @@ -1039,6 +1174,8 @@ def : BitConvert <v4f32, v2f64, VReg_128>; def : BitConvert <v4i32, v2f64, VReg_128>; def : BitConvert <v2i64, v2f64, VReg_128>; def : BitConvert <v2f64, v2i64, VReg_128>; +def : BitConvert <v4f32, v2i64, VReg_128>; +def : BitConvert <v2i64, v4f32, VReg_128>; // 160-bit bitcast def : BitConvert <v5i32, v5f32, SGPR_160>; @@ -1049,14 +1186,46 @@ def : BitConvert <v8i32, v8f32, SReg_256>; def : BitConvert <v8f32, v8i32, SReg_256>; def : BitConvert <v8i32, v8f32, VReg_256>; def : BitConvert <v8f32, v8i32, VReg_256>; +def : BitConvert <v4i64, v4f64, VReg_256>; +def : BitConvert <v4f64, v4i64, VReg_256>; +def : BitConvert <v4i64, v8i32, VReg_256>; +def : BitConvert <v4i64, v8f32, VReg_256>; +def : BitConvert <v4f64, v8i32, VReg_256>; +def : BitConvert <v4f64, v8f32, VReg_256>; +def : BitConvert <v8i32, v4i64, VReg_256>; +def : BitConvert <v8f32, v4i64, VReg_256>; +def : BitConvert <v8i32, v4f64, VReg_256>; +def : BitConvert <v8f32, v4f64, VReg_256>; + // 512-bit bitcast def : BitConvert <v16i32, v16f32, VReg_512>; def : BitConvert <v16f32, v16i32, VReg_512>; +def : BitConvert <v8i64, v8f64, VReg_512>; +def : BitConvert <v8f64, v8i64, VReg_512>; +def : BitConvert <v8i64, v16i32, VReg_512>; +def : BitConvert <v8f64, v16i32, VReg_512>; +def : BitConvert <v16i32, v8i64, VReg_512>; +def : BitConvert <v16i32, v8f64, VReg_512>; +def : BitConvert <v8i64, v16f32, VReg_512>; +def : BitConvert <v8f64, v16f32, VReg_512>; +def : BitConvert <v16f32, v8i64, VReg_512>; +def : 
BitConvert <v16f32, v8f64, VReg_512>; // 1024-bit bitcast def : BitConvert <v32i32, v32f32, VReg_1024>; def : BitConvert <v32f32, v32i32, VReg_1024>; +def : BitConvert <v16i64, v16f64, VReg_1024>; +def : BitConvert <v16f64, v16i64, VReg_1024>; +def : BitConvert <v16i64, v32i32, VReg_1024>; +def : BitConvert <v32i32, v16i64, VReg_1024>; +def : BitConvert <v16f64, v32f32, VReg_1024>; +def : BitConvert <v32f32, v16f64, VReg_1024>; +def : BitConvert <v16i64, v32f32, VReg_1024>; +def : BitConvert <v32i32, v16f64, VReg_1024>; +def : BitConvert <v16f64, v32i32, VReg_1024>; +def : BitConvert <v32f32, v16i64, VReg_1024>; + /********** =================== **********/ /********** Src & Dst modifiers **********/ @@ -1155,7 +1324,7 @@ def : GCNPat < (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; -// FIXME: The implicit-def of scc from S_[X]OR_B32 is mishandled +// FIXME: The implicit-def of scc from S_[X]OR/AND_B32 is mishandled // def : GCNPat < // (fneg (f64 SReg_64:$src)), // (REG_SEQUENCE SReg_64, @@ -1176,6 +1345,17 @@ def : GCNPat < // sub1) // >; +// FIXME: Use S_BITSET0_B32/B64? +// def : GCNPat < +// (fabs (f64 SReg_64:$src)), +// (REG_SEQUENCE SReg_64, +// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), +// sub0, +// (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), +// (i32 (S_MOV_B32 (i32 0x7fffffff)))), +// sub1) +// >; + } // End let AddedComplexity = 1 def : GCNPat < @@ -1372,11 +1552,12 @@ class Ext32Pat <SDNode ext> : GCNPat < def : Ext32Pat <zext>; def : Ext32Pat <anyext>; -// The multiplication scales from [0,1] to the unsigned integer range +// The multiplication scales from [0,1) to the unsigned integer range, +// rounding down a bit to avoid unwanted overflow. def : GCNPat < (AMDGPUurecip i32:$src0), (V_CVT_U32_F32_e32 - (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1), + (V_MUL_F32_e32 (i32 CONST.FP_4294966784), (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) >; @@ -1421,11 +1602,13 @@ defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; +defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">; defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">; defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; +defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">; //===----------------------------------------------------------------------===// // SAD Patterns @@ -1695,102 +1878,187 @@ def : GCNPat < def : GCNPat < (i32 (bswap i32:$a)), (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)), - (V_ALIGNBIT_B32 $a, $a, (i32 24)), - (V_ALIGNBIT_B32 $a, $a, (i32 8))) + (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 24)), + (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 8))) >; -let OtherPredicates = [NoFP16Denormals] in { -def : GCNPat< - (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), - (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0) +// FIXME: This should have been narrowed to i32 during legalization. 
+// This pattern should also be skipped for GlobalISel +def : GCNPat < + (i64 (bswap i64:$a)), + (REG_SEQUENCE VReg_64, + (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)), + (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), + (i32 24)), + (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), + (i32 8))), + sub0, + (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)), + (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), + (i32 24)), + (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), + (i32 8))), + sub1) +>; + +// FIXME: The AddedComplexity should not be needed, but in GlobalISel +// the BFI pattern ends up taking precedence without it. +let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in { +// Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24) +// +// My reading of the manual suggests we should be using src0 for the +// register value, but this is what seems to work. +def : GCNPat < + (i32 (bswap i32:$a)), + (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203))) >; -def : GCNPat< - (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), - (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src, 0, 0) +// FIXME: This should have been narrowed to i32 during legalization. +// This pattern should also be skipped for GlobalISel +def : GCNPat < + (i64 (bswap i64:$a)), + (REG_SEQUENCE VReg_64, + (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1), + (S_MOV_B32 (i32 0x00010203))), + sub0, + (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0), + (S_MOV_B32 (i32 0x00010203))), + sub1) >; -def : GCNPat< - (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), - (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) +// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24) +// The 12s emit 0s. +def : GCNPat < + (i16 (bswap i16:$a)), + (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) >; -} -let OtherPredicates = [FP16Denormals] in { -def : GCNPat< - (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), - (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0) +def : GCNPat < + (i32 (zext (bswap i16:$a))), + (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) >; -let SubtargetPredicate = HasVOP3PInsts in { -def : GCNPat< - (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), - (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE) +// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24) +def : GCNPat < + (v2i16 (bswap v2i16:$a)), + (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001))) >; + } -} -let OtherPredicates = [NoFP32Denormals] in { + +// Prefer selecting to max when legal, but using mul is always valid. 
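Before the fcanonicalize patterns that follow, a brief aside on the V_PERM_B32 selectors used by the bswap patterns just above. The C++ sketch below (illustrative only; perm_model is a made-up name) models only the selector values these patterns rely on, following the comments in the patch: selector bytes 0 through 3 pick that byte of the register operand, and selector 12 produces 0x00; the ISA's other selector encodings are not modeled.

    #include <cstdint>

    uint32_t perm_model(uint32_t a, uint32_t sel) {
      uint32_t d = 0;
      for (unsigned i = 0; i < 4; ++i) {
        uint8_t s = (sel >> (8 * i)) & 0xff;           // selector for result byte i
        uint8_t b = (s <= 3) ? uint8_t(a >> (8 * s))   // pick byte s of the input
                             : 0;                      // 12 emits 0x00
        d |= uint32_t(b) << (8 * i);
      }
      return d;
    }

With this model, perm_model(x, 0x00010203) reverses all four bytes (the i32 bswap), perm_model(x, 0x0c0c0001) swaps the low two bytes and zero-fills the upper half (the i16 bswap plus zext), and perm_model(x, 0x02030001) swaps the bytes within each 16-bit half (the v2i16 bswap), which matches the three magic numbers in the patterns above.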
+let AddedComplexity = -5 in { def : GCNPat< - (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), - (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0) + (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), + (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) >; def : GCNPat< - (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))), - (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src, 0, 0) + (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), + (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) +>; + +def : GCNPat< + (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), + (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) >; -} -let OtherPredicates = [FP32Denormals] in { def : GCNPat< (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), - (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src, 0, 0) + (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src) >; -} -let OtherPredicates = [NoFP64Denormals] in { def : GCNPat< - (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), - (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0) + (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))), + (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src) >; -} -let OtherPredicates = [FP64Denormals] in { +// TODO: Handle fneg like other types. def : GCNPat< (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), - (V_MAX_F64 $src_mods, $src, $src_mods, $src, 0, 0) + (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src) >; +} // End AddedComplexity = -5 + +multiclass SelectCanonicalizeAsMax< + list<Predicate> f32_preds = [], + list<Predicate> f64_preds = [], + list<Predicate> f16_preds = []> { + def : GCNPat< + (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), + (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> { + let OtherPredicates = f32_preds; + } + + def : GCNPat< + (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), + (V_MAX_F64 $src_mods, $src, $src_mods, $src)> { + let OtherPredicates = f64_preds; + } + + def : GCNPat< + (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), + (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> { + // FIXME: Should have 16-bit inst subtarget predicate + let OtherPredicates = f16_preds; + } + + def : GCNPat< + (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), + (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> { + // FIXME: Should have VOP3P subtarget predicate + let OtherPredicates = f16_preds; + } } +// On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal +// mode, and would never flush. For f64, it's faster to do implement +// this with a max. For f16/f32 it's a wash, but prefer max when +// valid. +// +// FIXME: Lowering f32/f16 with max is worse since we can use a +// smaller encoding if the input is fneg'd. It also adds an extra +// register use. +let SubtargetPredicate = HasMinMaxDenormModes in { + defm : SelectCanonicalizeAsMax<[], [], []>; +} // End SubtargetPredicate = HasMinMaxDenormModes + +let SubtargetPredicate = NotHasMinMaxDenormModes in { + // Use the max lowering if we don't need to flush. + + // FIXME: We don't do use this for f32 as a workaround for the + // library being compiled with the default ieee mode, but + // potentially being called from flushing kernels. Really we should + // not be mixing code expecting different default FP modes, but mul + // works in any FP environment. 
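The defm on the next line applies exactly the choice described in this comment block. As a rough illustration of the overall policy, here is a sketch (not code from the patch; the predicate plumbing is collapsed into booleans and the names are illustrative):

    enum class CanonLowering { MaxSelf, MulByOne };

    // canonicalize(x) can always be lowered as x * 1.0, which follows whatever
    // FP mode is active; max(x, x) is preferred only where it is known to give
    // the same result.
    CanonLowering pickCanonicalize(bool hasMinMaxDenormModes,
                                   bool denormalsEnabled, bool isF32) {
      if (hasMinMaxDenormModes)          // gfx9+: max/min respect the denormal mode
        return CanonLowering::MaxSelf;
      if (isF32)                         // the FalsePredicate workaround noted above
        return CanonLowering::MulByOne;
      return denormalsEnabled ? CanonLowering::MaxSelf    // nothing to flush
                              : CanonLowering::MulByOne;  // must flush, so multiply
    }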
+ defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>; +} // End SubtargetPredicate = NotHasMinMaxDenormModes + + let OtherPredicates = [HasDLInsts] in { def : GCNPat < - (fma (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (fma (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)), (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)), (f32 (VOP3NoMods f32:$src2))), (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, - SRCMODS.NONE, $src2, $clamp, $omod) + SRCMODS.NONE, $src2) >; } // End OtherPredicates = [HasDLInsts] let SubtargetPredicate = isGFX10Plus in def : GCNPat < - (fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (fma (f16 (VOP3Mods f32:$src0, i32:$src0_modifiers)), (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)), (f16 (VOP3NoMods f32:$src2))), (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, - SRCMODS.NONE, $src2, $clamp, $omod) ->; - -// Allow integer inputs -class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat< - (node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)), - (Inst i8:$tgt, vt:$src0, vt:$src1, vt:$src2, vt:$src3, i1:$vm, i1:$compr, i8:$en) + SRCMODS.NONE, $src2) >; -def : ExpPattern<AMDGPUexport, i32, EXP>; -def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>; - // COPY is workaround tablegen bug from multiple outputs // from S_LSHL_B32's multiple outputs from implicit scc def. def : GCNPat < @@ -1873,19 +2141,20 @@ def : GCNPat < >; def : GCNPat < - (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, - timm:$bound_ctrl)), - (V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl), - (as_i32imm $row_mask), (as_i32imm $bank_mask), - (as_i1imm $bound_ctrl)) + (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, + timm:$bank_mask, timm:$bound_ctrl)), + (V_MOV_B64_DPP_PSEUDO VReg_64:$src, VReg_64:$src, + (as_i32timm $dpp_ctrl), (as_i32timm $row_mask), + (as_i32timm $bank_mask), + (as_i1timm $bound_ctrl)) >; def : GCNPat < (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, timm:$bound_ctrl)), - (V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl), - (as_i32imm $row_mask), (as_i32imm $bank_mask), - (as_i1imm $bound_ctrl)) + (V_MOV_B64_DPP_PSEUDO VReg_64:$old, VReg_64:$src, (as_i32timm $dpp_ctrl), + (as_i32timm $row_mask), (as_i32timm $bank_mask), + (as_i1timm $bound_ctrl)) >; //===----------------------------------------------------------------------===// @@ -1901,6 +2170,11 @@ let SubtargetPredicate = isGFX6 in { // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) // Convert floor(x) to (x - fract(x)) + +// Don't bother handling this for GlobalISel, it's handled during +// lowering. +// +// FIXME: DAG should also custom lower this. 
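The GCNPat that follows expands ffloor this way. As a scalar model of the same arithmetic (illustrative only; v_fract_f64 and gfx6_floor are stand-in names, with v_fract_f64 representing the hardware V_FRACT_F64 result):

    #include <algorithm>
    #include <cmath>

    double v_fract_f64(double x);  // placeholder for the hardware instruction

    double gfx6_floor(double x) {
      // 0x3fefffffffffffff is the largest double below 1.0; the clamp guards
      // against the hardware fract result rounding up to exactly 1.0.
      double f = std::isnan(x)
                     ? x                                   // V_CNDMASK on V_CMP_CLASS (NaN)
                     : std::min(v_fract_f64(x), 0x1.fffffffffffffp-1);
      return x - f;                                        // V_ADD_F64 with the fract negated
    }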
def : GCNPat < (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), (V_ADD_F64 @@ -1910,13 +2184,11 @@ def : GCNPat < (V_CNDMASK_B64_PSEUDO (V_MIN_F64 SRCMODS.NONE, - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), + (V_FRACT_F64_e64 $mods, $x), SRCMODS.NONE, - (V_MOV_B64_PSEUDO 0x3fefffffffffffff), - DSTCLAMP.NONE, DSTOMOD.NONE), + (V_MOV_B64_PSEUDO 0x3fefffffffffffff)), $x, - (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))), - DSTCLAMP.NONE, DSTOMOD.NONE) + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/)))) >; } // End SubtargetPredicates = isGFX6 @@ -2061,13 +2333,164 @@ def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction { let hasSideEffects = 0; } +def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src); + let hasSideEffects = 0; +} + +class BufferLoadGenericInstruction : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset, + type2:$soffset, untyped_imm_0:$offset, + untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); + let hasSideEffects = 0; + let mayLoad = 1; +} + +class TBufferLoadGenericInstruction : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset, + type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$format, + untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); + let hasSideEffects = 0; + let mayLoad = 1; +} + +def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction; +def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction; +def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction; +def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction; +def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction; +def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction; +def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction; +def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction; +def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction; + +class BufferStoreGenericInstruction : AMDGPUGenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, + type2:$soffset, untyped_imm_0:$offset, + untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); + let hasSideEffects = 0; + let mayStore = 1; +} + +class TBufferStoreGenericInstruction : AMDGPUGenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, + type2:$soffset, untyped_imm_0:$offset, + untyped_imm_0:$format, + untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); + let hasSideEffects = 0; + let mayStore = 1; +} + +def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction; +def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction; +def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction; +def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction; +def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction; +def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction; +def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction; + +def G_AMDGPU_FMIN_LEGACY : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1); + let hasSideEffects = 0; +} + +def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let 
InOperandList = (ins type0:$src0, type0:$src1); + let hasSideEffects = 0; +} + +foreach N = 0-3 in { +def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0); + let hasSideEffects = 0; +} +} + // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector // operand Expects a MachineMemOperand in addition to explicit // operands. def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$oldval); - let InOperandList = (ins ptype1:$addr, type0:$cmpval_nnenwval); + let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval); + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 1; +} + +let Namespace = "AMDGPU" in { +def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP; +def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP; +} + +class BufferAtomicGenericInstruction : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, + type2:$soffset, untyped_imm_0:$offset, + untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); let hasSideEffects = 0; let mayLoad = 1; let mayStore = 1; } + +def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction; + +def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex, + type2:$voffset, type2:$soffset, untyped_imm_0:$offset, + untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 1; +} + +// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as +// a workaround for the intrinsic being defined as readnone, but +// really needs a memory operand. +def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy); + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 0; +} + +// This is equivalent to the G_INTRINSIC*, but the operands may have +// been legalized depending on the subtarget requirements. +def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins unknown:$intrin, variable_ops); + let hasSideEffects = 0; + let mayLoad = 1; + + // FIXME: Use separate opcode for atomics. + let mayStore = 1; +} + +// This is equivalent to the G_INTRINSIC*, but the operands may have +// been legalized depending on the subtarget requirements. 
+def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins unknown:$intrin, variable_ops); + let hasSideEffects = 0; + let mayStore = 1; +} diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index d2b1abc8a9fb8..2eb1c52f1b595 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -103,15 +103,19 @@ enum InstClassEnum { TBUFFER_STORE, }; -enum RegisterEnum { - SBASE = 0x1, - SRSRC = 0x2, - SOFFSET = 0x4, - VADDR = 0x8, - ADDR = 0x10, - SSAMP = 0x20, +struct AddressRegs { + unsigned char NumVAddrs = 0; + bool SBase = false; + bool SRsrc = false; + bool SOffset = false; + bool VAddr = false; + bool Addr = false; + bool SSamp = false; }; +// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. +const unsigned MaxAddressRegs = 12 + 1 + 1; + class SILoadStoreOptimizer : public MachineFunctionPass { struct CombineInfo { MachineBasicBlock::iterator I; @@ -126,10 +130,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass { bool SLC; bool DLC; bool UseST64; - SmallVector<MachineInstr *, 8> InstsToMove; - int AddrIdx[5]; - const MachineOperand *AddrReg[5]; + int AddrIdx[MaxAddressRegs]; + const MachineOperand *AddrReg[MaxAddressRegs]; unsigned NumAddresses; + unsigned Order; bool hasSameBaseAddress(const MachineInstr &MI) { for (unsigned i = 0; i < NumAddresses; i++) { @@ -183,8 +187,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass { }; struct BaseRegisters { - unsigned LoReg = 0; - unsigned HiReg = 0; + Register LoReg; + Register HiReg; unsigned LoSubReg = 0; unsigned HiSubReg = 0; @@ -201,7 +205,6 @@ private: const GCNSubtarget *STM = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; - const MCSubtargetInfo *STI = nullptr; MachineRegisterInfo *MRI = nullptr; AliasAnalysis *AA = nullptr; bool OptimizeAgain; @@ -209,9 +212,9 @@ private: static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII, const CombineInfo &Paired); - static bool offsetsCanBeCombined(CombineInfo &CI, const MCSubtargetInfo &STI, - CombineInfo &Paired); - static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI, + static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, + CombineInfo &Paired, bool Modify = false); + static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, const CombineInfo &Paired); static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, @@ -219,25 +222,42 @@ private: const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, const CombineInfo &Paired); - bool findMatchingInst(CombineInfo &CI, CombineInfo &Paired); + bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, + SmallVectorImpl<MachineInstr *> &InstsToMove); unsigned read2Opcode(unsigned EltSize) const; unsigned read2ST64Opcode(unsigned EltSize) const; - MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired); + MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, + CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove); unsigned write2Opcode(unsigned EltSize) const; unsigned write2ST64Opcode(unsigned EltSize) const; - MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired); - MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI, CombineInfo 
&Paired); - MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired); - MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired); - MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired); - MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired); - MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired); - - void updateBaseAndOffset(MachineInstr &I, unsigned NewBase, + MachineBasicBlock::iterator + mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator + mergeImagePair(CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator + mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator + mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator + mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator + mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator + mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove); + + void updateBaseAndOffset(MachineInstr &I, Register NewBase, int32_t NewOffset) const; - unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const; + Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; @@ -249,8 +269,11 @@ private: SmallPtrSet<MachineInstr *, 4> &Promoted) const; void addInstToMergeableList(const CombineInfo &CI, std::list<std::list<CombineInfo> > &MergeableInsts) const; - bool collectMergeableInsts(MachineBasicBlock &MBB, - std::list<std::list<CombineInfo> > &MergeableInsts) const; + + std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( + MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, + MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, + std::list<std::list<CombineInfo>> &MergeableInsts) const; public: static char ID; @@ -259,8 +282,6 @@ public: initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); } - void removeCombinedInst(std::list<CombineInfo> &MergeList, - const MachineInstr &MI); bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, bool &OptimizeListAgain); bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); @@ -275,6 +296,11 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties() + .set(MachineFunctionProperties::Property::IsSSA); + } }; static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { @@ -327,7 +353,8 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { } if (TII.isMIMG(Opc)) { // Ignore instructions encoded without vaddr. 
- if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1) + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) return UNKNOWN; // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || @@ -400,58 +427,54 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { } } -static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) { - if (TII.isMUBUF(Opc)) { - unsigned result = 0; +static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { + AddressRegs Result; - if (AMDGPU::getMUBUFHasVAddr(Opc)) { - result |= VADDR; - } - - if (AMDGPU::getMUBUFHasSrsrc(Opc)) { - result |= SRSRC; - } - - if (AMDGPU::getMUBUFHasSoffset(Opc)) { - result |= SOFFSET; - } - - return result; + if (TII.isMUBUF(Opc)) { + if (AMDGPU::getMUBUFHasVAddr(Opc)) + Result.VAddr = true; + if (AMDGPU::getMUBUFHasSrsrc(Opc)) + Result.SRsrc = true; + if (AMDGPU::getMUBUFHasSoffset(Opc)) + Result.SOffset = true; + + return Result; } if (TII.isMIMG(Opc)) { - unsigned result = VADDR | SRSRC; + int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); + if (VAddr0Idx >= 0) { + int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); + Result.NumVAddrs = SRsrcIdx - VAddr0Idx; + } else { + Result.VAddr = true; + } + Result.SRsrc = true; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) - result |= SSAMP; + Result.SSamp = true; - return result; + return Result; } if (TII.isMTBUF(Opc)) { - unsigned result = 0; - - if (AMDGPU::getMTBUFHasVAddr(Opc)) { - result |= VADDR; - } - - if (AMDGPU::getMTBUFHasSrsrc(Opc)) { - result |= SRSRC; - } - - if (AMDGPU::getMTBUFHasSoffset(Opc)) { - result |= SOFFSET; - } - - return result; + if (AMDGPU::getMTBUFHasVAddr(Opc)) + Result.VAddr = true; + if (AMDGPU::getMTBUFHasSrsrc(Opc)) + Result.SRsrc = true; + if (AMDGPU::getMTBUFHasSoffset(Opc)) + Result.SOffset = true; + + return Result; } switch (Opc) { default: - return 0; + return Result; case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: - return SBASE; + Result.SBase = true; + return Result; case AMDGPU::DS_READ_B32: case AMDGPU::DS_READ_B64: case AMDGPU::DS_READ_B32_gfx9: @@ -460,7 +483,8 @@ static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64: case AMDGPU::DS_WRITE_B32_gfx9: case AMDGPU::DS_WRITE_B64_gfx9: - return ADDR; + Result.Addr = true; + return Result; } } @@ -486,7 +510,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, : 4; break; case S_BUFFER_LOAD_IMM: - EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4); + EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4); break; default: EltSize = 4; @@ -495,6 +519,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, if (InstClass == MIMG) { DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); + // Offset is not considered for MIMG instructions. 
+ Offset = 0; } else { int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); Offset = I->getOperand(OffsetIdx).getImm(); @@ -515,40 +541,34 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); } - unsigned AddrOpName[5] = {0}; - NumAddresses = 0; - const unsigned Regs = getRegs(I->getOpcode(), TII); - - if (Regs & ADDR) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; - } + AddressRegs Regs = getRegs(Opc, TII); - if (Regs & SBASE) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; - } - - if (Regs & SRSRC) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; - } - - if (Regs & SOFFSET) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - } - - if (Regs & VADDR) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; - } - - if (Regs & SSAMP) { - AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp; - } - - for (unsigned i = 0; i < NumAddresses; i++) { - AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); - AddrReg[i] = &I->getOperand(AddrIdx[i]); - } - - InstsToMove.clear(); + NumAddresses = 0; + for (unsigned J = 0; J < Regs.NumVAddrs; J++) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; + if (Regs.Addr) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); + if (Regs.SBase) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); + if (Regs.SRsrc) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); + if (Regs.SOffset) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); + if (Regs.VAddr) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); + if (Regs.SSamp) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); + assert(NumAddresses <= MaxAddressRegs); + + for (unsigned J = 0; J < NumAddresses; J++) + AddrReg[J] = &I->getOperand(AddrIdx[J]); } } // end anonymous namespace. @@ -578,8 +598,8 @@ static void moveInstsAfter(MachineBasicBlock::iterator I, } static void addDefsUsesToList(const MachineInstr &MI, - DenseSet<unsigned> &RegDefs, - DenseSet<unsigned> &PhysRegUses) { + DenseSet<Register> &RegDefs, + DenseSet<Register> &PhysRegUses) { for (const MachineOperand &Op : MI.operands()) { if (Op.isReg()) { if (Op.isDef()) @@ -601,8 +621,8 @@ static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, // Add MI and its defs to the lists if MI reads one of the defs that are // already in the list. Returns true in that case. -static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs, - DenseSet<unsigned> &PhysRegUses, +static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs, + DenseSet<Register> &PhysRegUses, SmallVectorImpl<MachineInstr *> &Insts) { for (MachineOperand &Use : MI.operands()) { // If one of the defs is read, then there is a use of Def between I and the @@ -671,7 +691,8 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, // Check other optional immediate operands for equality. 
unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc, AMDGPU::OpName::d16, AMDGPU::OpName::unorm, - AMDGPU::OpName::da, AMDGPU::OpName::r128}; + AMDGPU::OpName::da, AMDGPU::OpName::r128, + AMDGPU::OpName::a16, AMDGPU::OpName::dlc}; for (auto op : OperandsToMatch) { int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); @@ -695,7 +716,7 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, static unsigned getBufferFormatWithCompCount(unsigned OldFormat, unsigned ComponentCount, - const MCSubtargetInfo &STI) { + const GCNSubtarget &STI) { if (ComponentCount > 4) return 0; @@ -719,8 +740,9 @@ static unsigned getBufferFormatWithCompCount(unsigned OldFormat, } bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, - const MCSubtargetInfo &STI, - CombineInfo &Paired) { + const GCNSubtarget &STI, + CombineInfo &Paired, + bool Modify) { assert(CI.InstClass != MIMG); // XXX - Would the same offset be OK? Is there any reason this would happen or @@ -761,7 +783,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, CI.UseST64 = false; CI.BaseOff = 0; - // Handle SMEM and VMEM instructions. + // Handle DS instructions. if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { return (EltOffset0 + CI.Width == EltOffset1 || EltOffset1 + Paired.Width == EltOffset0) && @@ -769,20 +791,25 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC); } + // Handle SMEM and VMEM instructions. // If the offset in elements doesn't fit in 8-bits, we might be able to use // the stride 64 versions. if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { - CI.Offset = EltOffset0 / 64; - Paired.Offset = EltOffset1 / 64; - CI.UseST64 = true; + if (Modify) { + CI.Offset = EltOffset0 / 64; + Paired.Offset = EltOffset1 / 64; + CI.UseST64 = true; + } return true; } // Check if the new offsets fit in the reduced 8-bit range. if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { - CI.Offset = EltOffset0; - Paired.Offset = EltOffset1; + if (Modify) { + CI.Offset = EltOffset0; + Paired.Offset = EltOffset1; + } return true; } @@ -791,15 +818,19 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, CI.BaseOff = std::min(CI.Offset, Paired.Offset); if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) { - CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64; - Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64; - CI.UseST64 = true; + if (Modify) { + CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64; + Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64; + CI.UseST64 = true; + } return true; } if (isUInt<8>(OffsetDiff)) { - CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize; - Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize; + if (Modify) { + CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize; + Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize; + } return true; } @@ -824,11 +855,19 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, } } -bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI, - CombineInfo &Paired) { - MachineBasicBlock *MBB = CI.I->getParent(); - MachineBasicBlock::iterator E = MBB->end(); - MachineBasicBlock::iterator MBBI = CI.I; +/// This function assumes that CI comes before Paired in a basic block. 
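An aside before checkAndPrepareMerge continues below: the DS handling in offsetsCanBeCombined above boils down to a small encoding question for the paired element offsets. A simplified sketch (not the pass itself; dsOffsetsEncode is a made-up name, and this leaves out the branch that materializes a common base offset to bring large offsets back into range):

    #include <cstdint>

    // Element offsets of a DS read2/write2 pair must each fit in 8 bits, either
    // directly or, when both are multiples of 64, in the ST64 form's units.
    bool dsOffsetsEncode(uint32_t elt0, uint32_t elt1, bool &useST64) {
      auto fitsU8 = [](uint32_t v) { return v <= 0xff; };
      if (elt0 % 64 == 0 && elt1 % 64 == 0 &&
          fitsU8(elt0 / 64) && fitsU8(elt1 / 64)) {
        useST64 = true;                       // select the *_ST64 opcode
        return true;
      }
      useST64 = false;
      return fitsU8(elt0) && fitsU8(elt1);    // plain read2/write2 encoding
    }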
+bool SILoadStoreOptimizer::checkAndPrepareMerge( + CombineInfo &CI, CombineInfo &Paired, + SmallVectorImpl<MachineInstr *> &InstsToMove) { + + // Check both offsets (or masks for MIMG) can be combined and fit in the + // reduced range. + if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired)) + return false; + + if (CI.InstClass != MIMG && + (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))) + return false; const unsigned Opc = CI.I->getOpcode(); const InstClassEnum InstClass = getInstClass(Opc, *TII); @@ -844,14 +883,25 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI, if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm()) return false; - ++MBBI; - - DenseSet<unsigned> RegDefsToMove; - DenseSet<unsigned> PhysRegUsesToMove; + DenseSet<Register> RegDefsToMove; + DenseSet<Register> PhysRegUsesToMove; addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); + MachineBasicBlock::iterator E = std::next(Paired.I); + MachineBasicBlock::iterator MBBI = std::next(CI.I); + MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); for (; MBBI != E; ++MBBI) { + if (MBBI == MBBE) { + // CombineInfo::Order is a hint on the instruction ordering within the + // basic block. This hint suggests that CI precedes Paired, which is + // true most of the time. However, moveInstsAfter() processing a + // previous list may have changed this order in a situation when it + // moves an instruction which exists in some other merge list. + // In this case it must be dependent. + return false; + } + if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) || (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) { // This is not a matching instruction, but we can keep looking as @@ -868,11 +918,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI, if (MBBI->mayLoadOrStore() && (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) { + !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) { // We fail condition #1, but we may still be able to satisfy condition // #2. Add this instruction to the move list and then we will check // if condition #2 holds once we have selected the matching instruction. - CI.InstsToMove.push_back(&*MBBI); + InstsToMove.push_back(&*MBBI); addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove); continue; } @@ -881,7 +931,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI, // to the location of the matched instruction any uses of I will need to // be moved down as well. addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, - CI.InstsToMove); + InstsToMove); continue; } @@ -901,26 +951,24 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI, // where the DS_READ_B32 ends up in InstsToMove and therefore prevents // merging of the two writes. if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, - CI.InstsToMove)) + InstsToMove)) continue; - bool Match = CI.hasSameBaseAddress(*MBBI); - - if (Match) { - Paired.setMI(MBBI, *TII, *STM); - - // Check both offsets (or masks for MIMG) can be combined and fit in the - // reduced range. - bool canBeCombined = - CI.InstClass == MIMG - ? 
dmasksCanBeCombined(CI, *TII, Paired) - : widthsFit(*STM, CI, Paired) && offsetsCanBeCombined(CI, *STI, Paired); - - // We also need to go through the list of instructions that we plan to + if (&*MBBI == &*Paired.I) { + // We need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. - if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) + if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) { + + // Call offsetsCanBeCombined with modify = true so that the offsets are + // correct for the new instruction. This should return true, because + // this function should only be called on CombineInfo objects that + // have already been confirmed to be mergeable. + if (CI.InstClass != MIMG) + offsetsCanBeCombined(CI, *STM, Paired, true); return true; + } + return false; } // We've found a load/store that we couldn't merge for some reason. @@ -929,7 +977,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI, // down past this instruction. // check if we can move I across MBBI and if we can move all I's users if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) + !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) break; } return false; @@ -950,7 +998,8 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { } MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired) { +SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove) { MachineBasicBlock *MBB = CI.I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird @@ -1023,7 +1072,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired) { .add(*Dest1) .addReg(DestReg, RegState::Kill, SubRegIdx1); - moveInstsAfter(Copy1, CI.InstsToMove); + moveInstsAfter(Copy1, InstsToMove); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1049,7 +1098,8 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { } MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired) { +SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove) { MachineBasicBlock *MBB = CI.I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. 
We want to be @@ -1106,7 +1156,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired) { .addImm(0) // gds .cloneMergedMemRefs({&*CI.I, &*Paired.I}); - moveInstsAfter(Write2, CI.InstsToMove); + moveInstsAfter(Write2, InstsToMove); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1116,7 +1166,8 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired) { } MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired) { +SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1161,15 +1212,16 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired) { .add(*Dest1) .addReg(DestReg, RegState::Kill, SubRegIdx1); - moveInstsAfter(Copy1, CI.InstsToMove); + moveInstsAfter(Copy1, InstsToMove); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); return New; } -MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired) { +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( + CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1211,15 +1263,16 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Pair .add(*Dest1) .addReg(DestReg, RegState::Kill, SubRegIdx1); - moveInstsAfter(Copy1, CI.InstsToMove); + moveInstsAfter(Copy1, InstsToMove); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); return New; } -MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) { +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( + CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1233,9 +1286,9 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); - const unsigned Regs = getRegs(Opcode, *TII); + AddressRegs Regs = getRegs(Opcode, *TII); - if (Regs & VADDR) + if (Regs.VAddr) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); // It shouldn't be possible to get this far if the two instructions @@ -1273,15 +1326,16 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) .add(*Dest1) .addReg(DestReg, RegState::Kill, SubRegIdx1); - moveInstsAfter(Copy1, CI.InstsToMove); + moveInstsAfter(Copy1, InstsToMove); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); return New; } -MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) { +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( + CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1295,13 +1349,13 @@ SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); - const unsigned Regs = getRegs(Opcode, *TII); + AddressRegs Regs = getRegs(Opcode, *TII); - if (Regs & VADDR) 
+ if (Regs.VAddr) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); unsigned JoinedFormat = - getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STI); + getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); // It shouldn't be possible to get this far if the two instructions // don't have a single memoperand, because MachineInstr::mayAlias() @@ -1340,15 +1394,16 @@ SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) .add(*Dest1) .addReg(DestReg, RegState::Kill, SubRegIdx1); - moveInstsAfter(Copy1, CI.InstsToMove); + moveInstsAfter(Copy1, InstsToMove); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); return New; } -MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired) { +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( + CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1374,13 +1429,13 @@ SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); - const unsigned Regs = getRegs(Opcode, *TII); + AddressRegs Regs = getRegs(Opcode, *TII); - if (Regs & VADDR) + if (Regs.VAddr) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); unsigned JoinedFormat = - getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STI); + getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); // It shouldn't be possible to get this far if the two instructions // don't have a single memoperand, because MachineInstr::mayAlias() @@ -1403,7 +1458,7 @@ SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired .addMemOperand( combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); - moveInstsAfter(MIB, CI.InstsToMove); + moveInstsAfter(MIB, InstsToMove); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1491,9 +1546,9 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, case 4: return &AMDGPU::SGPR_128RegClass; case 8: - return &AMDGPU::SReg_256RegClass; + return &AMDGPU::SGPR_256RegClass; case 16: - return &AMDGPU::SReg_512RegClass; + return &AMDGPU::SGPR_512RegClass; } } else { switch (CI.Width + Paired.Width) { @@ -1509,8 +1564,9 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, } } -MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired) { +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( + CombineInfo &CI, CombineInfo &Paired, + const SmallVectorImpl<MachineInstr *> &InstsToMove) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1536,9 +1592,9 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired) auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); - const unsigned Regs = getRegs(Opcode, *TII); + AddressRegs Regs = getRegs(Opcode, *TII); - if (Regs & VADDR) + if (Regs.VAddr) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); @@ -1561,7 +1617,7 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired) .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); - moveInstsAfter(MIB, CI.InstsToMove); + moveInstsAfter(MIB, InstsToMove); CI.I->eraseFromParent(); 
Paired.I->eraseFromParent(); @@ -1585,7 +1641,7 @@ SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { } // Compute base address using Addr and return the final register. -unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, +Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, const MemAddress &Addr) const { MachineBasicBlock *MBB = MI.getParent(); MachineBasicBlock::iterator MBBI = MI.getIterator(); @@ -1644,7 +1700,7 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, // Update base and offset with the NewBase and NewOffset in MI. void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, - unsigned NewBase, + Register NewBase, int32_t NewOffset) const { auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); Base->setReg(NewBase); @@ -1856,7 +1912,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( << AnchorAddr.Offset << "\n\n"); // Instead of moving up, just re-compute anchor-instruction's base address. - unsigned Base = computeBase(MI, AnchorAddr); + Register Base = computeBase(MI, AnchorAddr); updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); @@ -1894,39 +1950,80 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, MergeableInsts.emplace_back(1, CI); } -bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB, - std::list<std::list<CombineInfo> > &MergeableInsts) const { +std::pair<MachineBasicBlock::iterator, bool> +SILoadStoreOptimizer::collectMergeableInsts( + MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, + MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, + std::list<std::list<CombineInfo>> &MergeableInsts) const { bool Modified = false; - // Contain the list - MemInfoMap Visited; - // Contains the list of instructions for which constant offsets are being - // promoted to the IMM. - SmallPtrSet<MachineInstr *, 4> AnchorList; // Sort potential mergeable instructions into lists. One list per base address. - for (MachineInstr &MI : MBB.instrs()) { + unsigned Order = 0; + MachineBasicBlock::iterator BlockI = Begin; + for (; BlockI != End; ++BlockI) { + MachineInstr &MI = *BlockI; + // We run this before checking if an address is mergeable, because it can produce // better code even if the instructions aren't mergeable. if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) Modified = true; + // Don't combine if volatile. We also won't be able to merge across this, so + // break the search. We can look after this barrier for separate merges. + if (MI.hasOrderedMemoryRef()) { + LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI); + + // Search will resume after this instruction in a separate merge list. + ++BlockI; + break; + } + const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII); if (InstClass == UNKNOWN) continue; - // Don't combine if volatile. - if (MI.hasOrderedMemoryRef()) - continue; - CombineInfo CI; CI.setMI(MI, *TII, *STM); + CI.Order = Order++; if (!CI.hasMergeableAddress(*MRI)) continue; + LLVM_DEBUG(dbgs() << "Mergeable: " << MI); + addInstToMergeableList(CI, MergeableInsts); } - return Modified; + + // At this point we have lists of Mergeable instructions. + // + // Part 2: Sort lists by offset and then for each CombineInfo object in the + // list try to find an instruction that can be merged with I. If an instruction + // is found, it is stored in the Paired field. 
If no instructions are found, then + // the CombineInfo object is deleted from the list. + + for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), + E = MergeableInsts.end(); I != E;) { + + std::list<CombineInfo> &MergeList = *I; + if (MergeList.size() <= 1) { + // This means we have found only one instruction with a given address + // that can be merged, and we need at least 2 instructions to do a merge, + // so this list can be discarded. + I = MergeableInsts.erase(I); + continue; + } + + // Sort the lists by offsets, this way mergeable instructions will be + // adjacent to each other in the list, which will make it easier to find + // matches. + MergeList.sort( + [] (const CombineInfo &A, CombineInfo &B) { + return A.Offset < B.Offset; + }); + ++I; + } + + return std::make_pair(BlockI, Modified); } // Scan through looking for adjacent LDS operations with constant offsets from @@ -1936,117 +2033,126 @@ bool SILoadStoreOptimizer::optimizeBlock( std::list<std::list<CombineInfo> > &MergeableInsts) { bool Modified = false; - for (std::list<CombineInfo> &MergeList : MergeableInsts) { - if (MergeList.size() < 2) - continue; + for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), + E = MergeableInsts.end(); I != E;) { + std::list<CombineInfo> &MergeList = *I; bool OptimizeListAgain = false; if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { - // We weren't able to make any changes, so clear the list so we don't + // We weren't able to make any changes, so delete the list so we don't // process the same instructions the next time we try to optimize this // block. - MergeList.clear(); + I = MergeableInsts.erase(I); continue; } - // We made changes, but also determined that there were no more optimization - // opportunities, so we don't need to reprocess the list - if (!OptimizeListAgain) - MergeList.clear(); - - OptimizeAgain |= OptimizeListAgain; Modified = true; - } - return Modified; -} -void -SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList, - const MachineInstr &MI) { - - for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) { - if (&*CI->I == &MI) { - MergeList.erase(CI); - return; + // We made changes, but also determined that there were no more optimization + // opportunities, so we don't need to reprocess the list + if (!OptimizeListAgain) { + I = MergeableInsts.erase(I); + continue; } + OptimizeAgain = true; } + return Modified; } bool SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( std::list<CombineInfo> &MergeList, bool &OptimizeListAgain) { + if (MergeList.empty()) + return false; + bool Modified = false; - for (auto I = MergeList.begin(); I != MergeList.end(); ++I) { - CombineInfo &CI = *I; - CombineInfo Paired; - if (CI.InstClass == UNKNOWN) - continue; + for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end(); + Next = std::next(I)) { + + auto First = I; + auto Second = Next; + + if ((*First).Order > (*Second).Order) + std::swap(First, Second); + CombineInfo &CI = *First; + CombineInfo &Paired = *Second; - if (!findMatchingInst(CI, Paired)) - goto done; + SmallVector<MachineInstr *, 8> InstsToMove; + if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) { + ++I; + continue; + } Modified = true; - removeCombinedInst(MergeList, *Paired.I); + + LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); switch (CI.InstClass) { default: llvm_unreachable("unknown InstClass"); break; case DS_READ: { - MachineBasicBlock::iterator NewMI = 
mergeRead2Pair(CI, Paired); + MachineBasicBlock::iterator NewMI = + mergeRead2Pair(CI, Paired, InstsToMove); CI.setMI(NewMI, *TII, *STM); break; } case DS_WRITE: { - MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI, Paired); + MachineBasicBlock::iterator NewMI = + mergeWrite2Pair(CI, Paired, InstsToMove); CI.setMI(NewMI, *TII, *STM); break; } case S_BUFFER_LOAD_IMM: { - MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI, Paired); + MachineBasicBlock::iterator NewMI = + mergeSBufferLoadImmPair(CI, Paired, InstsToMove); CI.setMI(NewMI, *TII, *STM); OptimizeListAgain |= (CI.Width + Paired.Width) < 16; break; } case BUFFER_LOAD: { - MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI, Paired); + MachineBasicBlock::iterator NewMI = + mergeBufferLoadPair(CI, Paired, InstsToMove); CI.setMI(NewMI, *TII, *STM); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } case BUFFER_STORE: { - MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI, Paired); + MachineBasicBlock::iterator NewMI = + mergeBufferStorePair(CI, Paired, InstsToMove); CI.setMI(NewMI, *TII, *STM); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } case MIMG: { - MachineBasicBlock::iterator NewMI = mergeImagePair(CI, Paired); + MachineBasicBlock::iterator NewMI = + mergeImagePair(CI, Paired, InstsToMove); CI.setMI(NewMI, *TII, *STM); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } case TBUFFER_LOAD: { - MachineBasicBlock::iterator NewMI = mergeTBufferLoadPair(CI, Paired); + MachineBasicBlock::iterator NewMI = + mergeTBufferLoadPair(CI, Paired, InstsToMove); CI.setMI(NewMI, *TII, *STM); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } case TBUFFER_STORE: { - MachineBasicBlock::iterator NewMI = mergeTBufferStorePair(CI, Paired); + MachineBasicBlock::iterator NewMI = + mergeTBufferStorePair(CI, Paired, InstsToMove); CI.setMI(NewMI, *TII, *STM); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } } + CI.Order = Paired.Order; + if (I == Second) + I = Next; -done: - // Clear the InstsToMove after we have finished searching so we don't have - // stale values left over if we search for this CI again in another pass - // over the block. - CI.InstsToMove.clear(); + MergeList.erase(Second); } return Modified; @@ -2062,26 +2168,41 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { TII = STM->getInstrInfo(); TRI = &TII->getRegisterInfo(); - STI = &MF.getSubtarget<MCSubtargetInfo>(); MRI = &MF.getRegInfo(); AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - assert(MRI->isSSA() && "Must be run on SSA"); - LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); bool Modified = false; + // Contains the list of instructions for which constant offsets are being + // promoted to the IMM. This is tracked for an entire block at time. + SmallPtrSet<MachineInstr *, 4> AnchorList; + MemInfoMap Visited; for (MachineBasicBlock &MBB : MF) { - std::list<std::list<CombineInfo> > MergeableInsts; - // First pass: Collect list of all instructions we know how to merge. - Modified |= collectMergeableInsts(MBB, MergeableInsts); - do { - OptimizeAgain = false; - Modified |= optimizeBlock(MergeableInsts); - } while (OptimizeAgain); + MachineBasicBlock::iterator SectionEnd; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; + I = SectionEnd) { + bool CollectModified; + std::list<std::list<CombineInfo>> MergeableInsts; + + // First pass: Collect list of all instructions we know how to merge in a + // subset of the block. 
+ std::tie(SectionEnd, CollectModified) = + collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts); + + Modified |= CollectModified; + + do { + OptimizeAgain = false; + Modified |= optimizeBlock(MergeableInsts); + } while (OptimizeAgain); + } + + Visited.clear(); + AnchorList.clear(); } return Modified; diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 61d2719a3aad6..36d52ac3ee891 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -38,8 +38,8 @@ /// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch /// /// label0: -/// %sgpr0 = S_OR_SAVEEXEC_B64 %exec // Restore the exec mask for the Then block -/// %exec = S_XOR_B64 %sgpr0, %exec // Clear live bits from saved exec mask +/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0 // Restore the exec mask for the Then block +/// %exec = S_XOR_B64 %sgpr0, %exec // Update the exec mask /// S_BRANCH_EXECZ label1 // Use our branch optimization /// // instruction again. /// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr // Do the THEN block @@ -51,6 +51,8 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -73,6 +75,10 @@ using namespace llvm; #define DEBUG_TYPE "si-lower-control-flow" +static cl::opt<bool> +RemoveRedundantEndcf("amdgpu-remove-redundant-endcf", + cl::init(true), cl::ReallyHidden); + namespace { class SILowerControlFlow : public MachineFunctionPass { @@ -81,8 +87,12 @@ private: const SIInstrInfo *TII = nullptr; LiveIntervals *LIS = nullptr; MachineRegisterInfo *MRI = nullptr; + SetVector<MachineInstr*> LoweredEndCf; + DenseSet<Register> LoweredIf; + SmallSet<MachineInstr *, 16> NeedsKillCleanup; const TargetRegisterClass *BoolRC = nullptr; + bool InsertKillCleanups; unsigned AndOpc; unsigned OrOpc; unsigned XorOpc; @@ -98,13 +108,23 @@ private: void emitLoop(MachineInstr &MI); void emitEndCf(MachineInstr &MI); - Register getSaveExec(MachineInstr* MI); - void findMaskOperands(MachineInstr &MI, unsigned OpNo, SmallVectorImpl<MachineOperand> &Src) const; void combineMasks(MachineInstr &MI); + void process(MachineInstr &MI); + + // Skip to the next instruction, ignoring debug instructions, and trivial + // block boundaries (blocks that have one (typically fallthrough) successor, + // and the successor has one predecessor. + MachineBasicBlock::iterator + skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB, + MachineBasicBlock::iterator It) const; + + // Remove redundant SI_END_CF instructions. + void optimizeEndCf(); + public: static char ID; @@ -144,62 +164,44 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) { char &llvm::SILowerControlFlowID = SILowerControlFlow::ID; -static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI, - const SIInstrInfo *TII) { - Register SaveExecReg = MI.getOperand(0).getReg(); - auto U = MRI->use_instr_nodbg_begin(SaveExecReg); - - if (U == MRI->use_instr_nodbg_end() || - std::next(U) != MRI->use_instr_nodbg_end() || - U->getOpcode() != AMDGPU::SI_END_CF) - return false; - - // Check for SI_KILL_*_TERMINATOR on path from if to endif. - // if there is any such terminator simplififcations are not safe. 
- auto SMBB = MI.getParent(); - auto EMBB = U->getParent(); +static bool hasKill(const MachineBasicBlock *Begin, + const MachineBasicBlock *End, const SIInstrInfo *TII) { DenseSet<const MachineBasicBlock*> Visited; - SmallVector<MachineBasicBlock*, 4> Worklist(SMBB->succ_begin(), - SMBB->succ_end()); + SmallVector<MachineBasicBlock *, 4> Worklist(Begin->succ_begin(), + Begin->succ_end()); while (!Worklist.empty()) { MachineBasicBlock *MBB = Worklist.pop_back_val(); - if (MBB == EMBB || !Visited.insert(MBB).second) + if (MBB == End || !Visited.insert(MBB).second) continue; - for(auto &Term : MBB->terminators()) + for (auto &Term : MBB->terminators()) if (TII->isKillTerminator(Term.getOpcode())) - return false; + return true; Worklist.append(MBB->succ_begin(), MBB->succ_end()); } - return true; + return false; } -Register SILowerControlFlow::getSaveExec(MachineInstr *MI) { - MachineBasicBlock *MBB = MI->getParent(); - MachineOperand &SaveExec = MI->getOperand(0); - assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister); - - Register SaveExecReg = SaveExec.getReg(); - unsigned FalseTermOpc = - TII->isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; - MachineBasicBlock::iterator I = (MI); - MachineBasicBlock::iterator J = std::next(I); - if (J != MBB->end() && J->getOpcode() == FalseTermOpc && - J->getOperand(1).isReg() && J->getOperand(1).getReg() == SaveExecReg) { - SaveExecReg = J->getOperand(0).getReg(); - J->eraseFromParent(); - } - return SaveExecReg; +static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) { + Register SaveExecReg = MI.getOperand(0).getReg(); + auto U = MRI->use_instr_nodbg_begin(SaveExecReg); + + if (U == MRI->use_instr_nodbg_end() || + std::next(U) != MRI->use_instr_nodbg_end() || + U->getOpcode() != AMDGPU::SI_END_CF) + return false; + + return true; } void SILowerControlFlow::emitIf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); - Register SaveExecReg = getSaveExec(&MI); + Register SaveExecReg = MI.getOperand(0).getReg(); MachineOperand& Cond = MI.getOperand(1); assert(Cond.getSubReg() == AMDGPU::NoSubRegister); @@ -209,7 +211,35 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { // If there is only one use of save exec register and that use is SI_END_CF, // we can optimize SI_IF by returning the full saved exec mask instead of // just cleared bits. - bool SimpleIf = isSimpleIf(MI, MRI, TII); + bool SimpleIf = isSimpleIf(MI, MRI); + + if (InsertKillCleanups) { + // Check for SI_KILL_*_TERMINATOR on full path of control flow and + // flag the associated SI_END_CF for insertion of a kill cleanup. + auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg); + while (UseMI->getOpcode() != AMDGPU::SI_END_CF) { + assert(std::next(UseMI) == MRI->use_instr_nodbg_end()); + assert(UseMI->getOpcode() == AMDGPU::SI_ELSE); + MachineOperand &NextExec = UseMI->getOperand(0); + Register NextExecReg = NextExec.getReg(); + if (NextExec.isDead()) { + assert(!SimpleIf); + break; + } + UseMI = MRI->use_instr_nodbg_begin(NextExecReg); + } + if (UseMI->getOpcode() == AMDGPU::SI_END_CF) { + if (hasKill(MI.getParent(), UseMI->getParent(), TII)) { + NeedsKillCleanup.insert(&*UseMI); + SimpleIf = false; + } + } + } else if (SimpleIf) { + // Check for SI_KILL_*_TERMINATOR on path from if to endif. + // if there is any such terminator simplifications are not safe. 
+ auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg); + SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII); + } // Add an implicit def of exec to discourage scheduling VALU after this which // will interfere with trying to form s_and_saveexec_b64 later. @@ -219,6 +249,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg) .addReg(Exec) .addReg(Exec, RegState::ImplicitDefine); + LoweredIf.insert(CopyReg); Register Tmp = MRI->createVirtualRegister(BoolRC); @@ -282,7 +313,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - Register DstReg = getSaveExec(&MI); + Register DstReg = MI.getOperand(0).getReg(); bool ExecModified = MI.getOperand(3).getImm() != 0; MachineBasicBlock::iterator Start = MBB.begin(); @@ -354,7 +385,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - auto Dst = getSaveExec(&MI); + auto Dst = MI.getOperand(0).getReg(); // Skip ANDing with exec if the break condition is already masked by exec // because it is a V_CMP in the same basic block. (We know the break @@ -416,6 +447,38 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { MI.eraseFromParent(); } +MachineBasicBlock::iterator +SILowerControlFlow::skipIgnoreExecInstsTrivialSucc( + MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { + + SmallSet<const MachineBasicBlock *, 4> Visited; + MachineBasicBlock *B = &MBB; + do { + if (!Visited.insert(B).second) + return MBB.end(); + + auto E = B->end(); + for ( ; It != E; ++It) { + if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP) + continue; + if (TII->mayReadEXEC(*MRI, *It)) + break; + } + + if (It != E) + return It; + + if (B->succ_size() != 1) + return MBB.end(); + + // If there is one trivial successor, advance to the next block. + MachineBasicBlock *Succ = *B->succ_begin(); + + It = Succ->begin(); + B = Succ; + } while (true); +} + void SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -430,8 +493,20 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) { .addReg(Exec) .add(MI.getOperand(0)); - if (LIS) + LoweredEndCf.insert(NewMI); + + // If this ends control flow which contains kills (as flagged in emitIf) + // then insert an SI_KILL_CLEANUP immediately following the exec mask + // manipulation. This can be lowered to early termination if appropriate. + MachineInstr *CleanUpMI = nullptr; + if (NeedsKillCleanup.count(&MI)) + CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP)); + + if (LIS) { LIS->ReplaceMachineInstrInMaps(MI, *NewMI); + if (CleanUpMI) + LIS->InsertMachineInstrInMaps(*CleanUpMI); + } MI.eraseFromParent(); @@ -494,6 +569,84 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) { MRI->getUniqueVRegDef(Reg)->eraseFromParent(); } +void SILowerControlFlow::optimizeEndCf() { + // If the only instruction immediately following this END_CF is an another + // END_CF in the only successor we can avoid emitting exec mask restore here. 
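The skipIgnoreExecInstsTrivialSucc helper shown above is what lets optimizeEndCf recognize back-to-back END_CFs: starting just after a lowered END_CF, it walks forward past instructions that do not care about EXEC and falls through blocks that have exactly one successor until it hits something that does. Below is a rough standalone model of that traversal over a toy CFG; plain structs and a ReadsExec flag stand in for MachineBasicBlock and TII->mayReadEXEC, and the extra checks the real code performs (skipping SI_KILL_CLEANUP, verifying that the outer END_CF's saved exec comes from a lowered SI_IF) are left out:

  #include <cstdio>
  #include <set>
  #include <vector>

  // Toy stand-ins: an "instruction" only records whether it reads the exec
  // mask; a "block" is a list of instructions plus its successors.
  struct Inst {
    const char *Name;
    bool ReadsExec;
  };

  struct Block {
    std::vector<Inst> Insts;
    std::vector<Block *> Succs;
  };

  // Walk forward from (B, Idx), skipping instructions that ignore exec and
  // following blocks with exactly one successor, until an exec-reading
  // instruction is found. Returns nullptr if none is reachable this way.
  static const Inst *nextExecReader(Block *B, size_t Idx) {
    std::set<Block *> Visited;
    while (Visited.insert(B).second) {
      for (; Idx < B->Insts.size(); ++Idx)
        if (B->Insts[Idx].ReadsExec)
          return &B->Insts[Idx];
      if (B->Succs.size() != 1)
        return nullptr;       // control flow splits: give up
      B = B->Succs.front();   // trivial fallthrough: keep scanning
      Idx = 0;
    }
    return nullptr;           // looped back on ourselves
  }

  int main() {
    Block B2{{{"s_or_b64 exec, exec, s2", true}}, {}};
    Block B1{{{"s_mov_b32 s0, 0", false}}, {&B2}};
    Block B0{{{"s_or_b64 exec, exec, s1", true},
              {"s_add_u32 s0, s0, 1", false}}, {&B1}};

    // Starting just after B0's END_CF-style restore, the next instruction
    // that cares about exec is the restore in B2, two blocks away - the
    // situation in which the inner restore is redundant.
    if (const Inst *I = nextExecReader(&B0, 1))
      std::printf("next exec reader: %s\n", I->Name);
    return 0;
  }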
+ if (!RemoveRedundantEndcf) + return; + + for (MachineInstr *MI : LoweredEndCf) { + MachineBasicBlock &MBB = *MI->getParent(); + auto Next = + skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator())); + if (Next == MBB.end() || !LoweredEndCf.count(&*Next)) + continue; + // Only skip inner END_CF if outer ENDCF belongs to SI_IF. + // If that belongs to SI_ELSE then saved mask has an inverted value. + Register SavedExec + = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg(); + assert(SavedExec.isVirtual() && "Expected saved exec to be src1!"); + + const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec); + if (Def && LoweredIf.count(SavedExec)) { + LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump()); + if (LIS) + LIS->RemoveMachineInstrFromMaps(*MI); + MI->eraseFromParent(); + } + } +} + +void SILowerControlFlow::process(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + MachineBasicBlock::iterator I(MI); + MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr; + + switch (MI.getOpcode()) { + case AMDGPU::SI_IF: + emitIf(MI); + break; + + case AMDGPU::SI_ELSE: + emitElse(MI); + break; + + case AMDGPU::SI_IF_BREAK: + emitIfBreak(MI); + break; + + case AMDGPU::SI_LOOP: + emitLoop(MI); + break; + + case AMDGPU::SI_END_CF: + emitEndCf(MI); + break; + + default: + assert(false && "Attempt to process unsupported instruction"); + break; + } + + MachineBasicBlock::iterator Next; + for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MaskMI = *I; + switch (MaskMI.getOpcode()) { + case AMDGPU::S_AND_B64: + case AMDGPU::S_OR_B64: + case AMDGPU::S_AND_B32: + case AMDGPU::S_OR_B32: + // Cleanup bit manipulations on exec mask + combineMasks(MaskMI); + break; + default: + I = MBB.end(); + break; + } + } +} + bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); @@ -503,6 +656,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { LIS = getAnalysisIfAvailable<LiveIntervals>(); MRI = &MF.getRegInfo(); BoolRC = TRI->getBoolRC(); + InsertKillCleanups = + MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS; if (ST.isWave32()) { AndOpc = AMDGPU::S_AND_B32; @@ -524,57 +679,49 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { Exec = AMDGPU::EXEC; } + SmallVector<MachineInstr *, 32> Worklist; + MachineFunction::iterator NextBB; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; BI = NextBB) { NextBB = std::next(BI); MachineBasicBlock &MBB = *BI; - MachineBasicBlock::iterator I, Next, Last; - - for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) { + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; switch (MI.getOpcode()) { case AMDGPU::SI_IF: - emitIf(MI); + process(MI); break; case AMDGPU::SI_ELSE: - emitElse(MI); - break; - case AMDGPU::SI_IF_BREAK: - emitIfBreak(MI); - break; - case AMDGPU::SI_LOOP: - emitLoop(MI); - break; - case AMDGPU::SI_END_CF: - emitEndCf(MI); + // Only build worklist if SI_IF instructions must be processed first. 
+ if (InsertKillCleanups) + Worklist.push_back(&MI); + else + process(MI); break; - case AMDGPU::S_AND_B64: - case AMDGPU::S_OR_B64: - case AMDGPU::S_AND_B32: - case AMDGPU::S_OR_B32: - // Cleanup bit manipulations on exec mask - combineMasks(MI); - Last = I; - continue; - default: - Last = I; - continue; + break; } - - // Replay newly inserted code to combine masks - Next = (Last == MBB.end()) ? MBB.begin() : Last; } } + for (MachineInstr *MI : Worklist) + process(*MI); + + optimizeEndCf(); + + LoweredEndCf.clear(); + LoweredIf.clear(); + NeedsKillCleanup.clear(); + return true; } diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 1d45e6241d225..236a24a02ece0 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -452,6 +452,11 @@ static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) { /// all others, because phi lowering looks through copies and can therefore /// often make copy lowering unnecessary. bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { + // Only need to run this in SelectionDAG path. + if (TheMF.getProperties().hasProperty( + MachineFunctionProperties::Property::Selected)) + return false; + MF = &TheMF; MRI = &MF->getRegInfo(); DT = &getAnalysis<MachineDominatorTree>(); diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 57ccf7641666b..1349d3b6bf3f6 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -100,7 +100,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, unsigned Reg = CS.getReg(); MachineInstrSpan MIS(I, &SaveBlock); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = + TRI->getMinimalPhysRegClass(Reg, MVT::i32); TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC, TRI); @@ -118,7 +119,7 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock, /// Insert restore code for the callee-saved registers used in the function. 
static void insertCSRRestores(MachineBasicBlock &RestoreBlock, - std::vector<CalleeSavedInfo> &CSI, + MutableArrayRef<CalleeSavedInfo> CSI, LiveIntervals *LIS) { MachineFunction &MF = *RestoreBlock.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); @@ -133,7 +134,8 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) { for (const CalleeSavedInfo &CI : reverse(CSI)) { unsigned Reg = CI.getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = + TRI->getMinimalPhysRegClass(Reg, MVT::i32); TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI); assert(I != RestoreBlock.begin() && @@ -206,10 +208,10 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { for (unsigned I = 0; CSRegs[I]; ++I) { unsigned Reg = CSRegs[I]; if (SavedRegs.test(Reg)) { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + const TargetRegisterClass *RC = + TRI->getMinimalPhysRegClass(Reg, MVT::i32); int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC), - TRI->getSpillAlignment(*RC), - true); + TRI->getSpillAlign(*RC), true); CSI.push_back(CalleeSavedInfo(Reg, JunkFI)); } @@ -228,6 +230,47 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { return false; } +// Find lowest available VGPR and use it as VGPR reserved for SGPR spills. +static bool lowerShiftReservedVGPR(MachineFunction &MF, + const GCNSubtarget &ST) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + Register LowestAvailableVGPR, ReservedVGPR; + ArrayRef<MCPhysReg> AllVGPR32s = ST.getRegisterInfo()->getAllVGPR32(MF); + for (MCPhysReg Reg : AllVGPR32s) { + if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) { + LowestAvailableVGPR = Reg; + break; + } + } + + if (!LowestAvailableVGPR) + return false; + + ReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill; + const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); + int i = 0; + + for (MachineBasicBlock &MBB : MF) { + for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) { + if (Reg.VGPR == ReservedVGPR) { + MBB.removeLiveIn(ReservedVGPR); + MBB.addLiveIn(LowestAvailableVGPR); + Optional<int> FI; + if (FuncInfo->isCalleeSavedReg(CSRegs, LowestAvailableVGPR)) + FI = FrameInfo.CreateSpillStackObject(4, Align(4)); + + FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, i); + } + ++i; + } + MBB.sortUniqueLiveIns(); + } + + return true; +} + bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); @@ -267,6 +310,9 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // // This operates under the assumption that only other SGPR spills are users // of the frame index. 
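One more note on lowerShiftReservedVGPR above: the register-selection half of it is a plain first-fit scan over the VGPR_32 allocation order, looking for the first register that is still allocatable and unused, after which the function rewrites the spill bookkeeping and block live-ins to use that register instead of the previously reserved one. A small hypothetical sketch of just the first-fit part, with integers standing in for registers and made-up allocatable/used tables (illustration only):

  #include <cstdio>
  #include <optional>
  #include <vector>

  // Pick the first register in CandidateOrder that is both allocatable and
  // not already used, mirroring the scan over getAllVGPR32().
  static std::optional<unsigned>
  firstFreeReg(const std::vector<unsigned> &CandidateOrder,
               const std::vector<bool> &Allocatable,
               const std::vector<bool> &Used) {
    for (unsigned Reg : CandidateOrder)
      if (Allocatable[Reg] && !Used[Reg])
        return Reg;
    return std::nullopt;
  }

  int main() {
    // Toy machine with 8 VGPRs: v0-v2 are in use and v3 is not allocatable,
    // so the lowest available VGPR is v4.
    std::vector<unsigned> Order = {0, 1, 2, 3, 4, 5, 6, 7};
    std::vector<bool> Allocatable = {1, 1, 1, 0, 1, 1, 1, 1};
    std::vector<bool> Used        = {1, 1, 1, 0, 0, 0, 0, 0};

    unsigned ReservedSpillVGPR = 7; // the high VGPR reserved earlier
    if (auto NewReg = firstFreeReg(Order, Allocatable, Used))
      std::printf("shift SGPR-spill VGPR v%u -> v%u\n", ReservedSpillVGPR,
                  *NewReg);
    return 0;
  }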
+ + lowerShiftReservedVGPR(MF, ST); + for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator Next; for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { @@ -315,6 +361,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { } MadeChange = true; + } else if (FuncInfo->VGPRReservedForSGPRSpill) { + FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF); } SaveBlocks.clear(); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 0c67b1467a5d2..788e9873f780f 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -8,6 +8,7 @@ #include "SIMachineFunctionInfo.h" #include "AMDGPUArgumentUsageInfo.h" +#include "AMDGPUTargetMachine.h" #include "AMDGPUSubtarget.h" #include "SIRegisterInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -52,9 +53,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); - Occupancy = ST.computeOccupancy(MF, getLDSSize()); + Occupancy = ST.computeOccupancy(F, getLDSSize()); CallingConv::ID CC = F.getCallingConv(); + // FIXME: Should have analysis or something rather than attribute to detect + // calls. + const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); + + // Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't + // have any calls. + const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI && + (!isEntryFunction() || HasCalls); + if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { if (!F.arg_empty()) KernargSegmentPtr = true; @@ -68,16 +78,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) // Non-entry functions have no special inputs for now, other registers // required for scratch access. ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; - ScratchWaveOffsetReg = AMDGPU::SGPR33; // TODO: Pick a high register, and shift down, similar to a kernel. 
- FrameOffsetReg = AMDGPU::SGPR34; + FrameOffsetReg = AMDGPU::SGPR33; StackPtrOffsetReg = AMDGPU::SGPR32; ArgInfo.PrivateSegmentBuffer = ArgDescriptor::createRegister(ScratchRSrcReg); - ArgInfo.PrivateSegmentWaveByteOffset = - ArgDescriptor::createRegister(ScratchWaveOffsetReg); if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) ImplicitArgPtr = true; @@ -89,27 +96,35 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) } } - if (F.hasFnAttribute("amdgpu-work-group-id-x")) + if (UseFixedABI) { WorkGroupIDX = true; - - if (F.hasFnAttribute("amdgpu-work-group-id-y")) WorkGroupIDY = true; - - if (F.hasFnAttribute("amdgpu-work-group-id-z")) WorkGroupIDZ = true; - - if (F.hasFnAttribute("amdgpu-work-item-id-x")) WorkItemIDX = true; - - if (F.hasFnAttribute("amdgpu-work-item-id-y")) WorkItemIDY = true; - - if (F.hasFnAttribute("amdgpu-work-item-id-z")) WorkItemIDZ = true; + ImplicitArgPtr = true; + } else { + if (F.hasFnAttribute("amdgpu-work-group-id-x")) + WorkGroupIDX = true; + + if (F.hasFnAttribute("amdgpu-work-group-id-y")) + WorkGroupIDY = true; + + if (F.hasFnAttribute("amdgpu-work-group-id-z")) + WorkGroupIDZ = true; + + if (F.hasFnAttribute("amdgpu-work-item-id-x")) + WorkItemIDX = true; - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - bool HasStackObjects = FrameInfo.hasStackObjects(); + if (F.hasFnAttribute("amdgpu-work-item-id-y")) + WorkItemIDY = true; + + if (F.hasFnAttribute("amdgpu-work-item-id-z")) + WorkItemIDZ = true; + } + bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); if (isEntryFunction()) { // X, XY, and XYZ are the only supported combinations, so make sure Y is // enabled if Z is. @@ -129,36 +144,34 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (isAmdHsaOrMesa) { PrivateSegmentBuffer = true; - if (F.hasFnAttribute("amdgpu-dispatch-ptr")) + if (UseFixedABI) { DispatchPtr = true; - - if (F.hasFnAttribute("amdgpu-queue-ptr")) QueuePtr = true; - if (F.hasFnAttribute("amdgpu-dispatch-id")) + // FIXME: We don't need this? DispatchID = true; + } else { + if (F.hasFnAttribute("amdgpu-dispatch-ptr")) + DispatchPtr = true; + + if (F.hasFnAttribute("amdgpu-queue-ptr")) + QueuePtr = true; + + if (F.hasFnAttribute("amdgpu-dispatch-id")) + DispatchID = true; + } } else if (ST.isMesaGfxShader(F)) { ImplicitBufferPtr = true; } - if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr")) + if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr")) KernargSegmentPtr = true; if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) { - auto hasNonSpillStackObjects = [&]() { - // Avoid expensive checking if there's no stack objects. - if (!HasStackObjects) - return false; - for (auto OI = FrameInfo.getObjectIndexBegin(), - OE = FrameInfo.getObjectIndexEnd(); OI != OE; ++OI) - if (!FrameInfo.isSpillSlotObjectIndex(OI)) - return true; - // All stack objects are spill slots. - return false; - }; // TODO: This could be refined a lot. The attribute is a poor way of - // detecting calls that may require it before argument lowering. - if (hasNonSpillStackObjects() || F.hasFnAttribute("amdgpu-flat-scratch")) + // detecting calls or stack objects that may require it before argument + // lowering. 
+ if (HasCalls || HasStackObjects) FlatScratchInit = true; } @@ -184,7 +197,7 @@ void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) { MF.getFunction())); } -unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( +Register SIMachineFunctionInfo::addPrivateSegmentBuffer( const SIRegisterInfo &TRI) { ArgInfo.PrivateSegmentBuffer = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( @@ -193,21 +206,21 @@ unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( return ArgInfo.PrivateSegmentBuffer.getRegister(); } -unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { +Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; return ArgInfo.DispatchPtr.getRegister(); } -unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { +Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; return ArgInfo.QueuePtr.getRegister(); } -unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { +Register SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { ArgInfo.KernargSegmentPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); @@ -215,28 +228,29 @@ unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) return ArgInfo.KernargSegmentPtr.getRegister(); } -unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) { +Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) { ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; return ArgInfo.DispatchID.getRegister(); } -unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { +Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; return ArgInfo.FlatScratchInit.getRegister(); } -unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { +Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; return ArgInfo.ImplicitBufferPtr.getRegister(); } -static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) { +bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs, + MCPhysReg Reg) { for (unsigned I = 0; CSRegs[I]; ++I) { if (CSRegs[I] == Reg) return true; @@ -270,22 +284,35 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, MachineFrameInfo &FrameInfo = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned WaveSize = ST.getWavefrontSize(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); unsigned Size = FrameInfo.getObjectSize(FI); - assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size"); - assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs"); + unsigned NumLanes = Size / 4; + + if 
(NumLanes > WaveSize) + return false; - int NumLanes = Size / 4; + assert(Size >= 4 && "invalid sgpr spill size"); + assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs"); const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); // Make sure to handle the case where a wide SGPR spill may span between two // VGPRs. - for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) { - unsigned LaneVGPR; + for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) { + Register LaneVGPR; unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize); - if (VGPRIndex == 0) { + // Reserve a VGPR (when NumVGPRSpillLanes = 0, WaveSize, 2*WaveSize, ..) and + // when one of the two conditions is true: + // 1. One reserved VGPR being tracked by VGPRReservedForSGPRSpill is not yet + // reserved. + // 2. All spill lanes of reserved VGPR(s) are full and another spill lane is + // required. + if (FuncInfo->VGPRReservedForSGPRSpill && NumVGPRSpillLanes < WaveSize) { + assert(FuncInfo->VGPRReservedForSGPRSpill == SpillVGPRs.back().VGPR); + LaneVGPR = FuncInfo->VGPRReservedForSGPRSpill; + } else if (VGPRIndex == 0) { LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); if (LaneVGPR == AMDGPU::NoRegister) { // We have no VGPRs left for spilling SGPRs. Reset because we will not @@ -298,7 +325,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, Optional<int> CSRSpillFI; if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs && isCalleeSavedReg(CSRegs, LaneVGPR)) { - CSRSpillFI = FrameInfo.CreateSpillStackObject(4, 4); + CSRSpillFI = FrameInfo.CreateSpillStackObject(4, Align(4)); } SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI)); @@ -317,6 +344,19 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, return true; } +/// Reserve a VGPR for spilling of SGPRs +bool SIMachineFunctionInfo::reserveVGPRforSGPRSpills(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + + Register LaneVGPR = TRI->findUnusedRegister( + MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF, true); + SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, None)); + FuncInfo->VGPRReservedForSGPRSpill = LaneVGPR; + return true; +} + /// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI. /// Either AGPR is spilled to VGPR to vice versa. /// Returns true if a \p FI can be eliminated completely. @@ -386,9 +426,9 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF, } void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { - // The FP spill hasn't been inserted yet, so keep it around. + // The FP & BP spills haven't been inserted yet, so keep them around. for (auto &R : SGPRToVGPRSpills) { - if (R.first != FramePointerSaveIndex) + if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex) MFI.RemoveStackObject(R.first); } @@ -396,7 +436,7 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { // ID. 
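The lane bookkeeping in allocateSGPRSpillToVGPR above is worth spelling out: a spill of Size bytes needs Size/4 lanes (one 32-bit SGPR per lane), a single VGPR provides WaveSize lanes, a fresh VGPR is reserved whenever the running lane counter wraps around a wave boundary, and with this change a single spill wider than one wave is rejected outright. A small self-contained model of just that arithmetic, assuming wave64 and ignoring the spill contents and the reserved-VGPR special case:

  #include <cstdio>
  #include <vector>

  int main() {
    const unsigned WaveSize = 64;     // lanes per VGPR (wave64 assumed)
    unsigned NumVGPRSpillLanes = 0;   // running lane counter for the function
    std::vector<unsigned> SpillVGPRs; // one entry per VGPR reserved for spills

    // Spill three SGPR objects of 4, 32 and 256 bytes (1, 8 and 64 lanes).
    for (unsigned Size : {4u, 32u, 256u}) {
      unsigned NumLanes = Size / 4;   // each lane holds one 32-bit SGPR
      for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
        unsigned VGPRIndex = NumVGPRSpillLanes % WaveSize;
        if (VGPRIndex == 0) {         // previous VGPR is full: reserve another
          SpillVGPRs.push_back(SpillVGPRs.size());
          std::printf("reserve spill VGPR #%zu at lane counter %u\n",
                      SpillVGPRs.size() - 1, NumVGPRSpillLanes);
        }
      }
    }
    // 1 + 8 + 64 = 73 lanes in total, so two VGPRs are needed and the second
    // one is only partially used.
    std::printf("VGPRs used: %zu\n", SpillVGPRs.size());
    return 0;
  }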
for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e; ++i) - if (i != FramePointerSaveIndex) + if (i != FramePointerSaveIndex && i != BasePointerSaveIndex) MFI.setStackID(i, TargetStackID::Default); for (auto &R : VGPRToAGPRSpills) { @@ -414,7 +454,28 @@ MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const { return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; } -static yaml::StringValue regToString(unsigned Reg, +Register +SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (!ST.isAmdPalOS()) + return Register(); + Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in + if (ST.hasMergedShaders()) { + switch (MF.getFunction().getCallingConv()) { + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_GS: + // Low GIT address is passed in s8 rather than s0 for an LS+HS or + // ES+GS merged shader on gfx9+. + GitPtrLo = AMDGPU::SGPR8; + return GitPtrLo; + default: + return GitPtrLo; + } + } + return GitPtrLo; +} + +static yaml::StringValue regToString(Register Reg, const TargetRegisterInfo &TRI) { yaml::StringValue Dest; { @@ -487,7 +548,6 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( WaveLimiter(MFI.needsWaveLimiter()), HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()), ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), - ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)), FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)), StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)), ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), @@ -509,3 +569,21 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( WaveLimiter = YamlMFI.WaveLimiter; return false; } + +// Remove VGPR which was reserved for SGPR spills if there are no spilled SGPRs +bool SIMachineFunctionInfo::removeVGPRForSGPRSpill(Register ReservedVGPR, + MachineFunction &MF) { + for (auto *i = SpillVGPRs.begin(); i < SpillVGPRs.end(); i++) { + if (i->VGPR == ReservedVGPR) { + SpillVGPRs.erase(i); + + for (MachineBasicBlock &MBB : MF) { + MBB.removeLiveIn(ReservedVGPR); + MBB.sortUniqueLiveIns(); + } + this->VGPRReservedForSGPRSpill = AMDGPU::NoRegister; + return true; + } + } + return false; +} diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index ef0186f7d57fe..cf1629fda0aff 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -236,23 +236,29 @@ template <> struct MappingTraits<SIArgumentInfo> { struct SIMode { bool IEEE = true; bool DX10Clamp = true; - bool FP32Denormals = true; - bool FP64FP16Denormals = true; + bool FP32InputDenormals = true; + bool FP32OutputDenormals = true; + bool FP64FP16InputDenormals = true; + bool FP64FP16OutputDenormals = true; SIMode() = default; SIMode(const AMDGPU::SIModeRegisterDefaults &Mode) { IEEE = Mode.IEEE; DX10Clamp = Mode.DX10Clamp; - FP32Denormals = Mode.FP32Denormals; - FP64FP16Denormals = Mode.FP64FP16Denormals; + FP32InputDenormals = Mode.FP32InputDenormals; + FP32OutputDenormals = Mode.FP32OutputDenormals; + FP64FP16InputDenormals = Mode.FP64FP16InputDenormals; + FP64FP16OutputDenormals = Mode.FP64FP16OutputDenormals; } bool operator ==(const SIMode Other) const { return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp && - FP32Denormals == Other.FP32Denormals && - FP64FP16Denormals == Other.FP64FP16Denormals; + FP32InputDenormals == Other.FP32InputDenormals && + FP32OutputDenormals 
== Other.FP32OutputDenormals && + FP64FP16InputDenormals == Other.FP64FP16InputDenormals && + FP64FP16OutputDenormals == Other.FP64FP16OutputDenormals; } }; @@ -260,8 +266,10 @@ template <> struct MappingTraits<SIMode> { static void mapping(IO &YamlIO, SIMode &Mode) { YamlIO.mapOptional("ieee", Mode.IEEE, true); YamlIO.mapOptional("dx10-clamp", Mode.DX10Clamp, true); - YamlIO.mapOptional("fp32-denormals", Mode.FP32Denormals, true); - YamlIO.mapOptional("fp64-fp16-denormals", Mode.FP64FP16Denormals, true); + YamlIO.mapOptional("fp32-input-denormals", Mode.FP32InputDenormals, true); + YamlIO.mapOptional("fp32-output-denormals", Mode.FP32OutputDenormals, true); + YamlIO.mapOptional("fp64-fp16-input-denormals", Mode.FP64FP16InputDenormals, true); + YamlIO.mapOptional("fp64-fp16-output-denormals", Mode.FP64FP16OutputDenormals, true); } }; @@ -276,7 +284,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { uint32_t HighBitsOf32BitAddress = 0; StringValue ScratchRSrcReg = "$private_rsrc_reg"; - StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg"; StringValue FrameOffsetReg = "$fp_reg"; StringValue StackPtrOffsetReg = "$sp_reg"; @@ -303,8 +310,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false); YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg, StringValue("$private_rsrc_reg")); - YamlIO.mapOptional("scratchWaveOffsetReg", MFI.ScratchWaveOffsetReg, - StringValue("$scratch_wave_offset_reg")); YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg, StringValue("$fp_reg")); YamlIO.mapOptional("stackPtrOffsetReg", MFI.StackPtrOffsetReg, @@ -323,20 +328,20 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { class SIMachineFunctionInfo final : public AMDGPUMachineFunction { friend class GCNTargetMachine; - unsigned TIDReg = AMDGPU::NoRegister; + Register TIDReg = AMDGPU::NoRegister; // Registers that may be reserved for spilling purposes. These may be the same // as the input registers. - unsigned ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG; - unsigned ScratchWaveOffsetReg = AMDGPU::SCRATCH_WAVE_OFFSET_REG; + Register ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG; - // This is the current function's incremented size from the kernel's scratch - // wave offset register. For an entry function, this is exactly the same as - // the ScratchWaveOffsetReg. - unsigned FrameOffsetReg = AMDGPU::FP_REG; + // This is the the unswizzled offset from the current dispatch's scratch wave + // base to the beginning of the current function's frame. + Register FrameOffsetReg = AMDGPU::FP_REG; - // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg. - unsigned StackPtrOffsetReg = AMDGPU::SP_REG; + // This is an ABI register used in the non-entry calling convention to + // communicate the unswizzled offset from the current dispatch's scratch wave + // base to the beginning of the new function's frame. + Register StackPtrOffsetReg = AMDGPU::SP_REG; AMDGPUFunctionArgInfo ArgInfo; @@ -429,11 +434,11 @@ private: public: struct SpilledReg { - unsigned VGPR = 0; + Register VGPR; int Lane = -1; SpilledReg() = default; - SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) {} + SpilledReg(Register R, int L) : VGPR (R), Lane (L) {} bool hasLane() { return Lane != -1;} bool hasReg() { return VGPR != 0;} @@ -441,13 +446,13 @@ public: struct SGPRSpillVGPRCSR { // VGPR used for SGPR spills - unsigned VGPR; + Register VGPR; // If the VGPR is a CSR, the stack slot used to save/restore it in the // prolog/epilog. 
Optional<int> FI; - SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {} + SGPRSpillVGPRCSR(Register V, Optional<int> F) : VGPR(V), FI(F) {} }; struct VGPRSpillToAGPR { @@ -457,12 +462,9 @@ public: SparseBitVector<> WWMReservedRegs; - void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); } + void ReserveWWMRegister(Register Reg) { WWMReservedRegs.set(Reg); } private: - // SGPR->VGPR spilling support. - using SpillRegMask = std::pair<unsigned, unsigned>; - // Track VGPR + wave index for each subregister of the SGPR spilled to // frameindex key. DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills; @@ -480,9 +482,17 @@ private: public: // FIXME /// If this is set, an SGPR used for save/restore of the register used for the /// frame pointer. - unsigned SGPRForFPSaveRestoreCopy = 0; + Register SGPRForFPSaveRestoreCopy; Optional<int> FramePointerSaveIndex; + /// If this is set, an SGPR used for save/restore of the register used for the + /// base pointer. + Register SGPRForBPSaveRestoreCopy; + Optional<int> BasePointerSaveIndex; + + Register VGPRReservedForSGPRSpill; + bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg); + public: SIMachineFunctionInfo(const MachineFunction &MF); @@ -498,6 +508,14 @@ public: return SpillVGPRs; } + void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) { + SpillVGPRs[Index].VGPR = NewVGPR; + SpillVGPRs[Index].FI = newFI; + VGPRReservedForSGPRSpill = NewVGPR; + } + + bool removeVGPRForSGPRSpill(Register ReservedVGPR, MachineFunction &MF); + ArrayRef<MCPhysReg> getAGPRSpillVGPRs() const { return SpillAGPR; } @@ -515,12 +533,13 @@ public: bool haveFreeLanesForSGPRSpill(const MachineFunction &MF, unsigned NumLane) const; bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); + bool reserveVGPRforSGPRSpills(MachineFunction &MF); bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); void removeDeadFrameIndices(MachineFrameInfo &MFI); bool hasCalculatedTID() const { return TIDReg != 0; }; - unsigned getTIDReg() const { return TIDReg; }; - void setTIDReg(unsigned Reg) { TIDReg = Reg; } + Register getTIDReg() const { return TIDReg; }; + void setTIDReg(Register Reg) { TIDReg = Reg; } unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; @@ -531,34 +550,34 @@ public: } // Add user SGPRs. - unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI); - unsigned addDispatchPtr(const SIRegisterInfo &TRI); - unsigned addQueuePtr(const SIRegisterInfo &TRI); - unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); - unsigned addDispatchID(const SIRegisterInfo &TRI); - unsigned addFlatScratchInit(const SIRegisterInfo &TRI); - unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI); + Register addPrivateSegmentBuffer(const SIRegisterInfo &TRI); + Register addDispatchPtr(const SIRegisterInfo &TRI); + Register addQueuePtr(const SIRegisterInfo &TRI); + Register addKernargSegmentPtr(const SIRegisterInfo &TRI); + Register addDispatchID(const SIRegisterInfo &TRI); + Register addFlatScratchInit(const SIRegisterInfo &TRI); + Register addImplicitBufferPtr(const SIRegisterInfo &TRI); // Add system SGPRs. 
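A note on the add*() helpers being converted to return Register above: they all share the same allocation pattern, in which each preloaded argument claims the next free user SGPRs (four for the 128-bit private segment buffer, two for each 64-bit pointer, one each for the 32-bit system values further down) and advances a counter so the next argument lands immediately after. A hypothetical sketch of that running layout with plain integers for SGPR numbers; the real code additionally aligns the wide super-registers, which this ignores:

  #include <cstdio>

  // Minimal model of user-SGPR layout: every argument takes the next NumRegs
  // registers starting at the current counter.
  struct UserSGPRAllocator {
    unsigned NumUserSGPRs = 0;

    unsigned add(const char *Name, unsigned NumRegs) {
      unsigned First = NumUserSGPRs;   // getNextUserSGPR() analogue
      NumUserSGPRs += NumRegs;
      std::printf("%-22s -> s[%u:%u]\n", Name, First, First + NumRegs - 1);
      return First;
    }
  };

  int main() {
    UserSGPRAllocator A;
    A.add("private segment buffer", 4); // 128-bit resource descriptor
    A.add("dispatch ptr", 2);           // 64-bit pointer
    A.add("queue ptr", 2);
    A.add("kernarg segment ptr", 2);
    A.add("dispatch id", 2);
    std::printf("user SGPRs used: %u\n", A.NumUserSGPRs);
    return 0;
  }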
- unsigned addWorkGroupIDX() { + Register addWorkGroupIDX() { ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDX.getRegister(); } - unsigned addWorkGroupIDY() { + Register addWorkGroupIDY() { ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDY.getRegister(); } - unsigned addWorkGroupIDZ() { + Register addWorkGroupIDZ() { ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDZ.getRegister(); } - unsigned addWorkGroupInfo() { + Register addWorkGroupInfo() { ArgInfo.WorkGroupInfo = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; return ArgInfo.WorkGroupInfo.getRegister(); @@ -577,14 +596,14 @@ public: ArgInfo.WorkItemIDZ = Arg; } - unsigned addPrivateSegmentWaveByteOffset() { + Register addPrivateSegmentWaveByteOffset() { ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } - void setPrivateSegmentWaveByteOffset(unsigned Reg) { + void setPrivateSegmentWaveByteOffset(Register Reg) { ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg); } @@ -660,13 +679,13 @@ public: return ArgInfo; } - std::pair<const ArgDescriptor *, const TargetRegisterClass *> + std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT> getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const { return ArgInfo.getPreloadedValue(Value); } Register getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const { - auto Arg = ArgInfo.getPreloadedValue(Value).first; + auto Arg = std::get<0>(ArgInfo.getPreloadedValue(Value)); return Arg ? Arg->getRegister() : Register(); } @@ -674,6 +693,8 @@ public: return GITPtrHigh; } + Register getGITPtrLoReg(const MachineFunction &MF) const; + uint32_t get32BitAddressHighBits() const { return HighBitsOf32BitAddress; } @@ -690,35 +711,31 @@ public: return NumUserSGPRs + NumSystemSGPRs; } - unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { + Register getPrivateSegmentWaveByteOffsetSystemSGPR() const { return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } /// Returns the physical register reserved for use as the resource /// descriptor for scratch accesses. - unsigned getScratchRSrcReg() const { + Register getScratchRSrcReg() const { return ScratchRSrcReg; } - void setScratchRSrcReg(unsigned Reg) { + void setScratchRSrcReg(Register Reg) { assert(Reg != 0 && "Should never be unset"); ScratchRSrcReg = Reg; } - unsigned getScratchWaveOffsetReg() const { - return ScratchWaveOffsetReg; - } - - unsigned getFrameOffsetReg() const { + Register getFrameOffsetReg() const { return FrameOffsetReg; } - void setFrameOffsetReg(unsigned Reg) { + void setFrameOffsetReg(Register Reg) { assert(Reg != 0 && "Should never be unset"); FrameOffsetReg = Reg; } - void setStackPtrOffsetReg(unsigned Reg) { + void setStackPtrOffsetReg(Register Reg) { assert(Reg != 0 && "Should never be unset"); StackPtrOffsetReg = Reg; } @@ -727,20 +744,15 @@ public: // NoRegister. This is mostly a workaround for MIR tests where state that // can't be directly computed from the function is not preserved in serialized // MIR. 
- unsigned getStackPtrOffsetReg() const { + Register getStackPtrOffsetReg() const { return StackPtrOffsetReg; } - void setScratchWaveOffsetReg(unsigned Reg) { - assert(Reg != 0 && "Should never be unset"); - ScratchWaveOffsetReg = Reg; - } - - unsigned getQueuePtrUserSGPR() const { + Register getQueuePtrUserSGPR() const { return ArgInfo.QueuePtr.getRegister(); } - unsigned getImplicitBufferPtrUserSGPR() const { + Register getImplicitBufferPtrUserSGPR() const { return ArgInfo.ImplicitBufferPtr.getRegister(); } @@ -853,7 +865,7 @@ public: } /// \returns SGPR used for \p Dim's work group ID. - unsigned getWorkGroupIDSGPR(unsigned Dim) const { + Register getWorkGroupIDSGPR(unsigned Dim) const { switch (Dim) { case 0: assert(hasWorkGroupIDX()); diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 004a3cb185d62..3ba05aadbbbee 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -269,8 +269,8 @@ SUnit* SIScheduleBlock::pickNode() { // Predict register usage after this instruction. TryCand.SU = SU; TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure); - TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()]; - TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()]; + TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32]; + TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32]; TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum]; TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum]; TryCand.HasLowLatencyNonWaitedParent = @@ -595,10 +595,12 @@ void SIScheduleBlock::printDebug(bool full) { } if (Scheduled) { - dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' ' - << LiveInPressure[DAG->getVGPRSetID()] << '\n'; - dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' ' - << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n"; + dbgs() << "LiveInPressure " + << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' ' + << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n'; + dbgs() << "LiveOutPressure " + << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' ' + << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n"; dbgs() << "LiveIns:\n"; for (unsigned Reg : LiveInRegs) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; @@ -1637,7 +1639,7 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock(); TryCand.VGPRUsageDiff = checkRegUsageImpact(TryCand.Block->getInRegs(), - TryCand.Block->getOutRegs())[DAG->getVGPRSetID()]; + TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32]; TryCand.NumSuccessors = TryCand.Block->getSuccs().size(); TryCand.NumHighLatencySuccessors = TryCand.Block->getNumHighLatencySuccessors(); @@ -1796,9 +1798,6 @@ SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) { SITII = static_cast<const SIInstrInfo*>(TII); SITRI = static_cast<const SIRegisterInfo*>(TRI); - - VGPRSetID = SITRI->getVGPRPressureSet(); - SGPRSetID = SITRI->getSGPRPressureSet(); } SIScheduleDAGMI::~SIScheduleDAGMI() = default; @@ -1909,9 +1908,9 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, continue; PSetIterator PSetI = MRI.getPressureSets(Reg); for (; PSetI.isValid(); ++PSetI) { - if (*PSetI == VGPRSetID) + if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32) VgprUsage += PSetI.getWeight(); - else if 
(*PSetI == SGPRSetID) + else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32) SgprUsage += PSetI.getWeight(); } } @@ -1952,10 +1951,11 @@ void SIScheduleDAGMI::schedule() int64_t OffLatReg; if (SITII->isLowLatencyInstruction(*SU->getInstr())) { IsLowLatencySU[i] = 1; + bool OffsetIsScalable; if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg, - TRI)) + OffsetIsScalable, TRI)) LowLatencyOffset[i] = OffLatReg; - } else if (SITII->isHighLatencyInstruction(*SU->getInstr())) + } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode())) IsHighLatencySU[i] = 1; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h index ec450a3164674..02e0a3fe1b610 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -435,9 +435,6 @@ class SIScheduleDAGMI final : public ScheduleDAGMILive { std::vector<unsigned> ScheduledSUnits; std::vector<unsigned> ScheduledSUnitsInv; - unsigned VGPRSetID; - unsigned SGPRSetID; - public: SIScheduleDAGMI(MachineSchedContext *C); @@ -484,9 +481,6 @@ public: return OutRegs; }; - unsigned getVGPRSetID() const { return VGPRSetID; } - unsigned getSGPRSetID() const { return SGPRSetID; } - private: void topologicalSort(); // After scheduling is done, improve low latency placements. diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index e914573306ae0..4e6c72ca20e28 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -254,6 +254,9 @@ protected: IsaVersion IV; + /// Whether to insert cache invalidation instructions. + bool InsertCacheInv; + SICacheControl(const GCNSubtarget &ST); public: @@ -650,6 +653,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( SICacheControl::SICacheControl(const GCNSubtarget &ST) { TII = ST.getInstrInfo(); IV = getIsaVersion(ST.getCPU()); + InsertCacheInv = !ST.isAmdPalOS(); } /* static */ @@ -714,6 +718,9 @@ bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const { + if (!InsertCacheInv) + return false; + bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); @@ -852,6 +859,9 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const { + if (!InsertCacheInv) + return false; + bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); @@ -954,6 +964,9 @@ bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const { + if (!InsertCacheInv) + return false; + bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); @@ -1289,6 +1302,21 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { for (auto &MBB : MF) { for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { + + if (MI->getOpcode() == TargetOpcode::BUNDLE && MI->mayLoadOrStore()) { + MachineBasicBlock::instr_iterator II(MI->getIterator()); + for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end(); + I != E && I->isBundledWithPred(); ++I) { + I->unbundleFromPred(); + for (MachineOperand &MO : I->operands()) + if (MO.isReg()) + MO.setIsInternalRead(false); + } + + MI->eraseFromParent(); + MI = II->getIterator(); + } + if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) continue; diff --git 
a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp index 52989a280e806..0e162ac42c111 100644 --- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp @@ -83,9 +83,7 @@ struct Status { return ((Mask & S.Mask) == S.Mask) && ((Mode & S.Mask) == S.Mode); } - bool isCombinable(Status &S) { - return !(Mask & S.Mask) || isCompatible(S); - } + bool isCombinable(Status &S) { return !(Mask & S.Mask) || isCompatible(S); } }; class BlockData { @@ -110,7 +108,11 @@ public: // which is used in Phase 3 if we need to insert a mode change. MachineInstr *FirstInsertionPoint; - BlockData() : FirstInsertionPoint(nullptr) {}; + // A flag to indicate whether an Exit value has been set (we can't tell by + // examining the Exit value itself as all values may be valid results). + bool ExitSet; + + BlockData() : FirstInsertionPoint(nullptr), ExitSet(false){}; }; namespace { @@ -131,6 +133,8 @@ public: Status DefaultStatus = Status(FP_ROUND_MODE_DP(0x3), FP_ROUND_MODE_DP(DefaultMode)); + bool Changed = false; + public: SIModeRegister() : MachineFunctionPass(ID) {} @@ -201,6 +205,7 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI, (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) | (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_)); ++NumSetregInserted; + Changed = true; InstrMode.Mask &= ~(((1 << Width) - 1) << Offset); } } @@ -325,24 +330,53 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB, // exit value is propagated. void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII) { -// BlockData *BI = BlockInfo[MBB.getNumber()]; + bool RevisitRequired = false; + bool ExitSet = false; unsigned ThisBlock = MBB.getNumber(); if (MBB.pred_empty()) { // There are no predecessors, so use the default starting status. BlockInfo[ThisBlock]->Pred = DefaultStatus; + ExitSet = true; } else { // Build a status that is common to all the predecessors by intersecting // all the predecessor exit status values. + // Mask bits (which represent the Mode bits with a known value) can only be + // added by explicit SETREG instructions or the initial default value - + // the intersection process may remove Mask bits. + // If we find a predecessor that has not yet had an exit value determined + // (this can happen for example if a block is its own predecessor) we defer + // use of that value as the Mask will be all zero, and we will revisit this + // block again later (unless the only predecessor without an exit value is + // this block). 
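The comment block above describes a standard forward dataflow computation: a block's entry (Pred) status is the intersection of its predecessors' exit statuses, a predecessor whose exit is not yet known forces the block to be revisited, and any change to a block's exit pushes its successors back onto the worklist. To make the merge rule concrete, here is a compact standalone model of the Status lattice and its intersect step, where a set mask bit means the corresponding mode bit has a known value (an illustration of the idea, not the pass's actual types):

  #include <cstdint>
  #include <cstdio>

  // A mode value in which only the bits covered by Mask are actually known.
  struct Status {
    uint32_t Mask = 0; // which mode bits have a known value
    uint32_t Mode = 0; // their values (meaningful only under Mask)

    // Meet operator: keep a bit only if both inputs know it and agree on it.
    Status intersect(const Status &S) const {
      uint32_t AgreedMask = Mask & S.Mask & ~(Mode ^ S.Mode);
      return {AgreedMask, Mode & AgreedMask};
    }
  };

  int main() {
    // Predecessor A knows all four low mode bits and leaves them as 0b0011;
    // predecessor B only knows the two lowest bits and disagrees in bit 1.
    Status ExitA{0xF, 0x3};
    Status ExitB{0x3, 0x1};

    Status Pred = ExitA.intersect(ExitB);
    std::printf("entry mask 0x%X mode 0x%X\n", (unsigned)Pred.Mask,
                (unsigned)Pred.Mode);
    // Only bit 0 survives: it is the only bit both predecessors know and
    // agree on, so only that bit can be trusted when deciding whether a
    // SETREG is needed at the start of the block.
    return 0;
  }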
MachineBasicBlock::pred_iterator P = MBB.pred_begin(), E = MBB.pred_end(); MachineBasicBlock &PB = *(*P); - BlockInfo[ThisBlock]->Pred = BlockInfo[PB.getNumber()]->Exit; + unsigned PredBlock = PB.getNumber(); + if ((ThisBlock == PredBlock) && (std::next(P) == E)) { + BlockInfo[ThisBlock]->Pred = DefaultStatus; + ExitSet = true; + } else if (BlockInfo[PredBlock]->ExitSet) { + BlockInfo[ThisBlock]->Pred = BlockInfo[PredBlock]->Exit; + ExitSet = true; + } else if (PredBlock != ThisBlock) + RevisitRequired = true; for (P = std::next(P); P != E; P = std::next(P)) { MachineBasicBlock *Pred = *P; - BlockInfo[ThisBlock]->Pred = BlockInfo[ThisBlock]->Pred.intersect(BlockInfo[Pred->getNumber()]->Exit); + unsigned PredBlock = Pred->getNumber(); + if (BlockInfo[PredBlock]->ExitSet) { + if (BlockInfo[ThisBlock]->ExitSet) { + BlockInfo[ThisBlock]->Pred = + BlockInfo[ThisBlock]->Pred.intersect(BlockInfo[PredBlock]->Exit); + } else { + BlockInfo[ThisBlock]->Pred = BlockInfo[PredBlock]->Exit; + } + ExitSet = true; + } else if (PredBlock != ThisBlock) + RevisitRequired = true; } } - Status TmpStatus = BlockInfo[ThisBlock]->Pred.merge(BlockInfo[ThisBlock]->Change); + Status TmpStatus = + BlockInfo[ThisBlock]->Pred.merge(BlockInfo[ThisBlock]->Change); if (BlockInfo[ThisBlock]->Exit != TmpStatus) { BlockInfo[ThisBlock]->Exit = TmpStatus; // Add the successors to the work list so we can propagate the changed exit @@ -354,6 +388,9 @@ void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB, Phase2List.push(&B); } } + BlockInfo[ThisBlock]->ExitSet = ExitSet; + if (RevisitRequired) + Phase2List.push(&MBB); } // In Phase 3 we revisit each block and if it has an insertion point defined we @@ -361,10 +398,10 @@ void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB, // not we insert an appropriate setreg instruction to modify the Mode register. void SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB, const SIInstrInfo *TII) { -// BlockData *BI = BlockInfo[MBB.getNumber()]; unsigned ThisBlock = MBB.getNumber(); if (!BlockInfo[ThisBlock]->Pred.isCompatible(BlockInfo[ThisBlock]->Require)) { - Status Delta = BlockInfo[ThisBlock]->Pred.delta(BlockInfo[ThisBlock]->Require); + Status Delta = + BlockInfo[ThisBlock]->Pred.delta(BlockInfo[ThisBlock]->Require); if (BlockInfo[ThisBlock]->FirstInsertionPoint) insertSetreg(MBB, BlockInfo[ThisBlock]->FirstInsertionPoint, TII, Delta); else @@ -401,5 +438,5 @@ bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) { BlockInfo.clear(); - return NumSetregInserted > 0; + return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 34199d3e425c4..8af00fcf62a82 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -7,15 +7,8 @@ //===----------------------------------------------------------------------===// // /// \file -/// This pass removes redundant S_OR_B64 instructions enabling lanes in -/// the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any -/// vector instructions between them we can only keep outer SI_END_CF, given -/// that CFG is structured and exec bits of the outer end statement are always -/// not less than exec bit of the inner one. -/// -/// This needs to be done before the RA to eliminate saved exec bits registers -/// but after register coalescer to have no vector registers copies in between -/// of different end cf statements. 
+/// This pass performs exec mask handling peephole optimizations which needs +/// to be done before register allocation to reduce register pressure. /// //===----------------------------------------------------------------------===// @@ -40,14 +33,6 @@ private: MachineRegisterInfo *MRI; public: - MachineBasicBlock::iterator skipIgnoreExecInsts( - MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const; - - MachineBasicBlock::iterator skipIgnoreExecInstsTrivialSucc( - MachineBasicBlock *&MBB, - MachineBasicBlock::iterator It) const; - -public: static char ID; SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) { @@ -83,93 +68,15 @@ FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() { return new SIOptimizeExecMaskingPreRA(); } -static bool isEndCF(const MachineInstr &MI, const SIRegisterInfo *TRI, - const GCNSubtarget &ST) { - if (ST.isWave32()) { - return MI.getOpcode() == AMDGPU::S_OR_B32 && - MI.modifiesRegister(AMDGPU::EXEC_LO, TRI); - } - - return MI.getOpcode() == AMDGPU::S_OR_B64 && - MI.modifiesRegister(AMDGPU::EXEC, TRI); -} - static bool isFullExecCopy(const MachineInstr& MI, const GCNSubtarget& ST) { unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - if (MI.isCopy() && MI.getOperand(1).getReg() == Exec) { - assert(MI.isFullCopy()); + if (MI.isFullCopy() && MI.getOperand(1).getReg() == Exec) return true; - } return false; } -static unsigned getOrNonExecReg(const MachineInstr &MI, - const SIInstrInfo &TII, - const GCNSubtarget& ST) { - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1); - if (Op->isReg() && Op->getReg() != Exec) - return Op->getReg(); - Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0); - if (Op->isReg() && Op->getReg() != Exec) - return Op->getReg(); - return AMDGPU::NoRegister; -} - -static MachineInstr* getOrExecSource(const MachineInstr &MI, - const SIInstrInfo &TII, - const MachineRegisterInfo &MRI, - const GCNSubtarget& ST) { - auto SavedExec = getOrNonExecReg(MI, TII, ST); - if (SavedExec == AMDGPU::NoRegister) - return nullptr; - auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec); - if (!SaveExecInst || !isFullExecCopy(*SaveExecInst, ST)) - return nullptr; - return SaveExecInst; -} - -/// Skip over instructions that don't care about the exec mask. -MachineBasicBlock::iterator SIOptimizeExecMaskingPreRA::skipIgnoreExecInsts( - MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const { - for ( ; I != E; ++I) { - if (TII->mayReadEXEC(*MRI, *I)) - break; - } - - return I; -} - -// Skip to the next instruction, ignoring debug instructions, and trivial block -// boundaries (blocks that have one (typically fallthrough) successor, and the -// successor has one predecessor. -MachineBasicBlock::iterator -SIOptimizeExecMaskingPreRA::skipIgnoreExecInstsTrivialSucc( - MachineBasicBlock *&MBB, - MachineBasicBlock::iterator It) const { - - do { - It = skipIgnoreExecInsts(It, MBB->end()); - if (It != MBB->end() || MBB->succ_size() != 1) - break; - - // If there is one trivial successor, advance to the next block. - MachineBasicBlock *Succ = *MBB->succ_begin(); - - // TODO: Is this really necessary? 
- if (!MBB->isLayoutSuccessor(Succ)) - break; - - It = Succ->begin(); - MBB = Succ; - } while (true); - - return It; -} - - // Optimize sequence // %sel = V_CNDMASK_B32_e64 0, 1, %cc // %cmp = V_CMP_NE_U32 1, %1 @@ -261,6 +168,11 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, And->getOperand(0).getReg()) .addReg(ExecReg) .addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg()); + MachineOperand &AndSCC = And->getOperand(3); + assert(AndSCC.getReg() == AMDGPU::SCC); + MachineOperand &Andn2SCC = Andn2->getOperand(3); + assert(Andn2SCC.getReg() == AMDGPU::SCC); + Andn2SCC.setIsDead(AndSCC.isDead()); And->eraseFromParent(); LIS->InsertMachineInstrInMaps(*Andn2); @@ -379,57 +291,30 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { continue; } - // Try to collapse adjacent endifs. - auto E = MBB.end(); - auto Lead = skipDebugInstructionsForward(MBB.begin(), E); - if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI, ST)) - continue; - - MachineBasicBlock *TmpMBB = &MBB; - auto NextLead = skipIgnoreExecInstsTrivialSucc(TmpMBB, std::next(Lead)); - if (NextLead == TmpMBB->end() || !isEndCF(*NextLead, TRI, ST) || - !getOrExecSource(*NextLead, *TII, MRI, ST)) - continue; - - LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n'); - - auto SaveExec = getOrExecSource(*Lead, *TII, MRI, ST); - unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII, ST); - for (auto &Op : Lead->operands()) { - if (Op.isReg()) - RecalcRegs.insert(Op.getReg()); - } - - LIS->RemoveMachineInstrFromMaps(*Lead); - Lead->eraseFromParent(); - if (SaveExecReg) { - LIS->removeInterval(SaveExecReg); - LIS->createAndComputeVirtRegInterval(SaveExecReg); - } - - Changed = true; - - // If the only use of saved exec in the removed instruction is S_AND_B64 - // fold the copy now. - if (!SaveExec || !SaveExec->isFullCopy()) - continue; + // If the only user of a logical operation is move to exec, fold it now + // to prevent forming of saveexec. 
I.e: + // + // %0:sreg_64 = COPY $exec + // %1:sreg_64 = S_AND_B64 %0:sreg_64, %2:sreg_64 + // => + // %1 = S_AND_B64 $exec, %2:sreg_64 + unsigned ScanThreshold = 10; + for (auto I = MBB.rbegin(), E = MBB.rend(); I != E + && ScanThreshold--; ++I) { + if (!isFullExecCopy(*I, ST)) + continue; - Register SavedExec = SaveExec->getOperand(0).getReg(); - bool SafeToReplace = true; - for (auto& U : MRI.use_nodbg_instructions(SavedExec)) { - if (U.getParent() != SaveExec->getParent()) { - SafeToReplace = false; - break; + Register SavedExec = I->getOperand(0).getReg(); + if (SavedExec.isVirtual() && MRI.hasOneNonDBGUse(SavedExec) && + MRI.use_instr_nodbg_begin(SavedExec)->getParent() == I->getParent()) { + LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n'); + LIS->RemoveMachineInstrFromMaps(*I); + I->eraseFromParent(); + MRI.replaceRegWith(SavedExec, Exec); + LIS->removeInterval(SavedExec); + Changed = true; } - - LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n'); - } - - if (SafeToReplace) { - LIS->RemoveMachineInstrFromMaps(*SaveExec); - SaveExec->eraseFromParent(); - MRI.replaceRegWith(SavedExec, Exec); - LIS->removeInterval(SavedExec); + break; } } diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 05c81feb23ecd..9a1855c3458be 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -244,11 +244,6 @@ static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { return OS; } -static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) { - Operand.print(OS); - return OS; -} - LLVM_DUMP_METHOD void SDWASrcOperand::print(raw_ostream& OS) const { OS << "SDWA src: " << *getTargetOperand() @@ -850,6 +845,13 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { return std::unique_ptr<SDWAOperand>(nullptr); } +#if !defined(NDEBUG) +static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) { + Operand.print(OS); + return OS; +} +#endif + void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { for (MachineInstr &MI : MBB) { if (auto Operand = matchSDWAOperand(MI)) { @@ -920,18 +922,24 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, if (I->modifiesRegister(AMDGPU::VCC, TRI)) return; } + // Make the two new e32 instruction variants. 
// Replace MI with V_{SUB|ADD}_I32_e32 - auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc)); - NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst)); - NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); - NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1)); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc)) + .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) + .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)) + .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1)) + .setMIFlags(MI.getFlags()); + MI.eraseFromParent(); + // Replace MISucc with V_{SUBB|ADDC}_U32_e32 - auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc)); - NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst)); - NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0)); - NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1)); + BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc)) + .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst)) + .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0)) + .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1)) + .setMIFlags(MISucc.getFlags()); + MISucc.eraseFromParent(); } @@ -1008,7 +1016,8 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, // Create SDWA version of instruction MI and initialize its operands MachineInstrBuilder SDWAInst = - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc) + .setMIFlags(MI.getFlags()); // Copy dst, if it is present in original then should also be present in SDWA MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp new file mode 100644 index 0000000000000..4c72fa2359750 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp @@ -0,0 +1,139 @@ +//===-- SIPostRABundler.cpp -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass creates bundles of memory instructions to protect adjacent loads +/// and stores from beeing rescheduled apart from each other post-RA. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-post-ra-bundler" + +namespace { + +class SIPostRABundler : public MachineFunctionPass { +public: + static char ID; + +public: + SIPostRABundler() : MachineFunctionPass(ID) { + initializeSIPostRABundlerPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SI post-RA bundler"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + const SIRegisterInfo *TRI; + + SmallSet<Register, 16> Defs; + + bool isDependentLoad(const MachineInstr &MI) const; + +}; + +} // End anonymous namespace. 
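As a side note, the grouping rule described in the bundler's file comment above (keep adjacent memory instructions together while they are all loads or all stores of the same memory class, and break the group at a load that depends on a definition made earlier inside it) can be sketched without any LLVM machinery. The toy types and register ids below are purely illustrative, not the pass's real data structures:

#include <cassert>
#include <cstdint>
#include <set>
#include <vector>

// Toy stand-ins for machine instructions.
struct ToyInst {
  bool IsLoad = false;
  bool IsStore = false;
  uint64_t MemClass = 0;       // one bit per memory class (MUBUF, SMEM, DS, ...)
  std::vector<int> Defs, Uses; // register ids
};

// A load that reads a register defined earlier in the candidate bundle would
// need that earlier result first, so it has to start a new group.
static bool isDependentLoad(const ToyInst &MI, const std::set<int> &Defs) {
  if (!MI.IsLoad)
    return false;
  for (int U : MI.Uses)
    if (Defs.count(U))
      return true;
  return false;
}

// MI may join the bundle headed by Head only if it is the same kind of memory
// access (load vs. store) of the same memory class and not a dependent load.
static bool canAppend(const ToyInst &Head, const ToyInst &MI,
                      const std::set<int> &Defs) {
  return MI.MemClass != 0 && MI.MemClass == Head.MemClass &&
         MI.IsLoad == Head.IsLoad && MI.IsStore == Head.IsStore &&
         !isDependentLoad(MI, Defs);
}

int main() {
  ToyInst LoadA{true, false, /*class*/ 1, {10}, {1}};
  ToyInst LoadB{true, false, /*class*/ 1, {11}, {1}};
  ToyInst LoadC{true, false, /*class*/ 1, {12}, {10}}; // reads %10, dependent
  std::set<int> FirstDefs = {10};
  std::set<int> BothDefs = {10, 11};
  assert(canAppend(LoadA, LoadB, FirstDefs));
  assert(!canAppend(LoadA, LoadC, BothDefs));
  return 0;
}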
+ +INITIALIZE_PASS(SIPostRABundler, DEBUG_TYPE, "SI post-RA bundler", false, false) + +char SIPostRABundler::ID = 0; + +char &llvm::SIPostRABundlerID = SIPostRABundler::ID; + +FunctionPass *llvm::createSIPostRABundlerPass() { + return new SIPostRABundler(); +} + +bool SIPostRABundler::isDependentLoad(const MachineInstr &MI) const { + if (!MI.mayLoad()) + return false; + + for (const MachineOperand &Op : MI.explicit_operands()) { + if (!Op.isReg()) + continue; + Register Reg = Op.getReg(); + for (Register Def : Defs) + if (TRI->regsOverlap(Reg, Def)) + return true; + } + + return false; +} + +bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); + bool Changed = false; + const uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF | + SIInstrFlags::SMRD | SIInstrFlags::DS | + SIInstrFlags::FLAT | SIInstrFlags::MIMG; + + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::instr_iterator Next; + MachineBasicBlock::instr_iterator B = MBB.instr_begin(); + MachineBasicBlock::instr_iterator E = MBB.instr_end(); + for (auto I = B; I != E; I = Next) { + Next = std::next(I); + + const uint64_t IMemFlags = I->getDesc().TSFlags & MemFlags; + + if (IMemFlags == 0 || I->isBundled() || !I->mayLoadOrStore() || + B->mayLoad() != I->mayLoad() || B->mayStore() != I->mayStore() || + ((B->getDesc().TSFlags & MemFlags) != IMemFlags) || + isDependentLoad(*I)) { + + if (B != I) { + if (std::next(B) != I) { + finalizeBundle(MBB, B, I); + Changed = true; + } + Next = I; + } + + B = Next; + Defs.clear(); + continue; + } + + if (I->getNumExplicitDefs() == 0) + continue; + + Defs.insert(I->defs().begin()->getReg()); + } + + if (B != E && std::next(B) != E) { + finalizeBundle(MBB, B, E); + Changed = true; + } + + Defs.clear(); + } + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp new file mode 100644 index 0000000000000..f31c722db1b26 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -0,0 +1,326 @@ +//===-- SIPreEmitPeephole.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass performs the peephole optimizations before code emission. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-pre-emit-peephole" + +namespace { + +class SIPreEmitPeephole : public MachineFunctionPass { +private: + const SIInstrInfo *TII = nullptr; + const SIRegisterInfo *TRI = nullptr; + + bool optimizeVccBranch(MachineInstr &MI) const; + bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const; + +public: + static char ID; + + SIPreEmitPeephole() : MachineFunctionPass(ID) { + initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +} // End anonymous namespace. 
+ +INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE, + "SI peephole optimizations", false, false) + +char SIPreEmitPeephole::ID = 0; + +char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID; + +bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const { + // Match: + // sreg = -1 or 0 + // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg + // S_CBRANCH_VCC[N]Z + // => + // S_CBRANCH_EXEC[N]Z + // We end up with this pattern sometimes after basic block placement. + // It happens while combining a block which assigns -1 or 0 to a saved mask + // and another block which consumes that saved mask and then a branch. + bool Changed = false; + MachineBasicBlock &MBB = *MI.getParent(); + const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>(); + const bool IsWave32 = ST.isWave32(); + const unsigned CondReg = TRI->getVCC(); + const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64; + + MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(), + E = MBB.rend(); + bool ReadsCond = false; + unsigned Threshold = 5; + for (++A; A != E; ++A) { + if (!--Threshold) + return false; + if (A->modifiesRegister(ExecReg, TRI)) + return false; + if (A->modifiesRegister(CondReg, TRI)) { + if (!A->definesRegister(CondReg, TRI) || + (A->getOpcode() != And && A->getOpcode() != AndN2)) + return false; + break; + } + ReadsCond |= A->readsRegister(CondReg, TRI); + } + if (A == E) + return false; + + MachineOperand &Op1 = A->getOperand(1); + MachineOperand &Op2 = A->getOperand(2); + if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) { + TII->commuteInstruction(*A); + Changed = true; + } + if (Op1.getReg() != ExecReg) + return Changed; + if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0)) + return Changed; + + int64_t MaskValue = 0; + Register SReg; + if (Op2.isReg()) { + SReg = Op2.getReg(); + auto M = std::next(A); + bool ReadsSreg = false; + for (; M != E; ++M) { + if (M->definesRegister(SReg, TRI)) + break; + if (M->modifiesRegister(SReg, TRI)) + return Changed; + ReadsSreg |= M->readsRegister(SReg, TRI); + } + if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() || + (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0)) + return Changed; + MaskValue = M->getOperand(1).getImm(); + // First if sreg is only used in the AND instruction fold the immediate + // into into the AND. 
+ if (!ReadsSreg && Op2.isKill()) { + A->getOperand(2).ChangeToImmediate(MaskValue); + M->eraseFromParent(); + } + } else if (Op2.isImm()) { + MaskValue = Op2.getImm(); + } else { + llvm_unreachable("Op2 must be register or immediate"); + } + + // Invert mask for s_andn2 + assert(MaskValue == 0 || MaskValue == -1); + if (A->getOpcode() == AndN2) + MaskValue = ~MaskValue; + + if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) && + MI.killsRegister(CondReg, TRI)) + A->eraseFromParent(); + + bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ; + if (SReg == ExecReg) { + // EXEC is updated directly + if (IsVCCZ) { + MI.eraseFromParent(); + return true; + } + MI.setDesc(TII->get(AMDGPU::S_BRANCH)); + } else if (IsVCCZ && MaskValue == 0) { + // Will always branch + // Remove all succesors shadowed by new unconditional branch + MachineBasicBlock *Parent = MI.getParent(); + SmallVector<MachineInstr *, 4> ToRemove; + bool Found = false; + for (MachineInstr &Term : Parent->terminators()) { + if (Found) { + if (Term.isBranch()) + ToRemove.push_back(&Term); + } else { + Found = Term.isIdenticalTo(MI); + } + } + assert(Found && "conditional branch is not terminator"); + for (auto BranchMI : ToRemove) { + MachineOperand &Dst = BranchMI->getOperand(0); + assert(Dst.isMBB() && "destination is not basic block"); + Parent->removeSuccessor(Dst.getMBB()); + BranchMI->eraseFromParent(); + } + + if (MachineBasicBlock *Succ = Parent->getFallThrough()) { + Parent->removeSuccessor(Succ); + } + + // Rewrite to unconditional branch + MI.setDesc(TII->get(AMDGPU::S_BRANCH)); + } else if (!IsVCCZ && MaskValue == 0) { + // Will never branch + MachineOperand &Dst = MI.getOperand(0); + assert(Dst.isMBB() && "destination is not basic block"); + MI.getParent()->removeSuccessor(Dst.getMBB()); + MI.eraseFromParent(); + return true; + } else if (MaskValue == -1) { + // Depends only on EXEC + MI.setDesc( + TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ)); + } + + MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI)); + MI.addImplicitDefUseOperands(*MBB.getParent()); + + return true; +} + +bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, + MachineInstr &MI) const { + MachineBasicBlock &MBB = *MI.getParent(); + const MachineFunction &MF = *MBB.getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + Register IdxReg = Idx->isReg() ? Idx->getReg() : Register(); + SmallVector<MachineInstr *, 4> ToRemove; + bool IdxOn = true; + + if (!MI.isIdenticalTo(First)) + return false; + + // Scan back to find an identical S_SET_GPR_IDX_ON + for (MachineBasicBlock::iterator I = std::next(First.getIterator()), + E = MI.getIterator(); I != E; ++I) { + switch (I->getOpcode()) { + case AMDGPU::S_SET_GPR_IDX_MODE: + return false; + case AMDGPU::S_SET_GPR_IDX_OFF: + IdxOn = false; + ToRemove.push_back(&*I); + break; + default: + if (I->modifiesRegister(AMDGPU::M0, TRI)) + return false; + if (IdxReg && I->modifiesRegister(IdxReg, TRI)) + return false; + if (llvm::any_of(I->operands(), + [&MRI, this](const MachineOperand &MO) { + return MO.isReg() && + TRI->isVectorRegister(MRI, MO.getReg()); + })) { + // The only exception allowed here is another indirect vector move + // with the same mode. 
+ if (!IdxOn || + !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 && + I->hasRegisterImplicitUseOperand(AMDGPU::M0)) || + I->getOpcode() == AMDGPU::V_MOV_B32_indirect)) + return false; + } + } + } + + MI.eraseFromParent(); + for (MachineInstr *RI : ToRemove) + RI->eraseFromParent(); + return true; +} + +bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MachineBasicBlock *EmptyMBBAtEnd = nullptr; + bool Changed = false; + + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator(); + if (MBBE != MBB.end()) { + MachineInstr &MI = *MBBE; + switch (MI.getOpcode()) { + case AMDGPU::S_CBRANCH_VCCZ: + case AMDGPU::S_CBRANCH_VCCNZ: + Changed |= optimizeVccBranch(MI); + continue; + case AMDGPU::SI_RETURN_TO_EPILOG: + // FIXME: This is not an optimization and should be + // moved somewhere else. + assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid()); + + // Graphics shaders returning non-void shouldn't contain S_ENDPGM, + // because external bytecode will be appended at the end. + if (&MBB != &MF.back() || &MI != &MBB.back()) { + // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block + // at the end and jump there. + if (!EmptyMBBAtEnd) { + EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); + MF.insert(MF.end(), EmptyMBBAtEnd); + } + + MBB.addSuccessor(EmptyMBBAtEnd); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(EmptyMBBAtEnd); + MI.eraseFromParent(); + MBBE = MBB.getFirstTerminator(); + } + break; + default: + break; + } + } + + if (!ST.hasVGPRIndexMode()) + continue; + + MachineInstr *SetGPRMI = nullptr; + const unsigned Threshold = 20; + unsigned Count = 0; + // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a + // second is not needed. Do expensive checks in the optimizeSetGPR() + // and limit the distance to 20 instructions for compile time purposes. 
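The comment above describes a bounded sliding-window scan; the following standalone sketch shows its generic shape. ToyOp, the threshold handling and the TryToFold callback are illustrative stand-ins rather than the pass's real interfaces:

#include <cstddef>
#include <functional>
#include <vector>

// Remember the previous interesting instruction and forget it once Threshold
// other instructions have gone by, so each pair check stays local and cheap.
enum class ToyOp { SetGprIdxOn, Other };

static unsigned scanForRedundantPairs(
    const std::vector<ToyOp> &Block, unsigned Threshold,
    const std::function<bool(size_t First, size_t Second)> &TryToFold) {
  unsigned NumFolded = 0;
  bool HaveFirst = false;
  size_t First = 0;
  unsigned Count = 0; // instructions seen since First
  for (size_t I = 0; I < Block.size(); ++I) {
    if (Count == Threshold)
      HaveFirst = false; // candidate aged out of the window
    else
      ++Count;
    if (Block[I] != ToyOp::SetGprIdxOn)
      continue;
    Count = 0;
    if (!HaveFirst) {
      HaveFirst = true;
      First = I;
      continue;
    }
    if (TryToFold(First, I))
      ++NumFolded; // the second setter was redundant; First stays the anchor
    else
      First = I;   // otherwise the newer setter becomes the candidate
  }
  return NumFolded;
}

int main() {
  std::vector<ToyOp> MBB = {ToyOp::SetGprIdxOn, ToyOp::Other, ToyOp::SetGprIdxOn};
  unsigned N = scanForRedundantPairs(MBB, 20, [](size_t, size_t) { return true; });
  return N == 1 ? 0 : 1;
}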
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) { + MachineInstr &MI = *MBBI; + ++MBBI; + + if (Count == Threshold) + SetGPRMI = nullptr; + else + ++Count; + + if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON) + continue; + + Count = 0; + if (!SetGPRMI) { + SetGPRMI = &MI; + continue; + } + + if (optimizeSetGPR(*SetGPRMI, MI)) + Changed = true; + else + SetGPRMI = &MI; + } + } + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index fbadad3c84ad8..5d6009ebf3843 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -26,27 +26,12 @@ #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" +#include <vector> using namespace llvm; -static bool hasPressureSet(const int *PSets, unsigned PSetID) { - for (unsigned i = 0; PSets[i] != -1; ++i) { - if (PSets[i] == (int)PSetID) - return true; - } - return false; -} - -void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg, - BitVector &PressureSets) const { - for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) { - const int *PSets = getRegUnitPressureSets(*U); - if (hasPressureSet(PSets, PSetID)) { - PressureSets.set(PSetID); - break; - } - } -} +#define GET_REGINFO_TARGET_DESC +#include "AMDGPUGenRegisterInfo.inc" static cl::opt<bool> EnableSpillSGPRToVGPR( "amdgpu-spill-sgpr-to-vgpr", @@ -54,90 +39,200 @@ static cl::opt<bool> EnableSpillSGPRToVGPR( cl::ReallyHidden, cl::init(true)); -SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : - AMDGPURegisterInfo(), - ST(ST), - SGPRPressureSets(getNumRegPressureSets()), - VGPRPressureSets(getNumRegPressureSets()), - AGPRPressureSets(getNumRegPressureSets()), - SpillSGPRToVGPR(EnableSpillSGPRToVGPR), - isWave32(ST.isWave32()) { - unsigned NumRegPressureSets = getNumRegPressureSets(); - - SGPRSetID = NumRegPressureSets; - VGPRSetID = NumRegPressureSets; - AGPRSetID = NumRegPressureSets; - - for (unsigned i = 0; i < NumRegPressureSets; ++i) { - classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets); - classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets); - classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets); - } - - // Determine the number of reg units for each pressure set. - std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0); - for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) { - const int *PSets = getRegUnitPressureSets(i); - for (unsigned j = 0; PSets[j] != -1; ++j) { - ++PressureSetRegUnits[PSets[j]]; +std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts; + +SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) + : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST), + SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) { + + assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 && + getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) && + (getSubRegIndexLaneMask(AMDGPU::lo16) | + getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() == + getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() && + "getNumCoveredRegs() will not work with generated subreg masks!"); + + RegPressureIgnoredUnits.resize(getNumRegUnits()); + RegPressureIgnoredUnits.set(*MCRegUnitIterator(AMDGPU::M0, this)); + for (auto Reg : AMDGPU::VGPR_HI16RegClass) + RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this)); + + // HACK: Until this is fully tablegen'd. 
+ static llvm::once_flag InitializeRegSplitPartsFlag; + + static auto InitializeRegSplitPartsOnce = [this]() { + for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) { + unsigned Size = getSubRegIdxSize(Idx); + if (Size & 31) + continue; + std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1]; + unsigned Pos = getSubRegIdxOffset(Idx); + if (Pos % Size) + continue; + Pos /= Size; + if (Vec.empty()) { + unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits. + Vec.resize(MaxNumParts); + } + Vec[Pos] = Idx; } + }; + + + llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce); +} + +void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, + MCRegister Reg) const { + MCRegAliasIterator R(Reg, this, true); + + for (; R.isValid(); ++R) + Reserved.set(*R); +} + +// Forced to be here by one .inc +const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( + const MachineFunction *MF) const { + CallingConv::ID CC = MF->getFunction().getCallingConv(); + switch (CC) { + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::Cold: + return CSR_AMDGPU_HighRegs_SaveList; + default: { + // Dummy to not crash RegisterClassInfo. + static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; + return &NoCalleeSavedReg; } + } +} - unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0; - for (unsigned i = 0; i < NumRegPressureSets; ++i) { - if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) { - VGPRSetID = i; - VGPRMax = PressureSetRegUnits[i]; - continue; - } - if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) { - SGPRSetID = i; - SGPRMax = PressureSetRegUnits[i]; - } - if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) { - AGPRSetID = i; - AGPRMax = PressureSetRegUnits[i]; - continue; - } +const MCPhysReg * +SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { + return nullptr; +} + +const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + switch (CC) { + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::Cold: + return CSR_AMDGPU_HighRegs_RegMask; + default: + return nullptr; } +} - assert(SGPRSetID < NumRegPressureSets && - VGPRSetID < NumRegPressureSets && - AGPRSetID < NumRegPressureSets); +Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const SIFrameLowering *TFI = + MF.getSubtarget<GCNSubtarget>().getFrameLowering(); + const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + // During ISel lowering we always reserve the stack pointer in entry + // functions, but never actually want to reference it when accessing our own + // frame. If we need a frame pointer we use it, but otherwise we can just use + // an immediate "0" which we represent by returning NoRegister. + if (FuncInfo->isEntryFunction()) { + return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register(); + } + return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() + : FuncInfo->getStackPtrOffsetReg(); } -unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( - const MachineFunction &MF) const { - unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; - unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); - return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass); +bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { + // When we need stack realignment, we can't reference off of the + // stack pointer, so we reserve a base pointer. 
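A standalone numeric illustration of why the base pointer is needed (made-up addresses and alignment, nothing AMDGPU-specific): once the stack pointer has been realigned by a runtime-dependent amount, a fixed incoming object no longer sits at a constant offset from it, but it does from a register captured before realignment.

#include <cassert>
#include <cstdint>
#include <set>

int main() {
  const uint64_t Align = 256;
  const uint64_t IncomingSPs[] = {100000, 100032, 100160};
  std::set<uint64_t> DistancesFromSP;
  for (uint64_t IncomingSP : IncomingSPs) {
    uint64_t BP = IncomingSP;                // captured before realignment
    uint64_t SP = IncomingSP & ~(Align - 1); // realigned stack pointer
    uint64_t FixedObj = IncomingSP + 16;     // an incoming (fixed) object
    assert(FixedObj - BP == 16);             // constant offset from BP
    DistancesFromSP.insert(FixedObj - SP);   // varies with the runtime SP
  }
  assert(DistancesFromSP.size() > 1); // no single SP-relative offset works
  return 0;
}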
+ const MachineFrameInfo &MFI = MF.getFrameInfo(); + return MFI.getNumFixedObjects() && needsStackRealignment(MF); } -static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { - unsigned Reg; +Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } - // Try to place it in a hole after PrivateSegmentBufferReg. - if (RegCount & 3) { - // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to - // alignment constraints, so we have a hole where can put the wave offset. - Reg = RegCount - 1; - } else { - // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the - // wave offset before it. - Reg = RegCount - 5; - } +const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { + return CSR_AMDGPU_AllVGPRs_RegMask; +} - return Reg; +const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { + return CSR_AMDGPU_AllAllocatableSRegs_RegMask; } -unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( +// FIXME: TableGen should generate something to make this manageable for all +// register classes. At a minimum we could use the opposite of +// composeSubRegIndices and go up from the base 32-bit subreg. +unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel, + unsigned NumRegs) { + // Table of NumRegs sized pieces at every 32-bit offset. + static const uint16_t SubRegFromChannelTable[][32] = { + {AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, + AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, + AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, + AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31}, + {AMDGPU::sub0_sub1, AMDGPU::sub1_sub2, AMDGPU::sub2_sub3, + AMDGPU::sub3_sub4, AMDGPU::sub4_sub5, AMDGPU::sub5_sub6, + AMDGPU::sub6_sub7, AMDGPU::sub7_sub8, AMDGPU::sub8_sub9, + AMDGPU::sub9_sub10, AMDGPU::sub10_sub11, AMDGPU::sub11_sub12, + AMDGPU::sub12_sub13, AMDGPU::sub13_sub14, AMDGPU::sub14_sub15, + AMDGPU::sub15_sub16, AMDGPU::sub16_sub17, AMDGPU::sub17_sub18, + AMDGPU::sub18_sub19, AMDGPU::sub19_sub20, AMDGPU::sub20_sub21, + AMDGPU::sub21_sub22, AMDGPU::sub22_sub23, AMDGPU::sub23_sub24, + AMDGPU::sub24_sub25, AMDGPU::sub25_sub26, AMDGPU::sub26_sub27, + AMDGPU::sub27_sub28, AMDGPU::sub28_sub29, AMDGPU::sub29_sub30, + AMDGPU::sub30_sub31, AMDGPU::NoSubRegister}, + {AMDGPU::sub0_sub1_sub2, AMDGPU::sub1_sub2_sub3, + AMDGPU::sub2_sub3_sub4, AMDGPU::sub3_sub4_sub5, + AMDGPU::sub4_sub5_sub6, AMDGPU::sub5_sub6_sub7, + AMDGPU::sub6_sub7_sub8, AMDGPU::sub7_sub8_sub9, + AMDGPU::sub8_sub9_sub10, AMDGPU::sub9_sub10_sub11, + AMDGPU::sub10_sub11_sub12, AMDGPU::sub11_sub12_sub13, + AMDGPU::sub12_sub13_sub14, AMDGPU::sub13_sub14_sub15, + AMDGPU::sub14_sub15_sub16, AMDGPU::sub15_sub16_sub17, + AMDGPU::sub16_sub17_sub18, AMDGPU::sub17_sub18_sub19, + AMDGPU::sub18_sub19_sub20, AMDGPU::sub19_sub20_sub21, + AMDGPU::sub20_sub21_sub22, AMDGPU::sub21_sub22_sub23, + AMDGPU::sub22_sub23_sub24, AMDGPU::sub23_sub24_sub25, + AMDGPU::sub24_sub25_sub26, AMDGPU::sub25_sub26_sub27, + AMDGPU::sub26_sub27_sub28, AMDGPU::sub27_sub28_sub29, + AMDGPU::sub28_sub29_sub30, AMDGPU::sub29_sub30_sub31, + AMDGPU::NoSubRegister, AMDGPU::NoSubRegister}, + {AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4, + AMDGPU::sub2_sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6, + 
AMDGPU::sub4_sub5_sub6_sub7, AMDGPU::sub5_sub6_sub7_sub8, + AMDGPU::sub6_sub7_sub8_sub9, AMDGPU::sub7_sub8_sub9_sub10, + AMDGPU::sub8_sub9_sub10_sub11, AMDGPU::sub9_sub10_sub11_sub12, + AMDGPU::sub10_sub11_sub12_sub13, AMDGPU::sub11_sub12_sub13_sub14, + AMDGPU::sub12_sub13_sub14_sub15, AMDGPU::sub13_sub14_sub15_sub16, + AMDGPU::sub14_sub15_sub16_sub17, AMDGPU::sub15_sub16_sub17_sub18, + AMDGPU::sub16_sub17_sub18_sub19, AMDGPU::sub17_sub18_sub19_sub20, + AMDGPU::sub18_sub19_sub20_sub21, AMDGPU::sub19_sub20_sub21_sub22, + AMDGPU::sub20_sub21_sub22_sub23, AMDGPU::sub21_sub22_sub23_sub24, + AMDGPU::sub22_sub23_sub24_sub25, AMDGPU::sub23_sub24_sub25_sub26, + AMDGPU::sub24_sub25_sub26_sub27, AMDGPU::sub25_sub26_sub27_sub28, + AMDGPU::sub26_sub27_sub28_sub29, AMDGPU::sub27_sub28_sub29_sub30, + AMDGPU::sub28_sub29_sub30_sub31, AMDGPU::NoSubRegister, + AMDGPU::NoSubRegister, AMDGPU::NoSubRegister}}; + + const unsigned NumRegIndex = NumRegs - 1; + + assert(NumRegIndex < array_lengthof(SubRegFromChannelTable) && + "Not implemented"); + assert(Channel < array_lengthof(SubRegFromChannelTable[0])); + return SubRegFromChannelTable[NumRegIndex][Channel]; +} + +MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { - unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); - return AMDGPU::SGPR_32RegClass.getRegister(Reg); + unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; + MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass); } BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); + Reserved.set(AMDGPU::MODE); // EXEC_LO and EXEC_HI could be allocated and used as regular register, but // this seems likely to result in bugs, so I'm marking them as reserved. @@ -205,6 +300,18 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, Reg); } + for (auto Reg : AMDGPU::SReg_32RegClass) { + Reserved.set(getSubReg(Reg, AMDGPU::hi16)); + Register Low = getSubReg(Reg, AMDGPU::lo16); + // This is to prevent BB vcc liveness errors. + if (!AMDGPU::SGPR_LO16RegClass.contains(Low)) + Reserved.set(Low); + } + + for (auto Reg : AMDGPU::AGPR_32RegClass) { + Reserved.set(getSubReg(Reg, AMDGPU::hi16)); + } + // Reserve all the rest AGPRs if there are no instructions to use it. if (!ST.hasMAIInsts()) { for (unsigned i = 0; i < MaxNumVGPRs; ++i) { @@ -215,38 +322,37 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); - if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { - // Reserve 1 SGPR for scratch wave offset in case we need to spill. - reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); - } - unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); if (ScratchRSrcReg != AMDGPU::NoRegister) { // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need // to spill. // TODO: May need to reserve a VGPR if doing LDS spilling. reserveRegisterTuples(Reserved, ScratchRSrcReg); - assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } // We have to assume the SP is needed in case there are calls in the function, // which is detected after the function is lowered. If we aren't really going // to need SP, don't bother reserving it. 
- unsigned StackPtrReg = MFI->getStackPtrOffsetReg(); + MCRegister StackPtrReg = MFI->getStackPtrOffsetReg(); - if (StackPtrReg != AMDGPU::NoRegister) { + if (StackPtrReg) { reserveRegisterTuples(Reserved, StackPtrReg); assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); } - unsigned FrameReg = MFI->getFrameOffsetReg(); - if (FrameReg != AMDGPU::NoRegister) { + MCRegister FrameReg = MFI->getFrameOffsetReg(); + if (FrameReg) { reserveRegisterTuples(Reserved, FrameReg); assert(!isSubRegister(ScratchRSrcReg, FrameReg)); } - for (unsigned Reg : MFI->WWMReservedRegs) { + if (hasBasePointer(MF)) { + MCRegister BasePtrReg = getBaseRegister(); + reserveRegisterTuples(Reserved, BasePtrReg); + assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); + } + + for (MCRegister Reg : MFI->WWMReservedRegs) { reserveRegisterTuples(Reserved, Reg); } @@ -257,6 +363,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) reserveRegisterTuples(Reserved, Reg); + if (MFI->VGPRReservedForSGPRSpill) + for (auto SSpill : MFI->getSGPRSpillVGPRs()) + reserveRegisterTuples(Reserved, SSpill.VGPR); + return Reserved; } @@ -305,11 +415,6 @@ bool SIRegisterInfo::requiresVirtualBaseRegisters( return true; } -bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { - // This helps catch bugs as verifier errors. - return true; -} - int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const { assert(SIInstrInfo::isMUBUF(*MI)); @@ -340,7 +445,7 @@ bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { } void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, - unsigned BaseReg, + Register BaseReg, int FrameIdx, int64_t Offset) const { MachineBasicBlock::iterator Ins = MBB->begin(); @@ -374,7 +479,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, .addImm(0); // clamp bit } -void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, +void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const { const SIInstrInfo *TII = ST.getInstrInfo(); @@ -411,7 +516,7 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, } bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, - unsigned BaseReg, + Register BaseReg, int64_t Offset) const { if (!SIInstrInfo::isMUBUF(*MI)) return false; @@ -451,6 +556,11 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V256_SAVE: case AMDGPU::SI_SPILL_V256_RESTORE: return 8; + case AMDGPU::SI_SPILL_S192_SAVE: + case AMDGPU::SI_SPILL_S192_RESTORE: + case AMDGPU::SI_SPILL_V192_SAVE: + case AMDGPU::SI_SPILL_V192_RESTORE: + return 6; case AMDGPU::SI_SPILL_S160_SAVE: case AMDGPU::SI_SPILL_S160_RESTORE: case AMDGPU::SI_SPILL_V160_SAVE: @@ -614,10 +724,10 @@ static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, int Index, - unsigned ValueReg, + Register ValueReg, bool IsKill, - unsigned ScratchRsrcReg, - unsigned ScratchOffsetReg, + MCRegister ScratchRsrcReg, + MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO, RegScavenger *RS) const { @@ -625,13 +735,14 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, MachineFunction *MF = MI->getParent()->getParent(); const SIInstrInfo *TII = ST.getInstrInfo(); const MachineFrameInfo &MFI = MF->getFrameInfo(); + const 
SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); const MCInstrDesc &Desc = TII->get(LoadStoreOp); const DebugLoc &DL = MI->getDebugLoc(); bool IsStore = Desc.mayStore(); bool Scavenged = false; - unsigned SOffset = ScratchOffsetReg; + MCRegister SOffset = ScratchOffsetReg; const unsigned EltSize = 4; const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); @@ -640,7 +751,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, int64_t Offset = InstOffset + MFI.getObjectOffset(Index); int64_t ScratchOffsetRegDelta = 0; - unsigned Align = MFI.getObjectAlignment(Index); + Align Alignment = MFI.getObjectAlign(Index); const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); Register TmpReg = @@ -650,7 +761,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset"); if (!isUInt<12>(Offset + Size - EltSize)) { - SOffset = AMDGPU::NoRegister; + SOffset = MCRegister(); // We currently only support spilling VGPRs to EltSize boundaries, meaning // we can simplify the adjustment of Offset here to just scale with @@ -662,23 +773,33 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, if (RS) SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); - if (SOffset == AMDGPU::NoRegister) { + if (!SOffset) { // There are no free SGPRs, and since we are in the process of spilling // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true // on SI/CI and on VI it is true until we implement spilling using scalar // stores), we have no way to free up an SGPR. Our solution here is to - // add the offset directly to the ScratchOffset register, and then - // subtract the offset after the spill to return ScratchOffset to it's - // original value. + // add the offset directly to the ScratchOffset or StackPtrOffset + // register, and then subtract the offset after the spill to return the + // register to it's original value. 
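The fallback described in the comment above, implemented below with S_ADD_U32 and undone afterwards with S_SUB_U32, can be illustrated with a tiny standalone sketch; ToyAccess and the constants are made up:

#include <cassert>
#include <cstdint>

// When no register is free to hold "base + offset", temporarily fold the
// offset into the base register itself and undo it after the access, leaving
// the register unchanged overall.
struct ToyAccess { uint64_t Address; };

static ToyAccess accessWithFoldedOffset(uint64_t &BaseReg, uint64_t Offset) {
  BaseReg += Offset;    // s_add_u32 base, base, offset
  ToyAccess A{BaseReg}; // the memory access uses the adjusted base
  BaseReg -= Offset;    // s_sub_u32 base, base, offset
  return A;
}

int main() {
  uint64_t ScratchOffset = 0x1000;
  ToyAccess A = accessWithFoldedOffset(ScratchOffset, 0x2000);
  assert(A.Address == 0x3000);
  assert(ScratchOffset == 0x1000); // register restored to its original value
  return 0;
}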
+ if (!ScratchOffsetReg) + ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg(); SOffset = ScratchOffsetReg; ScratchOffsetRegDelta = Offset; } else { Scavenged = true; } - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) - .addReg(ScratchOffsetReg) - .addImm(Offset); + if (!SOffset) + report_fatal_error("could not scavenge SGPR to spill in entry function"); + + if (ScratchOffsetReg == AMDGPU::NoRegister) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset) + .addImm(Offset); + } else { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) + .addReg(ScratchOffsetReg) + .addImm(Offset); + } Offset = 0; } @@ -708,21 +829,26 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, } MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i); - MachineMemOperand *NewMMO - = MF->getMachineMemOperand(PInfo, MMO->getFlags(), - EltSize, MinAlign(Align, EltSize * i)); + MachineMemOperand *NewMMO = + MF->getMachineMemOperand(PInfo, MMO->getFlags(), EltSize, + commonAlignment(Alignment, EltSize * i)); MIB = BuildMI(*MBB, MI, DL, Desc) - .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)) - .addReg(ScratchRsrcReg) - .addReg(SOffset, SOffsetRegState) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addImm(0) // swz - .addMemOperand(NewMMO); + .addReg(SubReg, + getDefRegState(!IsStore) | getKillRegState(IsKill)) + .addReg(ScratchRsrcReg); + if (SOffset == AMDGPU::NoRegister) { + MIB.addImm(0); + } else { + MIB.addReg(SOffset, SOffsetRegState); + } + MIB.addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addImm(0) // swz + .addMemOperand(NewMMO); if (!IsStore && TmpReg != AMDGPU::NoRegister) MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), @@ -736,12 +862,124 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, if (ScratchOffsetRegDelta != 0) { // Subtract the offset we added to the ScratchOffset register. - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg) - .addReg(ScratchOffsetReg) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), SOffset) + .addReg(SOffset) .addImm(ScratchOffsetRegDelta); } } +// Generate a VMEM access which loads or stores the VGPR containing an SGPR +// spill such that all the lanes set in VGPRLanes are loaded or stored. +// This generates exec mask manipulation and will use SGPRs available in MI +// or VGPR lanes in the VGPR to save and restore the exec mask. +void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, + int Index, int Offset, + unsigned EltSize, Register VGPR, + int64_t VGPRLanes, + RegScavenger *RS, + bool IsLoad) const { + MachineBasicBlock *MBB = MI->getParent(); + MachineFunction *MF = MBB->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + Register SuperReg = MI->getOperand(0).getReg(); + const TargetRegisterClass *RC = getPhysRegClass(SuperReg); + ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); + unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); + unsigned FirstPart = Offset * 32; + unsigned ExecLane = 0; + + bool IsKill = MI->getOperand(0).isKill(); + const DebugLoc &DL = MI->getDebugLoc(); + + // Cannot handle load/store to EXEC + assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && + SuperReg != AMDGPU::EXEC && "exec should never spill"); + + // On Wave32 only handle EXEC_LO. 
+ // On Wave64 only update EXEC_HI if there is sufficent space for a copy. + bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI; + + unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + Register SavedExecReg; + + // Backup EXEC + if (OnlyExecLo) { + SavedExecReg = NumSubRegs == 1 + ? SuperReg + : getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]); + } else { + // If src/dst is an odd size it is possible subreg0 is not aligned. + for (; ExecLane < (NumSubRegs - 1); ++ExecLane) { + SavedExecReg = getMatchingSuperReg( + getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0, + &AMDGPU::SReg_64_XEXECRegClass); + if (SavedExecReg) + break; + } + } + assert(SavedExecReg); + BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg); + + // Setup EXEC + BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes); + + // Load/store VGPR + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill); + + Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) + ? getBaseRegister() + : getFrameRegister(*MF); + + Align Alignment = FrameInfo.getObjectAlign(Index); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(*MF, Index); + MachineMemOperand *MMO = MF->getMachineMemOperand( + PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore, + EltSize, Alignment); + + if (IsLoad) { + buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + Index, + VGPR, false, + MFI->getScratchRSrcReg(), FrameReg, + Offset * EltSize, MMO, + RS); + } else { + buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, VGPR, + IsKill, MFI->getScratchRSrcReg(), FrameReg, + Offset * EltSize, MMO, RS); + // This only ever adds one VGPR spill + MFI->addToSpilledVGPRs(1); + } + + // Restore EXEC + BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg) + .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill)); + + // Restore clobbered SGPRs + if (IsLoad) { + // Nothing to do; register will be overwritten + } else if (!IsKill) { + // Restore SGPRs from appropriate VGPR lanes + if (!OnlyExecLo) { + BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1])) + .addReg(VGPR) + .addImm(ExecLane + 1); + } + BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + NumSubRegs == 1 + ? 
SavedExecReg + : getSubReg(SuperReg, SplitParts[FirstPart + ExecLane])) + .addReg(VGPR, RegState::Kill) + .addImm(ExecLane); + } +} + bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, @@ -749,7 +987,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MBB->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - DenseSet<unsigned> SGPRSpillVGPRDefinedSet; + DenseSet<Register> SGPRSpillVGPRDefinedSet; ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = MFI->getSGPRToVGPRSpills(Index); @@ -763,13 +1001,12 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, bool IsKill = MI->getOperand(0).isKill(); const DebugLoc &DL = MI->getDebugLoc(); - MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && - SuperReg != MFI->getFrameOffsetReg() && - SuperReg != MFI->getScratchWaveOffsetReg())); + SuperReg != MFI->getFrameOffsetReg())); assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); + assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && + SuperReg != AMDGPU::EXEC && "exec should never spill"); unsigned EltSize = 4; const TargetRegisterClass *RC = getPhysRegClass(SuperReg); @@ -777,17 +1014,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); - // Scavenged temporary VGPR to use. It must be scavenged once for any number - // of spilled subregs. - Register TmpVGPR; - - // SubReg carries the "Kill" flag when SubReg == SuperReg. - unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - Register SubReg = - NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); - - if (SpillToVGPR) { + if (SpillToVGPR) { + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + Register SubReg = + NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; // During SGPR spilling to VGPR, determine if the VGPR is defined. The @@ -809,42 +1039,53 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, // FIXME: Since this spills to another register instead of an actual // frame index, we should delete the frame index when all references to // it are fixed. - } else { - // XXX - Can to VGPR spill fail for some subregisters but not others? - if (OnlyToVGPR) - return false; - - // Spill SGPR to a frame index. - if (!TmpVGPR.isValid()) - TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); - - MachineInstrBuilder Mov - = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) - .addReg(SubReg, SubKillState); - - // There could be undef components of a spilled super register. - // TODO: Can we detect this and skip the spill? - if (NumSubRegs > 1) { - // The last implicit use of the SuperReg carries the "Kill" flag. - unsigned SuperKillState = 0; - if (i + 1 == e) - SuperKillState |= getKillRegState(IsKill); - Mov.addReg(SuperReg, RegState::Implicit | SuperKillState); + } + } else { + // Scavenged temporary VGPR to use. It must be scavenged once for any number + // of spilled subregs. + Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); + RS->setRegUsed(TmpVGPR); + + // SubReg carries the "Kill" flag when SubReg == SuperReg. 
+ unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); + + unsigned PerVGPR = 32; + unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR; + int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL; + + for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) { + unsigned TmpVGPRFlags = RegState::Undef; + + // Write sub registers into the VGPR + for (unsigned i = Offset * PerVGPR, + e = std::min((Offset + 1) * PerVGPR, NumSubRegs); + i < e; ++i) { + Register SubReg = + NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); + + MachineInstrBuilder WriteLane = + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + TmpVGPR) + .addReg(SubReg, SubKillState) + .addImm(i % PerVGPR) + .addReg(TmpVGPR, TmpVGPRFlags); + TmpVGPRFlags = 0; + + // There could be undef components of a spilled super register. + // TODO: Can we detect this and skip the spill? + if (NumSubRegs > 1) { + // The last implicit use of the SuperReg carries the "Kill" flag. + unsigned SuperKillState = 0; + if (i + 1 == NumSubRegs) + SuperKillState |= getKillRegState(IsKill); + WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState); + } } - unsigned Align = FrameInfo.getObjectAlignment(Index); - MachinePointerInfo PtrInfo - = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); - MachineMemOperand *MMO - = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, - EltSize, MinAlign(Align, EltSize * i)); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) - .addReg(TmpVGPR, RegState::Kill) // src - .addFrameIndex(Index) // vaddr - .addReg(MFI->getScratchRSrcReg()) // srrsrc - .addReg(MFI->getStackPtrOffsetReg()) // soffset - .addImm(i * 4) // offset - .addMemOperand(MMO); + // Write out VGPR + buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes, + RS, false); } } @@ -867,13 +1108,14 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, if (OnlyToVGPR && !SpillToVGPR) return false; - MachineFrameInfo &FrameInfo = MF->getFrameInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); const DebugLoc &DL = MI->getDebugLoc(); Register SuperReg = MI->getOperand(0).getReg(); assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); + assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI && + SuperReg != AMDGPU::EXEC && "exec should never spill"); unsigned EltSize = 4; @@ -882,52 +1124,49 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); - Register TmpVGPR; - - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - Register SubReg = - NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); + if (SpillToVGPR) { + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + Register SubReg = + NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); - if (SpillToVGPR) { SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; auto MIB = BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg) .addReg(Spill.VGPR) .addImm(Spill.Lane); - if (NumSubRegs > 1 && i == 0) MIB.addReg(SuperReg, RegState::ImplicitDefine); - } else { - if (OnlyToVGPR) - return false; - - // Restore SGPR from a stack slot. - // FIXME: We should use S_LOAD_DWORD here for VI. 
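As a rough summary of the arithmetic used in the write-lane loop above: sub-register i of the spilled SGPR tuple lands in lane (i % 32) of temporary VGPR number (i / 32), and VGPRLanes is the lane mask that the exec setup in buildSGPRSpillLoadStore switches to before the buffer access. A standalone sketch with an example 512-bit (16 dword) tuple, purely illustrative:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  const unsigned PerVGPR = 32;
  unsigned NumSubRegs = 16; // e.g. a 512-bit SGPR tuple is 16 dwords
  unsigned NumVGPRs = (NumSubRegs + PerVGPR - 1) / PerVGPR;
  int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
  assert(NumVGPRs == 1);
  assert(VGPRLanes == 0xFFFF); // lanes 0..15 hold the spilled dwords
  for (unsigned i = 0; i < NumSubRegs; ++i) {
    unsigned VGPRIndex = i / PerVGPR; // which temporary VGPR
    unsigned Lane = i % PerVGPR;      // v_writelane/v_readlane lane number
    assert(VGPRIndex == 0 && Lane == i);
  }
  return 0;
}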
- if (!TmpVGPR.isValid()) - TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); - unsigned Align = FrameInfo.getObjectAlignment(Index); - - MachinePointerInfo PtrInfo - = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i); - - MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo, - MachineMemOperand::MOLoad, EltSize, - MinAlign(Align, EltSize * i)); - - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR) - .addFrameIndex(Index) // vaddr - .addReg(MFI->getScratchRSrcReg()) // srsrc - .addReg(MFI->getStackPtrOffsetReg()) // soffset - .addImm(i * 4) // offset - .addMemOperand(MMO); - - auto MIB = - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) - .addReg(TmpVGPR, RegState::Kill); - - if (NumSubRegs > 1) - MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + } + } else { + Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); + RS->setRegUsed(TmpVGPR); + + unsigned PerVGPR = 32; + unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR; + int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL; + + for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) { + // Load in VGPR data + buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes, + RS, true); + + // Unpack lanes + for (unsigned i = Offset * PerVGPR, + e = std::min((Offset + 1) * PerVGPR, NumSubRegs); + i < e; ++i) { + Register SubReg = + NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]); + + bool LastSubReg = (i + 1 == e); + auto MIB = + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg) + .addReg(TmpVGPR, getKillRegState(LastSubReg)) + .addImm(i); + if (NumSubRegs > 1 && i == 0) + MIB.addReg(SuperReg, RegState::ImplicitDefine); + } } } @@ -946,6 +1185,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S192_SAVE: case AMDGPU::SI_SPILL_S160_SAVE: case AMDGPU::SI_SPILL_S128_SAVE: case AMDGPU::SI_SPILL_S96_SAVE: @@ -955,6 +1195,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S192_RESTORE: case AMDGPU::SI_SPILL_S160_RESTORE: case AMDGPU::SI_SPILL_S128_RESTORE: case AMDGPU::SI_SPILL_S96_RESTORE: @@ -981,13 +1222,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, MachineOperand &FIOp = MI->getOperand(FIOperandNum); int Index = MI->getOperand(FIOperandNum).getIndex(); - Register FrameReg = getFrameRegister(*MF); + Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF) + ? 
getBaseRegister() + : getFrameRegister(*MF); switch (MI->getOpcode()) { // SGPR register spill case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S192_SAVE: case AMDGPU::SI_SPILL_S160_SAVE: case AMDGPU::SI_SPILL_S128_SAVE: case AMDGPU::SI_SPILL_S96_SAVE: @@ -1001,6 +1245,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S192_RESTORE: case AMDGPU::SI_SPILL_S160_RESTORE: case AMDGPU::SI_SPILL_S128_RESTORE: case AMDGPU::SI_SPILL_S96_RESTORE: @@ -1076,42 +1321,30 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, bool IsMUBUF = TII->isMUBUF(*MI); if (!IsMUBUF && !MFI->isEntryFunction()) { - // Convert to an absolute stack address by finding the offset from the - // scratch wave base and scaling by the wave size. + // Convert to a swizzled stack address by scaling by the wave size. // - // In an entry function/kernel the offset is already the absolute - // address relative to the frame register. - - Register TmpDiffReg = - RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); - - // If there's no free SGPR, in-place modify the FP - Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg; + // In an entry function/kernel the offset is already swizzled. bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; - Register ResultReg = IsCopy ? - MI->getOperand(0).getReg() : - RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); - - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) - .addReg(FrameReg) - .addReg(MFI->getScratchWaveOffsetReg()); + Register ResultReg = + IsCopy ? MI->getOperand(0).getReg() + : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); int64_t Offset = FrameInfo.getObjectOffset(Index); if (Offset == 0) { // XXX - This never happens because of emergency scavenging slot at 0? BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) .addImm(ST.getWavefrontSizeLog2()) - .addReg(DiffReg); + .addReg(FrameReg); } else { if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { - Register ScaledReg = - RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0); + // Reuse ResultReg in intermediate step. + Register ScaledReg = ResultReg; BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) .addImm(ST.getWavefrontSizeLog2()) - .addReg(DiffReg, RegState::Kill); + .addReg(FrameReg); const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; @@ -1148,10 +1381,10 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // unavailable. Only one additional mov is needed. Register TmpScaledReg = RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); - Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg; + Register ScaledReg = TmpScaledReg.isValid() ? 
TmpScaledReg : FrameReg; BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) - .addReg(DiffReg, RegState::Kill) + .addReg(FrameReg) .addImm(ST.getWavefrontSizeLog2()); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg) .addReg(ScaledReg, RegState::Kill) @@ -1165,19 +1398,12 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addReg(ScaledReg, RegState::Kill) .addImm(Offset); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) - .addReg(DiffReg, RegState::Kill) + .addReg(FrameReg) .addImm(ST.getWavefrontSizeLog2()); } } } - if (!TmpDiffReg.isValid()) { - // Restore the FP. - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg) - .addReg(FrameReg) - .addReg(MFI->getScratchWaveOffsetReg()); - } - // Don't introduce an extra copy if we're just materializing in a mov. if (IsCopy) MI->eraseFromParent(); @@ -1192,10 +1418,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr)); - assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == - MFI->getStackPtrOffsetReg()); - - TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg); + auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset); + assert((SOffset.isReg() && + SOffset.getReg() == MFI->getStackPtrOffsetReg()) || + (SOffset.isImm() && SOffset.getImm() == 0)); + if (SOffset.isReg()) { + if (FrameReg == AMDGPU::NoRegister) { + SOffset.ChangeToImmediate(0); + } else { + SOffset.setReg(FrameReg); + } + } int64_t Offset = FrameInfo.getObjectOffset(Index); int64_t OldImm @@ -1224,16 +1457,99 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } } -StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const { +StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { return AMDGPUInstPrinter::getRegisterName(Reg); } +const TargetRegisterClass * +SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) { + if (BitWidth == 1) + return &AMDGPU::VReg_1RegClass; + if (BitWidth <= 16) + return &AMDGPU::VGPR_LO16RegClass; + if (BitWidth <= 32) + return &AMDGPU::VGPR_32RegClass; + if (BitWidth <= 64) + return &AMDGPU::VReg_64RegClass; + if (BitWidth <= 96) + return &AMDGPU::VReg_96RegClass; + if (BitWidth <= 128) + return &AMDGPU::VReg_128RegClass; + if (BitWidth <= 160) + return &AMDGPU::VReg_160RegClass; + if (BitWidth <= 192) + return &AMDGPU::VReg_192RegClass; + if (BitWidth <= 256) + return &AMDGPU::VReg_256RegClass; + if (BitWidth <= 512) + return &AMDGPU::VReg_512RegClass; + if (BitWidth <= 1024) + return &AMDGPU::VReg_1024RegClass; + + return nullptr; +} + +const TargetRegisterClass * +SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) { + if (BitWidth <= 16) + return &AMDGPU::AGPR_LO16RegClass; + if (BitWidth <= 32) + return &AMDGPU::AGPR_32RegClass; + if (BitWidth <= 64) + return &AMDGPU::AReg_64RegClass; + if (BitWidth <= 96) + return &AMDGPU::AReg_96RegClass; + if (BitWidth <= 128) + return &AMDGPU::AReg_128RegClass; + if (BitWidth <= 160) + return &AMDGPU::AReg_160RegClass; + if (BitWidth <= 192) + return &AMDGPU::AReg_192RegClass; + if (BitWidth <= 256) + return &AMDGPU::AReg_256RegClass; + if (BitWidth <= 512) + return &AMDGPU::AReg_512RegClass; + if (BitWidth <= 1024) + return &AMDGPU::AReg_1024RegClass; + + return nullptr; +} + +const TargetRegisterClass * +SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { + if (BitWidth <= 16) + return &AMDGPU::SGPR_LO16RegClass; + if (BitWidth <= 32) + return 
&AMDGPU::SReg_32RegClass; + if (BitWidth <= 64) + return &AMDGPU::SReg_64RegClass; + if (BitWidth <= 96) + return &AMDGPU::SGPR_96RegClass; + if (BitWidth <= 128) + return &AMDGPU::SGPR_128RegClass; + if (BitWidth <= 160) + return &AMDGPU::SGPR_160RegClass; + if (BitWidth <= 192) + return &AMDGPU::SGPR_192RegClass; + if (BitWidth <= 256) + return &AMDGPU::SGPR_256RegClass; + if (BitWidth <= 512) + return &AMDGPU::SGPR_512RegClass; + if (BitWidth <= 1024) + return &AMDGPU::SGPR_1024RegClass; + + return nullptr; +} + // FIXME: This is very slow. It might be worth creating a map from physreg to // register class. -const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { - assert(!Register::isVirtualRegister(Reg)); - +const TargetRegisterClass * +SIRegisterInfo::getPhysRegClass(MCRegister Reg) const { static const TargetRegisterClass *const BaseClasses[] = { + &AMDGPU::VGPR_LO16RegClass, + &AMDGPU::VGPR_HI16RegClass, + &AMDGPU::SReg_LO16RegClass, + &AMDGPU::AGPR_LO16RegClass, &AMDGPU::VGPR_32RegClass, &AMDGPU::SReg_32RegClass, &AMDGPU::AGPR_32RegClass, @@ -1242,13 +1558,19 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { &AMDGPU::AReg_64RegClass, &AMDGPU::VReg_96RegClass, &AMDGPU::SReg_96RegClass, + &AMDGPU::AReg_96RegClass, &AMDGPU::VReg_128RegClass, &AMDGPU::SReg_128RegClass, &AMDGPU::AReg_128RegClass, &AMDGPU::VReg_160RegClass, &AMDGPU::SReg_160RegClass, + &AMDGPU::AReg_160RegClass, + &AMDGPU::VReg_192RegClass, + &AMDGPU::SReg_192RegClass, + &AMDGPU::AReg_192RegClass, &AMDGPU::VReg_256RegClass, &AMDGPU::SReg_256RegClass, + &AMDGPU::AReg_256RegClass, &AMDGPU::VReg_512RegClass, &AMDGPU::SReg_512RegClass, &AMDGPU::AReg_512RegClass, @@ -1272,122 +1594,54 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { // TargetRegisterClass to mark which classes are VGPRs to make this trivial. 
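The get*ClassForBitWidth helpers introduced just above let the switch-per-size logic that follows collapse into simple lookups: a requested width is rounded up to the next supported tuple class, or nullptr is returned if no class is wide enough. A standalone sketch of that rounding, covering only the 32..1024-bit VGPR ladder (the real helper also special-cases the 1-bit VReg_1 and <=16-bit VGPR_LO16 classes); the function name is illustrative, not from the patch:

    #include <cassert>

    // Not LLVM code: the rounding behaviour of getVGPRClassForBitWidth for the
    // 32..1024-bit tuple ladder.
    unsigned roundUpToVGPRClassWidth(unsigned BitWidth) {
      static const unsigned Widths[] = {32, 64, 96, 128, 160, 192, 256, 512, 1024};
      for (unsigned W : Widths)
        if (BitWidth <= W)
          return W;
      return 0; // wider than any class; the real helper returns nullptr
    }

    int main() {
      assert(roundUpToVGPRClassWidth(48) == 64);    // rounds up to VReg_64
      assert(roundUpToVGPRClassWidth(192) == 192);  // new VReg_192 class
      assert(roundUpToVGPRClassWidth(2048) == 0);   // no class is wide enough
      return 0;
    }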
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { unsigned Size = getRegSizeInBits(*RC); - switch (Size) { - case 32: - return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; - case 64: - return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr; - case 96: - return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; - case 128: - return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; - case 160: - return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr; - case 256: - return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; - case 512: - return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; - case 1024: - return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr; - case 1: - return getCommonSubClass(&AMDGPU::VReg_1RegClass, RC) != nullptr; - default: + if (Size == 16) { + return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr || + getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr; + } + const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); + if (!VRC) { assert(Size < 32 && "Invalid register class size"); return false; } + return getCommonSubClass(VRC, RC) != nullptr; } bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const { unsigned Size = getRegSizeInBits(*RC); - if (Size < 32) + if (Size < 16) return false; - switch (Size) { - case 32: - return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr; - case 64: - return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr; - case 96: + const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); + if (!ARC) { + assert(getVGPRClassForBitWidth(Size) && "Invalid register class size"); return false; - case 128: - return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr; - case 160: - case 256: - return false; - case 512: - return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr; - case 1024: - return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr; - default: - llvm_unreachable("Invalid register class size"); } + return getCommonSubClass(ARC, RC) != nullptr; } -const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( - const TargetRegisterClass *SRC) const { - switch (getRegSizeInBits(*SRC)) { - case 32: - return &AMDGPU::VGPR_32RegClass; - case 64: - return &AMDGPU::VReg_64RegClass; - case 96: - return &AMDGPU::VReg_96RegClass; - case 128: - return &AMDGPU::VReg_128RegClass; - case 160: - return &AMDGPU::VReg_160RegClass; - case 256: - return &AMDGPU::VReg_256RegClass; - case 512: - return &AMDGPU::VReg_512RegClass; - case 1024: - return &AMDGPU::VReg_1024RegClass; - case 1: - return &AMDGPU::VReg_1RegClass; - default: - llvm_unreachable("Invalid register class size"); - } +const TargetRegisterClass * +SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const { + unsigned Size = getRegSizeInBits(*SRC); + const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size); + assert(VRC && "Invalid register class size"); + return VRC; } -const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass( - const TargetRegisterClass *SRC) const { - switch (getRegSizeInBits(*SRC)) { - case 32: - return &AMDGPU::AGPR_32RegClass; - case 64: - return &AMDGPU::AReg_64RegClass; - case 128: - return &AMDGPU::AReg_128RegClass; - case 512: - return &AMDGPU::AReg_512RegClass; - case 1024: - return &AMDGPU::AReg_1024RegClass; - default: - llvm_unreachable("Invalid register class size"); - } +const TargetRegisterClass * 
+SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const { + unsigned Size = getRegSizeInBits(*SRC); + const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size); + assert(ARC && "Invalid register class size"); + return ARC; } -const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( - const TargetRegisterClass *VRC) const { - switch (getRegSizeInBits(*VRC)) { - case 32: +const TargetRegisterClass * +SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const { + unsigned Size = getRegSizeInBits(*VRC); + if (Size == 32) return &AMDGPU::SGPR_32RegClass; - case 64: - return &AMDGPU::SReg_64RegClass; - case 96: - return &AMDGPU::SReg_96RegClass; - case 128: - return &AMDGPU::SGPR_128RegClass; - case 160: - return &AMDGPU::SReg_160RegClass; - case 256: - return &AMDGPU::SReg_256RegClass; - case 512: - return &AMDGPU::SReg_512RegClass; - case 1024: - return &AMDGPU::SReg_1024RegClass; - default: - llvm_unreachable("Invalid register class size"); - } + const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size); + assert(SRC && "Invalid register class size"); + return SRC; } const TargetRegisterClass *SIRegisterInfo::getSubRegClass( @@ -1396,62 +1650,19 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( return RC; // We can assume that each lane corresponds to one 32-bit register. - unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes(); + unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32; if (isSGPRClass(RC)) { - switch (Count) { - case 1: - return &AMDGPU::SGPR_32RegClass; - case 2: - return &AMDGPU::SReg_64RegClass; - case 3: - return &AMDGPU::SReg_96RegClass; - case 4: - return &AMDGPU::SGPR_128RegClass; - case 5: - return &AMDGPU::SReg_160RegClass; - case 8: - return &AMDGPU::SReg_256RegClass; - case 16: - return &AMDGPU::SReg_512RegClass; - case 32: /* fall-through */ - default: - llvm_unreachable("Invalid sub-register class size"); - } + if (Size == 32) + RC = &AMDGPU::SGPR_32RegClass; + else + RC = getSGPRClassForBitWidth(Size); } else if (hasAGPRs(RC)) { - switch (Count) { - case 1: - return &AMDGPU::AGPR_32RegClass; - case 2: - return &AMDGPU::AReg_64RegClass; - case 4: - return &AMDGPU::AReg_128RegClass; - case 16: - return &AMDGPU::AReg_512RegClass; - case 32: /* fall-through */ - default: - llvm_unreachable("Invalid sub-register class size"); - } + RC = getAGPRClassForBitWidth(Size); } else { - switch (Count) { - case 1: - return &AMDGPU::VGPR_32RegClass; - case 2: - return &AMDGPU::VReg_64RegClass; - case 3: - return &AMDGPU::VReg_96RegClass; - case 4: - return &AMDGPU::VReg_128RegClass; - case 5: - return &AMDGPU::VReg_160RegClass; - case 8: - return &AMDGPU::VReg_256RegClass; - case 16: - return &AMDGPU::VReg_512RegClass; - case 32: /* fall-through */ - default: - llvm_unreachable("Invalid sub-register class size"); - } + RC = getVGPRClassForBitWidth(Size); } + assert(RC && "Invalid sub-register class size"); + return RC; } bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { @@ -1487,215 +1698,60 @@ bool SIRegisterInfo::shouldRewriteCopySrc( return getCommonSubClass(DefRC, SrcRC) != nullptr; } -/// Returns a register that is not used at any point in the function. +/// Returns a lowest register that is not used at any point in the function. /// If all registers are used, then this function will return -// AMDGPU::NoRegister. 
-unsigned -SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineFunction &MF) const { - - for (unsigned Reg : *RC) - if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) - return Reg; - return AMDGPU::NoRegister; +/// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return +/// highest unused register. +MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC, + const MachineFunction &MF, + bool ReserveHighestVGPR) const { + if (ReserveHighestVGPR) { + for (MCRegister Reg : reverse(*RC)) + if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) + return Reg; + } else { + for (MCRegister Reg : *RC) + if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) + return Reg; + } + return MCRegister(); } ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const { - if (EltSize == 4) { - static const int16_t Sub0_31[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, - AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, - AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, - AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, - AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, - AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, - AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31, - }; - - static const int16_t Sub0_15[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, - AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, - AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, - }; - - static const int16_t Sub0_7[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, - }; - - static const int16_t Sub0_4[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, - }; - - static const int16_t Sub0_3[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - }; - - static const int16_t Sub0_2[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, - }; - - static const int16_t Sub0_1[] = { - AMDGPU::sub0, AMDGPU::sub1, - }; - - switch (AMDGPU::getRegBitWidth(*RC->MC)) { - case 32: - return {}; - case 64: - return makeArrayRef(Sub0_1); - case 96: - return makeArrayRef(Sub0_2); - case 128: - return makeArrayRef(Sub0_3); - case 160: - return makeArrayRef(Sub0_4); - case 256: - return makeArrayRef(Sub0_7); - case 512: - return makeArrayRef(Sub0_15); - case 1024: - return makeArrayRef(Sub0_31); - default: - llvm_unreachable("unhandled register size"); - } - } - - if (EltSize == 8) { - static const int16_t Sub0_31_64[] = { - AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, - AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, - AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, - AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, - AMDGPU::sub16_sub17, AMDGPU::sub18_sub19, - AMDGPU::sub20_sub21, AMDGPU::sub22_sub23, - AMDGPU::sub24_sub25, AMDGPU::sub26_sub27, - AMDGPU::sub28_sub29, AMDGPU::sub30_sub31 - }; - - static const int16_t Sub0_15_64[] = { - AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, - AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, - AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, - AMDGPU::sub12_sub13, AMDGPU::sub14_sub15 - }; - - static const int16_t Sub0_7_64[] = { - AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, - AMDGPU::sub4_sub5, AMDGPU::sub6_sub7 - }; - - - static const int16_t Sub0_3_64[] = { - AMDGPU::sub0_sub1, 
AMDGPU::sub2_sub3 - }; - - switch (AMDGPU::getRegBitWidth(*RC->MC)) { - case 64: - return {}; - case 128: - return makeArrayRef(Sub0_3_64); - case 256: - return makeArrayRef(Sub0_7_64); - case 512: - return makeArrayRef(Sub0_15_64); - case 1024: - return makeArrayRef(Sub0_31_64); - default: - llvm_unreachable("unhandled register size"); - } - } + const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC); + assert(RegBitWidth >= 32 && RegBitWidth <= 1024); - if (EltSize == 16) { - - static const int16_t Sub0_31_128[] = { - AMDGPU::sub0_sub1_sub2_sub3, - AMDGPU::sub4_sub5_sub6_sub7, - AMDGPU::sub8_sub9_sub10_sub11, - AMDGPU::sub12_sub13_sub14_sub15, - AMDGPU::sub16_sub17_sub18_sub19, - AMDGPU::sub20_sub21_sub22_sub23, - AMDGPU::sub24_sub25_sub26_sub27, - AMDGPU::sub28_sub29_sub30_sub31 - }; - - static const int16_t Sub0_15_128[] = { - AMDGPU::sub0_sub1_sub2_sub3, - AMDGPU::sub4_sub5_sub6_sub7, - AMDGPU::sub8_sub9_sub10_sub11, - AMDGPU::sub12_sub13_sub14_sub15 - }; - - static const int16_t Sub0_7_128[] = { - AMDGPU::sub0_sub1_sub2_sub3, - AMDGPU::sub4_sub5_sub6_sub7 - }; - - switch (AMDGPU::getRegBitWidth(*RC->MC)) { - case 128: - return {}; - case 256: - return makeArrayRef(Sub0_7_128); - case 512: - return makeArrayRef(Sub0_15_128); - case 1024: - return makeArrayRef(Sub0_31_128); - default: - llvm_unreachable("unhandled register size"); - } - } - - assert(EltSize == 32 && "unhandled elt size"); - - static const int16_t Sub0_31_256[] = { - AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, - AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15, - AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23, - AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31 - }; + const unsigned RegDWORDs = RegBitWidth / 32; + const unsigned EltDWORDs = EltSize / 4; + assert(RegSplitParts.size() + 1 >= EltDWORDs); - static const int16_t Sub0_15_256[] = { - AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, - AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15 - }; + const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1]; + const unsigned NumParts = RegDWORDs / EltDWORDs; - switch (AMDGPU::getRegBitWidth(*RC->MC)) { - case 256: - return {}; - case 512: - return makeArrayRef(Sub0_15_256); - case 1024: - return makeArrayRef(Sub0_31_256); - default: - llvm_unreachable("unhandled register size"); - } + return makeArrayRef(Parts.data(), NumParts); } const TargetRegisterClass* SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, - unsigned Reg) const { - if (Register::isVirtualRegister(Reg)) - return MRI.getRegClass(Reg); - - return getPhysRegClass(Reg); + Register Reg) const { + return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg); } bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, - unsigned Reg) const { - const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg); - assert(RC && "Register class for the reg not found"); - return hasVGPRs(RC); + Register Reg) const { + const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); + // Registers without classes are unaddressable, SGPR-like registers. + return RC && hasVGPRs(RC); } bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, - unsigned Reg) const { - const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg); - assert(RC && "Register class for the reg not found"); - return hasAGPRs(RC); + Register Reg) const { + const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg); + + // Registers without classes are unaddressable, SGPR-like registers. 
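getRegSplitParts above now drops the hard-coded sub0_* tables in favour of the static RegSplitParts array: the row for EltDWORDs-sized pieces is selected and only the first RegDWORDs / EltDWORDs entries are used. A standalone sketch of just that index arithmetic (plain C++, not the LLVM API):

    #include <cassert>

    // Not LLVM code: the indexing getRegSplitParts now performs on the
    // precomputed RegSplitParts table. The register is RegBitWidth/32 DWORDs
    // wide and is split into pieces of EltSize bytes each.
    unsigned numSplitParts(unsigned RegBitWidth, unsigned EltSize /* bytes */) {
      assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
      const unsigned RegDWORDs = RegBitWidth / 32;
      const unsigned EltDWORDs = EltSize / 4;
      // Row EltDWORDs - 1 of RegSplitParts holds the subregister indices for
      // EltDWORDs-sized pieces; only the first NumParts entries are taken.
      return RegDWORDs / EltDWORDs;
    }

    int main() {
      assert(numSplitParts(512, 4) == 16); // sixteen 32-bit pieces
      assert(numSplitParts(256, 8) == 4);  // four 64-bit pieces
      assert(numSplitParts(128, 16) == 1); // a single 128-bit piece
      return 0;
    }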
+ return RC && hasAGPRs(RC); } bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, @@ -1727,36 +1783,41 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MF.getFunction()); switch (RC->getID()) { default: - return AMDGPURegisterInfo::getRegPressureLimit(RC, MF); + return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); case AMDGPU::VGPR_32RegClassID: + case AMDGPU::VGPR_LO16RegClassID: + case AMDGPU::VGPR_HI16RegClassID: return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF)); case AMDGPU::SGPR_32RegClassID: + case AMDGPU::SGPR_LO16RegClassID: return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF)); } } unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { - if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet()) + if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 || + Idx == AMDGPU::RegisterPressureSets::AGPR_32) return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, const_cast<MachineFunction &>(MF)); - if (Idx == getSGPRPressureSet()) + if (Idx == AMDGPU::RegisterPressureSets::SReg_32) return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, const_cast<MachineFunction &>(MF)); - return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx); + llvm_unreachable("Unexpected register pressure set!"); } const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { static const int Empty[] = { -1 }; - if (hasRegUnit(AMDGPU::M0, RegUnit)) + if (RegPressureIgnoredUnits[RegUnit]) return Empty; - return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit); + + return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit); } -unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { +MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { // Not a callee saved register. return AMDGPU::SGPR30_SGPR31; } @@ -1765,49 +1826,19 @@ const TargetRegisterClass * SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, const RegisterBank &RB, const MachineRegisterInfo &MRI) const { - switch (Size) { - case 1: { - switch (RB.getID()) { - case AMDGPU::VGPRRegBankID: - return &AMDGPU::VGPR_32RegClass; - case AMDGPU::VCCRegBankID: - return isWave32 ? - &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass; - case AMDGPU::SGPRRegBankID: - return &AMDGPU::SReg_32RegClass; - default: - llvm_unreachable("unknown register bank"); - } - } - case 32: - return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : - &AMDGPU::SReg_32RegClass; - case 64: - return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass : - &AMDGPU::SReg_64RegClass; - case 96: - return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass : - &AMDGPU::SReg_96RegClass; - case 128: - return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass : - &AMDGPU::SGPR_128RegClass; - case 160: - return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass : - &AMDGPU::SReg_160RegClass; - case 256: - return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass : - &AMDGPU::SReg_256RegClass; - case 512: - return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass : - &AMDGPU::SReg_512RegClass; - case 1024: - return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_1024RegClass : - &AMDGPU::SReg_1024RegClass; + switch (RB.getID()) { + case AMDGPU::VGPRRegBankID: + return getVGPRClassForBitWidth(std::max(32u, Size)); + case AMDGPU::VCCRegBankID: + assert(Size == 1); + return isWave32 ? 
&AMDGPU::SReg_32_XM0_XEXECRegClass + : &AMDGPU::SReg_64_XEXECRegClass; + case AMDGPU::SGPRRegBankID: + return getSGPRClassForBitWidth(std::max(32u, Size)); + case AMDGPU::AGPRRegBankID: + return getAGPRClassForBitWidth(std::max(32u, Size)); default: - if (Size < 32) - return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : - &AMDGPU::SReg_32RegClass; - return nullptr; + llvm_unreachable("unknown register bank"); } } @@ -1822,7 +1853,7 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, return getAllocatableClass(RC); } -unsigned SIRegisterInfo::getVCC() const { +MCRegister SIRegisterInfo::getVCC() const { return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; } @@ -1837,12 +1868,12 @@ SIRegisterInfo::getRegClass(unsigned RCID) const { case -1: return nullptr; default: - return AMDGPURegisterInfo::getRegClass(RCID); + return AMDGPUGenRegisterInfo::getRegClass(RCID); } } // Find reaching register definition -MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg, +MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, MachineRegisterInfo &MRI, LiveIntervals *LIS) const { @@ -1850,7 +1881,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg, SlotIndex UseIdx = LIS->getInstructionIndex(Use); SlotIndex DefIdx; - if (Register::isVirtualRegister(Reg)) { + if (Reg.isVirtual()) { if (!LIS->hasInterval(Reg)) return nullptr; LiveInterval &LI = LIS->getInterval(Reg); @@ -1894,3 +1925,49 @@ MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg, return Def; } + +MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const { + assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32); + + for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass, + AMDGPU::SReg_32RegClass, + AMDGPU::AGPR_32RegClass } ) { + if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC)) + return Super; + } + if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16, + &AMDGPU::VGPR_32RegClass)) { + return Super; + } + + return AMDGPU::NoRegister; +} + +bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { + switch (PhysReg) { + case AMDGPU::SGPR_NULL: + case AMDGPU::SRC_SHARED_BASE: + case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_LIMIT: + return true; + default: + return false; + } +} + +ArrayRef<MCPhysReg> +SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const { + return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), + ST.getMaxNumSGPRs(MF) / 4); +} + +ArrayRef<MCPhysReg> +SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { + return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); +} + +ArrayRef<MCPhysReg> +SIRegisterInfo::getAllVGPR32(const MachineFunction &MF) const { + return makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), ST.getMaxNumVGPRs(MF)); +} diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index ac8c56fa3a038..62d9f1174337b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -14,7 +14,9 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H #define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H -#include "AMDGPURegisterInfo.h" +#define GET_REGINFO_HEADER +#include "AMDGPUGenRegisterInfo.inc" + #include "SIDefines.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -22,38 +24,38 @@ namespace llvm { class GCNSubtarget; class LiveIntervals; -class MachineRegisterInfo; class 
SIMachineFunctionInfo; -class SIRegisterInfo final : public AMDGPURegisterInfo { +class SIRegisterInfo final : public AMDGPUGenRegisterInfo { private: const GCNSubtarget &ST; - unsigned SGPRSetID; - unsigned VGPRSetID; - unsigned AGPRSetID; - BitVector SGPRPressureSets; - BitVector VGPRPressureSets; - BitVector AGPRPressureSets; bool SpillSGPRToVGPR; bool isWave32; + BitVector RegPressureIgnoredUnits; + + /// Sub reg indexes for getRegSplitParts. + /// First index represents subreg size from 1 to 16 DWORDs. + /// The inner vector is sorted by bit offset. + /// Provided a register can be fully split with given subregs, + /// all elements of the inner vector combined give a full lane mask. + static std::array<std::vector<int16_t>, 16> RegSplitParts; + + void reserveRegisterTuples(BitVector &, MCRegister Reg) const; - void classifyPressureSet(unsigned PSetID, unsigned Reg, - BitVector &PressureSets) const; public: SIRegisterInfo(const GCNSubtarget &ST); + /// \returns the sub reg enum value for the given \p Channel + /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0) + static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1); + bool spillSGPRToVGPR() const { return SpillSGPRToVGPR; } /// Return the end register initially reserved for the scratch buffer in case /// spilling is needed. - unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; - - /// Return the end register initially reserved for the scratch wave offset in - /// case spilling is needed. - unsigned reservedPrivateSegmentWaveByteOffsetReg( - const MachineFunction &MF) const; + MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; BitVector getReservedRegs(const MachineFunction &MF) const override; @@ -70,6 +72,9 @@ public: Register getFrameRegister(const MachineFunction &MF) const override; + bool hasBasePointer(const MachineFunction &MF) const; + Register getBaseRegister() const; + bool canRealignStack(const MachineFunction &MF) const override; bool requiresRegisterScavenging(const MachineFunction &Fn) const override; @@ -77,7 +82,6 @@ public: bool requiresFrameIndexReplacementScavenging( const MachineFunction &MF) const override; bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override; - bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override; int64_t getMUBUFInstrOffset(const MachineInstr *MI) const; @@ -86,19 +90,24 @@ public: bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; - void materializeFrameBaseRegister(MachineBasicBlock *MBB, - unsigned BaseReg, int FrameIdx, + void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg, + int FrameIdx, int64_t Offset) const override; - void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + void resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const override; - bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, + bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const override; const TargetRegisterClass *getPointerRegClass( const MachineFunction &MF, unsigned Kind = 0) const override; + void buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, int Index, + int Offset, unsigned EltSize, Register VGPR, + int64_t VGPRLanes, RegScavenger *RS, + bool IsLoad) const; + /// If \p OnlyToVGPR is true, this will only succeed if this bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, @@ -115,15 +124,19 @@ public: bool 
eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS) const; - StringRef getRegAsmName(unsigned Reg) const override; + StringRef getRegAsmName(MCRegister Reg) const override; - unsigned getHWRegIndex(unsigned Reg) const { + unsigned getHWRegIndex(MCRegister Reg) const { return getEncodingValue(Reg) & 0xff; } + static const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth); + static const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth); + static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth); + /// Return the 'base' register class for this register. /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc. - const TargetRegisterClass *getPhysRegClass(unsigned Reg) const; + const TargetRegisterClass *getPhysRegClass(MCRegister Reg) const; /// \returns true if this class contains only SGPR registers bool isSGPRClass(const TargetRegisterClass *RC) const { @@ -135,9 +148,9 @@ public: return isSGPRClass(getRegClass(RCID)); } - bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const { + bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const { const TargetRegisterClass *RC; - if (Register::isVirtualRegister(Reg)) + if (Reg.isVirtual()) RC = MRI.getRegClass(Reg); else RC = getPhysRegClass(Reg); @@ -161,16 +174,16 @@ public: } /// \returns A VGPR reg class with the same width as \p SRC - const TargetRegisterClass *getEquivalentVGPRClass( - const TargetRegisterClass *SRC) const; + const TargetRegisterClass * + getEquivalentVGPRClass(const TargetRegisterClass *SRC) const; /// \returns An AGPR reg class with the same width as \p SRC - const TargetRegisterClass *getEquivalentAGPRClass( - const TargetRegisterClass *SRC) const; + const TargetRegisterClass * + getEquivalentAGPRClass(const TargetRegisterClass *SRC) const; /// \returns A SGPR reg class with the same width as \p SRC - const TargetRegisterClass *getEquivalentSGPRClass( - const TargetRegisterClass *VRC) const; + const TargetRegisterClass * + getEquivalentSGPRClass(const TargetRegisterClass *VRC) const; /// \returns The register class that is used for a sub-register of \p RC for /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will @@ -196,38 +209,23 @@ public: /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. 
bool opCanUseInlineConstant(unsigned OpType) const; - unsigned findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineFunction &MF) const; - - unsigned getSGPRPressureSet() const { return SGPRSetID; }; - unsigned getVGPRPressureSet() const { return VGPRSetID; }; - unsigned getAGPRPressureSet() const { return AGPRSetID; }; + MCRegister findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC, + const MachineFunction &MF, + bool ReserveHighestVGPR = false) const; const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI, - unsigned Reg) const; - bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const; - bool isAGPR(const MachineRegisterInfo &MRI, unsigned Reg) const; - bool isVectorRegister(const MachineRegisterInfo &MRI, unsigned Reg) const { + Register Reg) const; + bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const; + bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const; + bool isVectorRegister(const MachineRegisterInfo &MRI, Register Reg) const { return isVGPR(MRI, Reg) || isAGPR(MRI, Reg); } - virtual bool - isDivergentRegClass(const TargetRegisterClass *RC) const override { - return !isSGPRClass(RC); - } + bool isConstantPhysReg(MCRegister PhysReg) const override; - bool isSGPRPressureSet(unsigned SetID) const { - return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID) && - !AGPRPressureSets.test(SetID); - } - bool isVGPRPressureSet(unsigned SetID) const { - return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) && - !AGPRPressureSets.test(SetID); - } - bool isAGPRPressureSet(unsigned SetID) const { - return AGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) && - !VGPRPressureSets.test(SetID); + bool isDivergentRegClass(const TargetRegisterClass *RC) const override { + return !isSGPRClass(RC); } ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC, @@ -249,7 +247,7 @@ public: const int *getRegUnitPressureSets(unsigned RegUnit) const override; - unsigned getReturnAddressReg(const MachineFunction &MF) const; + MCRegister getReturnAddressReg(const MachineFunction &MF) const; const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, @@ -277,12 +275,12 @@ public: : &AMDGPU::SReg_64_XEXECRegClass; } - unsigned getVCC() const; + MCRegister getVCC() const; const TargetRegisterClass *getRegClass(unsigned RCID) const; // Find reaching register definition - MachineInstr *findReachingDef(unsigned Reg, unsigned SubReg, + MachineInstr *findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, MachineRegisterInfo &MRI, LiveIntervals *LIS) const; @@ -290,14 +288,51 @@ public: const uint32_t *getAllVGPRRegMask() const; const uint32_t *getAllAllocatableSRegMask() const; + // \returns number of 32 bit registers covered by a \p LM + static unsigned getNumCoveredRegs(LaneBitmask LM) { + // The assumption is that every lo16 subreg is an even bit and every hi16 + // is an adjacent odd bit or vice versa. + uint64_t Mask = LM.getAsInteger(); + uint64_t Even = Mask & 0xAAAAAAAAAAAAAAAAULL; + Mask = (Even >> 1) | Mask; + uint64_t Odd = Mask & 0x5555555555555555ULL; + return countPopulation(Odd); + } + + // \returns a DWORD offset of a \p SubReg + unsigned getChannelFromSubReg(unsigned SubReg) const { + return SubReg ? 
(getSubRegIdxOffset(SubReg) + 31) / 32 : 0; + } + + // \returns a DWORD size of a \p SubReg + unsigned getNumChannelsFromSubReg(unsigned SubReg) const { + return getNumCoveredRegs(getSubRegIndexLaneMask(SubReg)); + } + + // For a given 16 bit \p Reg \returns a 32 bit register holding it. + // \returns \p Reg otherwise. + MCPhysReg get32BitRegister(MCPhysReg Reg) const; + + /// Return all SGPR128 which satisfy the waves per execution unit requirement + /// of the subtarget. + ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const; + + /// Return all SGPR32 which satisfy the waves per execution unit requirement + /// of the subtarget. + ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const; + + /// Return all VGPR32 which satisfy the waves per execution unit requirement + /// of the subtarget. + ArrayRef<MCPhysReg> getAllVGPR32(const MachineFunction &MF) const; + private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, int Index, - unsigned ValueReg, + Register ValueReg, bool ValueIsKill, - unsigned ScratchRsrcReg, - unsigned ScratchOffsetReg, + MCRegister ScratchRsrcReg, + MCRegister ScratchOffsetReg, int64_t InstrOffset, MachineMemOperand *MMO, RegScavenger *RS) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 6ea6ec00e742d..ff1f5c4bc49b1 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -7,6 +7,50 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// +// Subregister declarations +//===----------------------------------------------------------------------===// + +class Indexes<int N> { + list<int> all = [0, 1, 2, 3, 4, 5, 6 , 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31]; + + // Returns list of indexes [0..N) + list<int> slice = + !foldl([]<int>, all, acc, cur, + !listconcat(acc, !if(!lt(cur, N), [cur], []))); +} + +let Namespace = "AMDGPU" in { + +def lo16 : SubRegIndex<16, 0>; +def hi16 : SubRegIndex<16, 16>; + +foreach Index = 0-31 in { + def sub#Index : SubRegIndex<32, !shl(Index, 5)>; +} + +foreach Index = 1-31 in { + def sub#Index#_lo16 : ComposedSubRegIndex<!cast<SubRegIndex>(sub#Index), lo16>; + def sub#Index#_hi16 : ComposedSubRegIndex<!cast<SubRegIndex>(sub#Index), hi16>; +} + +foreach Size = {2-6,8,16} in { + foreach Index = Indexes<!add(33, !mul(Size, -1))>.slice in { + def !foldl("", Indexes<Size>.slice, acc, cur, + !strconcat(acc#!if(!eq(acc,""),"","_"), "sub"#!add(cur, Index))) : + SubRegIndex<!mul(Size, 32), !shl(Index, 5)> { + let CoveringSubRegIndices = + !foldl([]<SubRegIndex>, Indexes<Size>.slice, acc, cur, + !listconcat(acc, [!cast<SubRegIndex>(sub#!add(cur, Index))])); + } + } +} + +} + +//===----------------------------------------------------------------------===// // Helpers //===----------------------------------------------------------------------===// @@ -15,6 +59,7 @@ class getSubRegs<int size> { list<SubRegIndex> ret3 = [sub0, sub1, sub2]; list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3]; list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4]; + list<SubRegIndex> ret6 = [sub0, sub1, sub2, sub3, sub4, sub5]; list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7]; list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, @@ -33,8 +78,10 @@ class getSubRegs<int size> { !if(!eq(size, 3), ret3, 
!if(!eq(size, 4), ret4, !if(!eq(size, 5), ret5, - !if(!eq(size, 8), ret8, - !if(!eq(size, 16), ret16, ret32)))))); + !if(!eq(size, 6), ret6, + !if(!eq(size, 8), ret8, + !if(!eq(size, 16), ret16, + ret32))))))); } // Generates list of sequential register tuple names. @@ -74,39 +121,69 @@ class SIRegisterTuples<list<SubRegIndex> Indices, RegisterClass RC, // Declarations that describe the SI registers //===----------------------------------------------------------------------===// class SIReg <string n, bits<16> regIdx = 0> : - Register<n>, - DwarfRegNum<[!cast<int>(HWEncoding)]> { + Register<n> { let Namespace = "AMDGPU"; - - // This is the not yet the complete register encoding. An additional - // bit is set for VGPRs. let HWEncoding = regIdx; } +class SIRegWithSubRegs <string n, list<Register> subregs, bits<16> regIdx> : + RegisterWithSubRegs<n, subregs> { +} + +multiclass SIRegLoHi16 <string n, bits<16> regIdx, bit ArtificialHigh = 1, + bit HWEncodingHigh = 0> { + // There is no special encoding for 16 bit subregs, these are not real + // registers but rather operands for instructions preserving other 16 bits + // of the result or reading just 16 bits of a 32 bit VGPR. + // It is encoded as a corresponding 32 bit register. + // Non-VGPR register classes use it as we need to have matching subregisters + // to move instructions and data between ALUs. + def _LO16 : SIReg<n#".l", regIdx> { + let HWEncoding{8} = HWEncodingHigh; + } + def _HI16 : SIReg<!if(ArtificialHigh, "", n#".h"), regIdx> { + let isArtificial = ArtificialHigh; + let HWEncoding{8} = HWEncodingHigh; + } + def "" : RegisterWithSubRegs<n, [!cast<Register>(NAME#"_LO16"), + !cast<Register>(NAME#"_HI16")]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [lo16, hi16]; + let CoveredBySubRegs = !if(ArtificialHigh,0,1); + let HWEncoding = regIdx; + let HWEncoding{8} = HWEncodingHigh; + } +} + // Special Registers -def VCC_LO : SIReg<"vcc_lo", 106>; -def VCC_HI : SIReg<"vcc_hi", 107>; +defm VCC_LO : SIRegLoHi16<"vcc_lo", 106>; +defm VCC_HI : SIRegLoHi16<"vcc_hi", 107>; // Pseudo-registers: Used as placeholders during isel and immediately // replaced, never seeing the verifier. def PRIVATE_RSRC_REG : SIReg<"private_rsrc", 0>; def FP_REG : SIReg<"fp", 0>; def SP_REG : SIReg<"sp", 0>; -def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>; + +// Pseudo-register to represent the program-counter DWARF register. +def PC_REG : SIReg<"pc", 0>, DwarfRegNum<[16, 16]> { + // There is no physical register corresponding to a "program counter", but + // we need to encode the concept in debug information in order to represent + // things like the return value in unwind information. + let isArtificial = 1; +} // VCC for 64-bit instructions -def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, - DwarfRegAlias<VCC_LO> { +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 106; } -def EXEC_LO : SIReg<"exec_lo", 126>; -def EXEC_HI : SIReg<"exec_hi", 127>; +defm EXEC_LO : SIRegLoHi16<"exec_lo", 126>, DwarfRegNum<[1, 1]>; +defm EXEC_HI : SIRegLoHi16<"exec_hi", 127>; -def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, - DwarfRegAlias<EXEC_LO> { +def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, DwarfRegNum<[17, 1]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 126; @@ -114,71 +191,76 @@ def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, // 32-bit real registers, for MC only. 
// May be used with both 32-bit and 64-bit operands. -def SRC_VCCZ : SIReg<"src_vccz", 251>; -def SRC_EXECZ : SIReg<"src_execz", 252>; -def SRC_SCC : SIReg<"src_scc", 253>; +defm SRC_VCCZ : SIRegLoHi16<"src_vccz", 251>; +defm SRC_EXECZ : SIRegLoHi16<"src_execz", 252>; +defm SRC_SCC : SIRegLoHi16<"src_scc", 253>; // 1-bit pseudo register, for codegen only. // Should never be emitted. def SCC : SIReg<"scc">; -def M0 : SIReg <"m0", 124>; -def SGPR_NULL : SIReg<"null", 125>; +defm M0 : SIRegLoHi16 <"m0", 124>; +defm SGPR_NULL : SIRegLoHi16 <"null", 125>; -def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>; -def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>; -def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>; -def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>; -def SRC_POPS_EXITING_WAVE_ID : SIReg<"src_pops_exiting_wave_id", 239>; +defm SRC_SHARED_BASE : SIRegLoHi16<"src_shared_base", 235>; +defm SRC_SHARED_LIMIT : SIRegLoHi16<"src_shared_limit", 236>; +defm SRC_PRIVATE_BASE : SIRegLoHi16<"src_private_base", 237>; +defm SRC_PRIVATE_LIMIT : SIRegLoHi16<"src_private_limit", 238>; +defm SRC_POPS_EXITING_WAVE_ID : SIRegLoHi16<"src_pops_exiting_wave_id", 239>; -def LDS_DIRECT : SIReg <"src_lds_direct", 254>; +// Not addressable +def MODE : SIReg <"mode", 0>; -def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>; -def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>; +def LDS_DIRECT : SIReg <"src_lds_direct", 254> { + // There is no physical register corresponding to this. This is an + // encoding value in a source field, which will ultimately trigger a + // read from m0. + let isArtificial = 1; +} -def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>, - DwarfRegAlias<XNACK_MASK_LO> { +defm XNACK_MASK_LO : SIRegLoHi16<"xnack_mask_lo", 104>; +defm XNACK_MASK_HI : SIRegLoHi16<"xnack_mask_hi", 105>; + +def XNACK_MASK : + RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 104; } // Trap handler registers -def TBA_LO : SIReg<"tba_lo", 108>; -def TBA_HI : SIReg<"tba_hi", 109>; +defm TBA_LO : SIRegLoHi16<"tba_lo", 108>; +defm TBA_HI : SIRegLoHi16<"tba_hi", 109>; -def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, - DwarfRegAlias<TBA_LO> { +def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 108; } -def TMA_LO : SIReg<"tma_lo", 110>; -def TMA_HI : SIReg<"tma_hi", 111>; +defm TMA_LO : SIRegLoHi16<"tma_lo", 110>; +defm TMA_HI : SIRegLoHi16<"tma_hi", 111>; -def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, - DwarfRegAlias<TMA_LO> { +def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 110; } foreach Index = 0-15 in { - def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>; - def TTMP#Index#_gfx9_gfx10 : SIReg<"ttmp"#Index, !add(108, Index)>; - def TTMP#Index : SIReg<"ttmp"#Index, 0>; + defm TTMP#Index#_vi : SIRegLoHi16<"ttmp"#Index, !add(112, Index)>; + defm TTMP#Index#_gfx9_gfx10 : SIRegLoHi16<"ttmp"#Index, !add(108, Index)>; + defm TTMP#Index : SIRegLoHi16<"ttmp"#Index, 0>; } multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { - def _ci : SIReg<n, ci_e>; - def _vi : SIReg<n, vi_e>; - def "" : SIReg<n, 0>; + defm _ci : SIRegLoHi16<n, ci_e>; + defm _vi : SIRegLoHi16<n, vi_e>; + defm "" : SIRegLoHi16<n, 0>; } class FlatReg <Register lo, Register hi, bits<16> encoding> : - 
RegisterWithSubRegs<"flat_scratch", [lo, hi]>, - DwarfRegAlias<lo> { + RegisterWithSubRegs<"flat_scratch", [lo, hi]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = encoding; @@ -193,21 +275,24 @@ def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>; // SGPR registers foreach Index = 0-105 in { - def SGPR#Index : SIReg <"s"#Index, Index>; + defm SGPR#Index : + SIRegLoHi16 <"s"#Index, Index>, + DwarfRegNum<[!if(!le(Index, 63), !add(Index, 32), !add(Index, 1024)), + !if(!le(Index, 63), !add(Index, 32), !add(Index, 1024))]>; } // VGPR registers foreach Index = 0-255 in { - def VGPR#Index : SIReg <"v"#Index, Index> { - let HWEncoding{8} = 1; - } + defm VGPR#Index : + SIRegLoHi16 <"v"#Index, Index, 0, 1>, + DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]>; } // AccVGPR registers foreach Index = 0-255 in { - def AGPR#Index : SIReg <"a"#Index, Index> { - let HWEncoding{8} = 1; - } + defm AGPR#Index : + SIRegLoHi16 <"a"#Index, Index, 1, 1>, + DwarfRegNum<[!add(Index, 3072), !add(Index, 2048)]>; } //===----------------------------------------------------------------------===// @@ -224,14 +309,35 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> { let isAllocatable = 0; } +def M0_CLASS_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> { + let CopyCost = 1; + let Size = 16; + let isAllocatable = 0; +} + // TODO: Do we need to set DwarfRegAlias on register tuples? +def SGPR_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, + (add (sequence "SGPR%u_LO16", 0, 105))> { + let AllocationPriority = 9; + let Size = 16; + let GeneratePressureSet = 0; +} + +def SGPR_HI16 : RegisterClass<"AMDGPU", [i16, f16], 16, + (add (sequence "SGPR%u_HI16", 0, 105))> { + let isAllocatable = 0; + let Size = 16; + let GeneratePressureSet = 0; +} + // SGPR 32-bit registers def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence "SGPR%u", 0, 105))> { // Give all SGPR classes higher priority than VGPR classes, because // we want to spill SGPRs to VGPRs. let AllocationPriority = 9; + let GeneratePressureSet = 0; } // SGPR 64-bit registers @@ -246,6 +352,9 @@ def SGPR_128Regs : SIRegisterTuples<getSubRegs<4>.ret, SGPR_32, 105, 4, 4, "s">; // SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs. 
def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">; +// SGPR 192-bit registers +def SGPR_192Regs : SIRegisterTuples<getSubRegs<6>.ret, SGPR_32, 105, 4, 6, "s">; + // SGPR 256-bit registers def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">; @@ -261,6 +370,12 @@ def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, let isAllocatable = 0; } +def TTMP_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, + (add (sequence "TTMP%u_LO16", 0, 15))> { + let Size = 16; + let isAllocatable = 0; +} + // Trap handler TMP 64-bit registers def TTMP_64Regs : SIRegisterTuples<getSubRegs<2>.ret, TTMP_32, 15, 2, 2, "ttmp">; @@ -357,6 +472,19 @@ class RegisterTypes<list<ValueType> reg_types> { def Reg16Types : RegisterTypes<[i16, f16]>; def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>; +def VGPR_LO16 : RegisterClass<"AMDGPU", Reg16Types.types, 16, + (add (sequence "VGPR%u_LO16", 0, 255))> { + let AllocationPriority = 1; + let Size = 16; + let GeneratePressureSet = 0; +} + +def VGPR_HI16 : RegisterClass<"AMDGPU", Reg16Types.types, 16, + (add (sequence "VGPR%u_HI16", 0, 255))> { + let AllocationPriority = 1; + let Size = 16; + let GeneratePressureSet = 0; +} // VGPR 32-bit registers // i16/f16 only on VI+ @@ -364,6 +492,7 @@ def VGPR_32 : RegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.t (add (sequence "VGPR%u", 0, 255))> { let AllocationPriority = 1; let Size = 32; + let Weight = 1; } // VGPR 64-bit registers @@ -378,6 +507,9 @@ def VGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, VGPR_32, 255, 1, 4, "v">; // VGPR 160-bit registers def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">; +// VGPR 192-bit registers +def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 255, 1, 6, "v">; + // VGPR 256-bit registers def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">; @@ -387,19 +519,39 @@ def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 255, 1, 16, "v">; // VGPR 1024-bit registers def VGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, VGPR_32, 255, 1, 32, "v">; +def AGPR_LO16 : RegisterClass<"AMDGPU", Reg16Types.types, 16, + (add (sequence "AGPR%u_LO16", 0, 255))> { + let isAllocatable = 0; + let Size = 16; + let GeneratePressureSet = 0; +} + // AccVGPR 32-bit registers def AGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add (sequence "AGPR%u", 0, 255))> { let AllocationPriority = 1; let Size = 32; + let Weight = 1; } // AGPR 64-bit registers def AGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, AGPR_32, 255, 1, 2, "a">; +// AGPR 96-bit registers +def AGPR_96 : SIRegisterTuples<getSubRegs<3>.ret, AGPR_32, 255, 1, 3, "a">; + // AGPR 128-bit registers def AGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, AGPR_32, 255, 1, 4, "a">; +// AGPR 160-bit registers +def AGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, AGPR_32, 255, 1, 5, "a">; + +// AGPR 192-bit registers +def AGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, AGPR_32, 255, 1, 6, "a">; + +// AGPR 256-bit registers +def AGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, AGPR_32, 255, 1, 8, "a">; + // AGPR 512-bit registers def AGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, AGPR_32, 255, 1, 16, "a">; @@ -411,7 +563,7 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">; //===----------------------------------------------------------------------===// def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add FP_REG, SP_REG, 
SCRATCH_WAVE_OFFSET_REG)> { + (add FP_REG, SP_REG)> { let isAllocatable = 0; let CopyCost = -1; } @@ -422,12 +574,13 @@ def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32, let CopyCost = -1; } -def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add LDS_DIRECT)> { let isAllocatable = 0; let CopyCost = -1; } +let GeneratePressureSet = 0 in { // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, @@ -438,24 +591,54 @@ def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f1 let AllocationPriority = 10; } +def SReg_LO16_XM0_XEXEC : RegisterClass<"AMDGPU", [i16, f16], 16, + (add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16, + XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, TTMP_LO16, TMA_LO_LO16, + TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO16, + SRC_SHARED_LIMIT_LO16, SRC_PRIVATE_BASE_LO16, SRC_PRIVATE_LIMIT_LO16, + SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16)> { + let Size = 16; + let AllocationPriority = 10; +} + def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> { let AllocationPriority = 10; } +def SReg_LO16_XEXEC_HI : RegisterClass<"AMDGPU", [i16, f16], 16, + (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, M0_CLASS_LO16)> { + let Size = 16; + let AllocationPriority = 10; +} + def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { let AllocationPriority = 10; } +def SReg_LO16_XM0 : RegisterClass<"AMDGPU", [i16, f16], 16, + (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, EXEC_HI_LO16)> { + let Size = 16; + let AllocationPriority = 10; +} + +def SReg_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, + (add SGPR_LO16, SReg_LO16_XM0, M0_CLASS_LO16, EXEC_LO_LO16, EXEC_HI_LO16, SReg_LO16_XEXEC_HI)> { + let Size = 16; + let AllocationPriority = 10; +} +} // End GeneratePressureSet = 0 + // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> { let AllocationPriority = 10; } -def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32, - (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS)> { +let GeneratePressureSet = 0 in { +def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, + (add SReg_32, LDS_DIRECT_CLASS)> { let isAllocatable = 0; } @@ -528,7 +711,6 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add SGPR_128, TTMP_128)> { - let AllocationPriority = 15; let isAllocatable = 0; } @@ -543,39 +725,50 @@ def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, (add SGPR_160)> { - let AllocationPriority = 16; + // FIXME: Should be isAllocatable = 0, but that causes all TableGen-generated + // subclasses of SGPR_160 to be marked unallocatable too. 
} -def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> { +def SGPR_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192Regs)> { + let Size = 192; let AllocationPriority = 17; } -def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> { +def SReg_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192)> { + let Size = 192; + let isAllocatable = 0; +} + +def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add SGPR_256Regs)> { + let AllocationPriority = 18; +} + +def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add TTMP_256Regs)> { let isAllocatable = 0; } -def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, +def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add SGPR_256, TTMP_256)> { // Requires 4 s_mov_b64 to copy let CopyCost = 4; - let AllocationPriority = 17; + let isAllocatable = 0; } -def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, +def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32, (add SGPR_512Regs)> { - let AllocationPriority = 18; + let AllocationPriority = 19; } -def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, +def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32, (add TTMP_512Regs)> { let isAllocatable = 0; } -def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, +def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32, (add SGPR_512, TTMP_512)> { // Requires 8 s_mov_b64 to copy let CopyCost = 8; - let AllocationPriority = 18; + let isAllocatable = 0; } def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, @@ -583,105 +776,55 @@ def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 3 let isAllocatable = 0; } -def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, +def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32, (add SGPR_1024Regs)> { - let AllocationPriority = 19; + let AllocationPriority = 20; } -def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, +def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32, (add SGPR_1024)> { let CopyCost = 16; - let AllocationPriority = 19; -} - -// Register class for all vector registers (VGPRs + Interploation Registers) -def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], 32, - (add VGPR_64)> { - let Size = 64; - - // Requires 2 v_mov_b32 to copy - let CopyCost = 2; - let AllocationPriority = 2; -} - -def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96)> { - let Size = 96; - - // Requires 3 v_mov_b32 to copy - let CopyCost = 3; - let AllocationPriority = 3; -} - -def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, - (add VGPR_128)> { - let Size = 128; - - // Requires 4 v_mov_b32 to copy - let CopyCost = 4; - let AllocationPriority = 4; -} - -def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, - (add VGPR_160)> { - let Size = 160; - - // Requires 5 v_mov_b32 to copy - let CopyCost = 5; - let AllocationPriority = 5; -} - -def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, - (add VGPR_256)> { - let Size = 256; - let CopyCost = 8; - let AllocationPriority = 6; -} - -def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, - (add VGPR_512)> { - let Size = 512; - let CopyCost = 16; - let AllocationPriority = 7; -} - -def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, - (add VGPR_1024)> { - let 
Size = 1024; - let CopyCost = 32; - let AllocationPriority = 8; + let isAllocatable = 0; } -def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, - (add AGPR_64)> { - let Size = 64; +// Register class for all vector registers (VGPRs + Interpolation Registers) +class VRegClass<int numRegs, list<ValueType> regTypes, dag regList> : + RegisterClass<"AMDGPU", regTypes, 32, regList> { + let Size = !mul(numRegs, 32); - let CopyCost = 5; - let AllocationPriority = 2; + // Requires n v_mov_b32 to copy + let CopyCost = numRegs; + let AllocationPriority = numRegs; + let Weight = numRegs; } -def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, - (add AGPR_128)> { - let Size = 128; +def VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], + (add VGPR_64)>; +def VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>; +def VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, i128], (add VGPR_128)>; +def VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; +def VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>; +def VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>; +def VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>; +def VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; - // Requires 4 v_accvgpr_write and 4 v_accvgpr_read to copy + burn 1 vgpr - let CopyCost = 9; - let AllocationPriority = 4; +class ARegClass<int numRegs, list<ValueType> regTypes, dag regList> : + VRegClass<numRegs, regTypes, regList> { + // Requires n v_accvgpr_write and n v_accvgpr_read to copy + burn 1 vgpr + let CopyCost = !add(numRegs, numRegs, 1); } -def AReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, - (add AGPR_512)> { - let Size = 512; - let CopyCost = 33; - let AllocationPriority = 7; -} - -def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, - (add AGPR_1024)> { - let Size = 1024; - let CopyCost = 65; - let AllocationPriority = 8; -} +def AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16], + (add AGPR_64)>; +def AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>; +def AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>; +def AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>; +def AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>; +def AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>; +def AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>; +def AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>; +} // End GeneratePressureSet = 0 // This is not a real register. This is just to have a register to add // to VReg_1 that does not alias any real register that would @@ -690,6 +833,7 @@ def ARTIFICIAL_VGPR : SIReg <"invalid vgpr", 0> { let isArtificial = 1; } +let GeneratePressureSet = 0 in { // FIXME: Should specify an empty set for this. No register should // ever be allocated using VReg_1. This is a hack for SelectionDAG // that should always be lowered by SILowerI1Copies. 
TableGen crashes @@ -718,6 +862,7 @@ def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32, (add AReg_64, VReg_64)> { let isAllocatable = 0; } +} // End GeneratePressureSet = 0 //===----------------------------------------------------------------------===// // Register operands diff --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp index 51779e97ac620..64fca0b467977 100644 --- a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp +++ b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp @@ -88,15 +88,17 @@ bool SIRemoveShortExecBranches::mustRetainExeczBranch( for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { // When a uniform loop is inside non-uniform control flow, the branch - // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken - // when EXEC = 0. We should skip the loop lest it becomes infinite. - if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ || - I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) + // leaving the loop might never be taken when EXEC = 0. + // Hence we should retain cbranch out of the loop lest it become infinite. + if (I->isConditionalBranch()) return true; if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) return true; + if (TII->isKillTerminator(I->getOpcode())) + return true; + // These instructions are potentially expensive even if EXEC = 0. if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) || I->getOpcode() == AMDGPU::S_WAITCNT) diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 824d1aeb0df9b..932381c99e0b0 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -1,4 +1,4 @@ -//===-- SISchedule.td - SI Scheduling definitons -------------------------===// +//===-- SISchedule.td - SI Scheduling definitions -------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -27,10 +27,14 @@ def WriteBarrier : SchedWrite; def MIVGPRRead : SchedRead; def MIMFMARead : SchedRead; -// Vector ALU instructions +// Normal 16 or 32 bit VALU instructions def Write32Bit : SchedWrite; +// Conversion to or from F32 (but not converting F64 to or from F32) +def WriteFloatCvt : SchedWrite; +// F16 or F32 transcendental instructions (these are quarter rate) +def WriteTrans32 : SchedWrite; +// Other quarter rate VALU instructions def WriteQuarterRate32 : SchedWrite; -def WriteFullOrQuarterRate32 : SchedWrite; def WriteFloatFMA : SchedWrite; @@ -43,6 +47,10 @@ def WriteDoubleAdd : SchedWrite; // Conversion to or from f64 instruction def WriteDoubleCvt : SchedWrite; +// F64 "transcendental" (actually only reciprocal and/or square root) +// instructions +def WriteTrans64 : SchedWrite; + // Half rate 64-bit instructions. def Write64Bit : SchedWrite; @@ -56,7 +64,7 @@ def Write16PassMAI : SchedWrite; // instructions) class SISchedMachineModel : SchedMachineModel { - let CompleteModel = 0; + let CompleteModel = 1; // MicroOpBufferSize = 1 means that instructions will always be added // the ready queue when they become available. This exposes them // to the register pressure analysis. 
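Aside on the SIRemoveShortExecBranches.cpp hunk above: the retention test now keys off any conditional branch plus kill terminators, rather than the two VCC branch opcodes. The sketch below is only an editorial reconstruction of the loop body after the patch, pieced together from the context and added lines of that hunk; the function name carries a "Sketch" suffix, and the trailing return false is an assumption, since the end of the function is not shown in the diff.

// Sketch, not the patch itself: a consolidated view of the checks made by
// SIRemoveShortExecBranches::mustRetainExeczBranch after this change.
static bool mustRetainExeczBranchSketch(const SIInstrInfo *TII,
                                        const MachineBasicBlock &MBB) {
  for (const MachineInstr &MI : MBB) {
    // A uniform loop inside non-uniform control flow may never take its
    // exit branch when EXEC = 0, so keep the execz guard around it.
    if (MI.isConditionalBranch())
      return true;
    if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
      return true;
    if (TII->isKillTerminator(MI.getOpcode()))
      return true;
    // These instructions are potentially expensive even if EXEC = 0.
    if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
        MI.getOpcode() == AMDGPU::S_WAITCNT)
      return true;
  }
  return false; // Assumed; the tail of the function is outside the hunk.
}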
@@ -127,6 +135,8 @@ multiclass SICommonWriteRes { def : HWVALUWriteRes<Write32Bit, 1>; def : HWVALUWriteRes<Write64Bit, 2>; + def : HWVALUWriteRes<WriteFloatCvt, 4>; + def : HWVALUWriteRes<WriteTrans32, 4>; def : HWVALUWriteRes<WriteQuarterRate32, 4>; def : HWVALUWriteRes<Write2PassMAI, 2>; def : HWVALUWriteRes<Write8PassMAI, 8>; @@ -135,9 +145,9 @@ multiclass SICommonWriteRes { def : ReadAdvance<MIVGPRRead, -2>; def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32$")>; - // Technicaly mfma reads can be from 0 to 4 cycles but that does not make + // Technically mfma reads can be from 0 to 4 cycles but that does not make // sense to model because its register setup is huge. In particular if we - // properly model read advanice as -2 for a vgpr read it will result in a + // properly model read advance as -2 for a vgpr read it will result in a // bad scheduling of acc writes before that mfma. To avoid it we would // need to consume 2 or 4 more vgprs to be initialized before the acc // write sequence. Just assume worst case here. @@ -163,6 +173,7 @@ def : HWVALUWriteRes<WriteFloatFMA, 1>; def : HWVALUWriteRes<WriteDouble, 4>; def : HWVALUWriteRes<WriteDoubleAdd, 2>; def : HWVALUWriteRes<WriteDoubleCvt, 4>; +def : HWVALUWriteRes<WriteTrans64, 4>; def : InstRW<[WriteCopy], (instrs COPY)>; @@ -176,6 +187,7 @@ def : HWVALUWriteRes<WriteFloatFMA, 16>; def : HWVALUWriteRes<WriteDouble, 16>; def : HWVALUWriteRes<WriteDoubleAdd, 8>; def : HWVALUWriteRes<WriteDoubleCvt, 4>; +def : HWVALUWriteRes<WriteTrans64, 16>; def : InstRW<[WriteCopy], (instrs COPY)>; @@ -186,17 +198,20 @@ let SchedModel = GFX10SpeedModel in { // The latency values are 1 / (operations / cycle). // Add 1 stall cycle for VGPR read. def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>; -def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 9>; -def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 17>; +def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>; +def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>; +def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>; +def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>; def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>; -def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 17>; -def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 17>; -def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 17>; +def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 22>; +def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 22>; +def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 22>; +def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 24>; def : HWWriteRes<WriteBranch, [HWBranch], 32>; def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>; def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>; -def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 5>; +def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>; def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>; def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>; def : HWWriteRes<WriteBarrier, [HWBranch], 2000>; diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 3986ca6dfa813..9c6833a7dab61 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -185,6 +185,11 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { if (!MI.getOperand(0).isReg()) TII->commuteInstruction(MI, false, 0, 1); + // cmpk requires src0 to be a register + const MachineOperand &Src0 = MI.getOperand(0); + if (!Src0.isReg()) + return; + const MachineOperand &Src1 = MI.getOperand(1); if (!Src1.isImm()) 
return; @@ -220,7 +225,7 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); - if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) + if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) return; MachineFunction *MF = MI.getParent()->getParent(); @@ -323,60 +328,61 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST, MachineOperand *SrcReg = Src0; MachineOperand *SrcImm = Src1; - if (SrcImm->isImm() && - !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) { - uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm()); - uint32_t NewImm = 0; - - if (Opc == AMDGPU::S_AND_B32) { - if (isPowerOf2_32(~Imm)) { - NewImm = countTrailingOnes(Imm); - Opc = AMDGPU::S_BITSET0_B32; - } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { - NewImm = ~Imm; - Opc = AMDGPU::S_ANDN2_B32; - } - } else if (Opc == AMDGPU::S_OR_B32) { - if (isPowerOf2_32(Imm)) { - NewImm = countTrailingZeros(Imm); - Opc = AMDGPU::S_BITSET1_B32; - } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { - NewImm = ~Imm; - Opc = AMDGPU::S_ORN2_B32; - } - } else if (Opc == AMDGPU::S_XOR_B32) { - if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { - NewImm = ~Imm; - Opc = AMDGPU::S_XNOR_B32; - } - } else { - llvm_unreachable("unexpected opcode"); - } + if (!SrcImm->isImm() || + AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) + return false; + + uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm()); + uint32_t NewImm = 0; - if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) && - SrcImm == Src0) { - if (!TII->commuteInstruction(MI, false, 1, 2)) - NewImm = 0; + if (Opc == AMDGPU::S_AND_B32) { + if (isPowerOf2_32(~Imm)) { + NewImm = countTrailingOnes(Imm); + Opc = AMDGPU::S_BITSET0_B32; + } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + NewImm = ~Imm; + Opc = AMDGPU::S_ANDN2_B32; + } + } else if (Opc == AMDGPU::S_OR_B32) { + if (isPowerOf2_32(Imm)) { + NewImm = countTrailingZeros(Imm); + Opc = AMDGPU::S_BITSET1_B32; + } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + NewImm = ~Imm; + Opc = AMDGPU::S_ORN2_B32; + } + } else if (Opc == AMDGPU::S_XOR_B32) { + if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + NewImm = ~Imm; + Opc = AMDGPU::S_XNOR_B32; } + } else { + llvm_unreachable("unexpected opcode"); + } - if (NewImm != 0) { - if (Register::isVirtualRegister(Dest->getReg()) && SrcReg->isReg()) { - MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); - MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); - return true; - } + if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) && + SrcImm == Src0) { + if (!TII->commuteInstruction(MI, false, 1, 2)) + NewImm = 0; + } - if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) { - MI.setDesc(TII->get(Opc)); - if (Opc == AMDGPU::S_BITSET0_B32 || - Opc == AMDGPU::S_BITSET1_B32) { - Src0->ChangeToImmediate(NewImm); - // Remove the immediate and add the tied input. 
- MI.getOperand(2).ChangeToRegister(Dest->getReg(), false); - MI.tieOperands(0, 2); - } else { - SrcImm->setImm(NewImm); - } + if (NewImm != 0) { + if (Register::isVirtualRegister(Dest->getReg()) && SrcReg->isReg()) { + MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); + MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); + return true; + } + + if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) { + MI.setDesc(TII->get(Opc)); + if (Opc == AMDGPU::S_BITSET0_B32 || + Opc == AMDGPU::S_BITSET1_B32) { + Src0->ChangeToImmediate(NewImm); + // Remove the immediate and add the tied input. + MI.getOperand(2).ChangeToRegister(Dest->getReg(), false); + MI.tieOperands(0, 2); + } else { + SrcImm->setImm(NewImm); } } } @@ -426,8 +432,7 @@ getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I, if (Register::isPhysicalRegister(Reg)) { Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I)); } else { - LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub); - Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger())); + Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub)); } } return TargetInstrInfo::RegSubRegPair(Reg, Sub); @@ -472,26 +477,30 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, if (!TRI.isVGPR(MRI, X)) return nullptr; - for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) { - if (YTop.getSubReg() != Tsub) - continue; - - MachineInstr &MovY = *YTop.getParent(); - if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 && - MovY.getOpcode() != AMDGPU::COPY) || - MovY.getOperand(1).getSubReg() != Tsub) + const unsigned SearchLimit = 16; + unsigned Count = 0; + for (auto Iter = std::next(MovT.getIterator()), + E = MovT.getParent()->instr_end(); + Iter != E && Count < SearchLimit; ++Iter, ++Count) { + + MachineInstr *MovY = &*Iter; + if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 && + MovY->getOpcode() != AMDGPU::COPY) || + !MovY->getOperand(1).isReg() || + MovY->getOperand(1).getReg() != T || + MovY->getOperand(1).getSubReg() != Tsub) continue; - Register Y = MovY.getOperand(0).getReg(); - unsigned Ysub = MovY.getOperand(0).getSubReg(); + Register Y = MovY->getOperand(0).getReg(); + unsigned Ysub = MovY->getOperand(0).getSubReg(); - if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent()) + if (!TRI.isVGPR(MRI, Y)) continue; MachineInstr *MovX = nullptr; - auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end(); - for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) { - if (instReadsReg(&*I, X, Xsub, TRI) || + for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator()); + I != IY; ++I) { + if (instReadsReg(&*I, X, Xsub, TRI) || instModifiesReg(&*I, Y, Ysub, TRI) || instModifiesReg(&*I, T, Tsub, TRI) || (MovX && instModifiesReg(&*I, X, Xsub, TRI))) { @@ -516,7 +525,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, MovX = &*I; } - if (!MovX || I == E) + if (!MovX) continue; LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY); @@ -533,7 +542,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, .addReg(X1.Reg, 0, X1.SubReg).getInstr(); } MovX->eraseFromParent(); - MovY.eraseFromParent(); + MovY->eraseFromParent(); MachineInstr *Next = &*std::next(MovT.getIterator()); if (MRI.use_nodbg_empty(T)) MovT.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 39f5df767977e..b1c73df269fb2 100644 --- 
a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -61,6 +61,7 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -154,7 +155,7 @@ private: LiveIntervals *LIS; DenseMap<const MachineInstr *, InstrInfo> Instructions; - DenseMap<MachineBasicBlock *, BlockInfo> Blocks; + MapVector<MachineBasicBlock *, BlockInfo> Blocks; SmallVector<MachineInstr *, 1> LiveMaskQueries; SmallVector<MachineInstr *, 4> LowerToMovInstrs; SmallVector<MachineInstr *, 4> LowerToCopyInstrs; @@ -170,8 +171,6 @@ private: void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); char analyzeFunction(MachineFunction &MF); - bool requiresCorrectState(const MachineInstr &MI) const; - MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before); MachineBasicBlock::iterator @@ -525,36 +524,6 @@ char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { return GlobalFlags; } -/// Whether \p MI really requires the exec state computed during analysis. -/// -/// Scalar instructions must occasionally be marked WQM for correct propagation -/// (e.g. thread masks leading up to branches), but when it comes to actual -/// execution, they don't care about EXEC. -bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const { - if (MI.isTerminator()) - return true; - - // Skip instructions that are not affected by EXEC - if (TII->isScalarUnit(MI)) - return false; - - // Generic instructions such as COPY will either disappear by register - // coalescing or be lowered to SALU or VALU instructions. - if (MI.isTransient()) { - if (MI.getNumExplicitOperands() >= 1) { - const MachineOperand &Op = MI.getOperand(0); - if (Op.isReg()) { - if (TRI->isSGPRReg(*MRI, Op.getReg())) { - // SGPR instructions are not affected by EXEC - return false; - } - } - } - } - - return true; -} - MachineBasicBlock::iterator SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before) { @@ -741,7 +710,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, if (II != IE) { MachineInstr &MI = *II; - if (requiresCorrectState(MI)) { + if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) { auto III = Instructions.find(&MI); if (III != Instructions.end()) { if (III->second.Needs & StateWWM) @@ -793,18 +762,23 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, if (State == StateWWM) { assert(SavedNonWWMReg); fromWWM(MBB, Before, SavedNonWWMReg); + LIS->createAndComputeVirtRegInterval(SavedNonWWMReg); + SavedNonWWMReg = 0; State = NonWWMState; } if (Needs == StateWWM) { NonWWMState = State; + assert(!SavedNonWWMReg); SavedNonWWMReg = MRI->createVirtualRegister(BoolRC); toWWM(MBB, Before, SavedNonWWMReg); State = StateWWM; } else { if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) { - if (!WQMFromExec && (OutNeeds & StateWQM)) + if (!WQMFromExec && (OutNeeds & StateWQM)) { + assert(!SavedWQMReg); SavedWQMReg = MRI->createVirtualRegister(BoolRC); + } toExact(MBB, Before, SavedWQMReg, LiveMaskReg); State = StateExact; @@ -837,6 +811,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, break; II = Next; } + assert(!SavedWQMReg); + assert(!SavedNonWWMReg); } void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { @@ -929,10 +905,12 @@ 
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { if (GlobalFlags == StateWQM) { // For a shader that needs only WQM, we can just set it once. - BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ? - AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), - Exec) - .addReg(Exec); + auto MI = BuildMI(Entry, EntryMI, DebugLoc(), + TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32 + : AMDGPU::S_WQM_B64), + Exec) + .addReg(Exec); + LIS->InsertMachineInstrInMaps(*MI); lowerCopyInstrs(); // EntryMI may become invalid here @@ -948,6 +926,9 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { for (auto BII : Blocks) processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin()); + if (LiveMaskReg) + LIS->createAndComputeVirtRegInterval(LiveMaskReg); + // Physical registers like SCC aren't tracked by default anyway, so just // removing the ranges we computed is the simplest option for maintaining // the analysis results. diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 79982d96c2c8e..70bf215c03f3f 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -1,4 +1,4 @@ -//===---- SMInstructions.td - Scalar Memory Instruction Defintions --------===// +//===---- SMInstructions.td - Scalar Memory Instruction Definitions -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,9 +11,11 @@ def smrd_offset_8 : NamedOperandU32<"SMRDOffset8", let OperandType = "OPERAND_IMMEDIATE"; } -def smrd_offset_20 : NamedOperandU32<"SMRDOffset20", - NamedMatchClass<"SMRDOffset20">> { +def smem_offset : NamedOperandU32<"SMEMOffset", + NamedMatchClass<"SMEMOffset">> { let OperandType = "OPERAND_IMMEDIATE"; + let EncoderMethod = "getSMEMOffsetEncoding"; + let DecoderMethod = "decodeSMEMOffset"; } //===----------------------------------------------------------------------===// @@ -43,6 +45,7 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt bit has_dlc = 0; bits<1> has_offset = 1; bits<1> offset_is_imm = 0; + bit is_buffer = 0; } class SM_Real <SM_Pseudo ps> @@ -51,9 +54,15 @@ class SM_Real <SM_Pseudo ps> let isPseudo = 0; let isCodeGenOnly = 0; + Instruction Opcode = !cast<Instruction>(NAME); + // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; let AsmMatchConverter = ps.AsmMatchConverter; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let SMRD = ps.SMRD; + + bit is_buffer = ps.is_buffer; // encoding bits<7> sbase; @@ -153,7 +162,7 @@ multiclass SM_Pseudo_Stores<string opName, } multiclass SM_Pseudo_Discards<string opName> { - def _IMM : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, smrd_offset_20:$offset), 1>; + def _IMM : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, smem_offset:$offset), 1>; def _SGPR : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, SReg_32:$offset), 0>; } @@ -178,14 +187,14 @@ class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pse class SM_Inval_Pseudo <string opName, SDPatternOperator node = null_frag> : SM_Pseudo< opName, (outs), (ins), "", [(node)]> { let hasSideEffects = 1; - let mayStore = 1; + let mayStore = 0; let has_sdst = 0; let has_sbase = 0; let has_offset = 0; } multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> { - def _IMM : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, smrd_offset_20:$offset), 1>; + def _IMM : SM_Probe_Pseudo 
<opName, (ins i8imm:$sdata, baseClass:$sbase, smem_offset:$offset), 1>; def _SGPR : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, SReg_32:$offset), 0>; } @@ -228,7 +237,7 @@ class SM_Pseudo_Atomic<string opName, SM_Atomic_Pseudo<opName, !if(isRet, (outs dataClass:$sdst), (outs)), !if(isImm, - (ins dataClass:$sdata, baseClass:$sbase, smrd_offset_20:$offset, DLC:$dlc), + (ins dataClass:$sdata, baseClass:$sbase, smem_offset:$offset, DLC:$dlc), (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset, DLC:$dlc)), !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", "") # "$dlc", isRet> { @@ -266,6 +275,7 @@ defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_load_dwordx4", SReg_64, SReg_128>; defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <"s_load_dwordx8", SReg_64, SReg_256>; defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <"s_load_dwordx16", SReg_64, SReg_512>; +let is_buffer = 1 in { defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads < "s_buffer_load_dword", SReg_128, SReg_32_XM0_XEXEC >; @@ -287,12 +297,14 @@ defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads < defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads < "s_buffer_load_dwordx16", SReg_128, SReg_512 >; +} let SubtargetPredicate = HasScalarStores in { defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>; defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64_XEXEC>; defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>; +let is_buffer = 1 in { defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores < "s_buffer_store_dword", SReg_128, SReg_32_XM0_XEXEC >; @@ -304,8 +316,10 @@ defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores < defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores < "s_buffer_store_dwordx4", SReg_128, SReg_128 >; +} } // End SubtargetPredicate = HasScalarStores +let SubtargetPredicate = HasSMemTimeInst in def S_MEMTIME : SM_Time_Pseudo <"s_memtime", int_amdgcn_s_memtime>; def S_DCACHE_INV : SM_Inval_Pseudo <"s_dcache_inv", int_amdgcn_s_dcache_inv>; @@ -321,13 +335,16 @@ def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>; defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>; +let is_buffer = 1 in { defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>; +} } // SubtargetPredicate = isGFX8Plus -let SubtargetPredicate = isGFX10Plus in { +let SubtargetPredicate = isGFX10Plus in def S_GL1_INV : SM_Inval_Pseudo<"s_gl1_inv">; +let SubtargetPredicate = HasGetWaveIdInst in def S_GET_WAVEID_IN_WORKGROUP : SM_WaveId_Pseudo <"s_get_waveid_in_workgroup", int_amdgcn_s_get_waveid_in_workgroup>; -} // End SubtargetPredicate = isGFX10Plus + let SubtargetPredicate = HasScalarFlatScratchInsts, Uses = [FLAT_SCR] in { defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>; @@ -341,6 +358,7 @@ defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <"s_scratch_store_dwordx4", SReg let SubtargetPredicate = HasScalarAtomics in { +let is_buffer = 1 in { defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_buffer_atomic_swap", SReg_128, SReg_32_XM0_XEXEC>; defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap", SReg_128, SReg_64_XEXEC>; defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics <"s_buffer_atomic_add", SReg_128, SReg_32_XM0_XEXEC>; @@ -368,6 +386,7 @@ defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_or_x2", defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_xor_x2", SReg_128, 
SReg_64_XEXEC>; defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_inc_x2", SReg_128, SReg_64_XEXEC>; defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_dec_x2", SReg_128, SReg_64_XEXEC>; +} defm S_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_atomic_swap", SReg_64, SReg_32_XM0_XEXEC>; defm S_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_atomic_cmpswap", SReg_64, SReg_64_XEXEC>; @@ -481,14 +500,17 @@ class SMEM_Real_vi <bits<8> op, SM_Pseudo ps> let Inst{17} = imm; let Inst{25-18} = op; let Inst{31-26} = 0x30; //encoding - let Inst{51-32} = !if(ps.has_offset, offset{19-0}, ?); + + // VI supports 20-bit unsigned offsets while GFX9+ supports 21-bit signed. + // Offset value is corrected accordingly when offset is encoded/decoded. + let Inst{52-32} = !if(ps.has_offset, offset{20-0}, ?); } multiclass SM_Real_Loads_vi<bits<8> op, string ps, SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM), SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { def _IMM_vi : SMEM_Real_vi <op, immPs> { - let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc); + let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc); } def _SGPR_vi : SMEM_Real_vi <op, sgprPs> { let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); @@ -509,7 +531,7 @@ multiclass SM_Real_Stores_vi<bits<8> op, string ps, // FIXME: The operand name $offset is inconsistent with $soff used // in the pseudo def _IMM_vi : SMEM_Real_Store_vi <op, immPs> { - let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc); + let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc); } def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> { @@ -665,12 +687,10 @@ class SMRD_Real_Load_IMM_ci <bits<5> op, SM_Load_Pseudo ps> : let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc, DLC:$dlc); let LGKM_CNT = ps.LGKM_CNT; - let SMRD = ps.SMRD; let mayLoad = ps.mayLoad; let mayStore = ps.mayStore; let hasSideEffects = ps.hasSideEffects; let SchedRW = ps.SchedRW; - let UseNamedOperandTable = ps.UseNamedOperandTable; let Inst{7-0} = 0xff; let Inst{8} = 0; @@ -768,23 +788,26 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> { multiclass SMLoad_Pattern <string Instr, ValueType vt> { // 1. Offset as an immediate def : GCNPat < - (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc, i1:$dlc), - (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc), - (as_i1imm $dlc))) - >; + (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy), + (vt (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_glc $cachepolicy), + (extract_dlc $cachepolicy)))> { + let AddedComplexity = 2; + } // 2. 32-bit IMM offset on CI def : GCNPat < - (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc, i1:$dlc)), - (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc), (as_i1imm $dlc))> { + (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)), + (!cast<InstSI>(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset, + (extract_glc $cachepolicy), (extract_dlc $cachepolicy))> { let OtherPredicates = [isGFX7Only]; + let AddedComplexity = 1; } // 3. 
Offset loaded in an 32bit SGPR def : GCNPat < - (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc, i1:$dlc), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc), - (as_i1imm $dlc))) + (SIsbuffer_load v4i32:$sbase, i32:$offset, timm:$cachepolicy), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$offset, (extract_glc $cachepolicy), + (extract_dlc $cachepolicy))) >; } @@ -805,8 +828,13 @@ foreach vt = SReg_128.RegTypes in { defm : SMRD_Pattern <"S_LOAD_DWORDX4", vt>; } -defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; +foreach vt = SReg_256.RegTypes in { +defm : SMRD_Pattern <"S_LOAD_DWORDX8", vt>; +} + +foreach vt = SReg_512.RegTypes in { +defm : SMRD_Pattern <"S_LOAD_DWORDX16", vt>; +} defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>; @@ -821,10 +849,21 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>; } // End let AddedComplexity = 100 +let OtherPredicates = [HasSMemTimeInst] in { def : GCNPat < (i64 (readcyclecounter)), (S_MEMTIME) >; +} // let OtherPredicates = [HasSMemTimeInst] + +let OtherPredicates = [HasNoSMemTimeInst] in { +def : GCNPat < + (i64 (readcyclecounter)), + (REG_SEQUENCE SReg_64, + (S_GETREG_B32 getHwRegImm<HWREG.SHADER_CYCLES, 0, -12>.ret), sub0, + (S_MOV_B32 (i32 0)), sub1) +>; +} // let OtherPredicates = [HasNoSMemTimeInst] //===----------------------------------------------------------------------===// // GFX10. @@ -844,7 +883,7 @@ class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps> : let Inst{16} = !if(ps.has_glc, glc, ?); let Inst{25-18} = op; let Inst{31-26} = 0x3d; - let Inst{51-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{19-0}, ?), ?); + let Inst{52-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{20-0}, ?), ?); let Inst{63-57} = !if(ps.offset_is_imm, !cast<int>(SGPR_NULL.HWEncoding), !if(ps.has_offset, offset{6-0}, ?)); } @@ -853,7 +892,7 @@ multiclass SM_Real_Loads_gfx10<bits<8> op, string ps, SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM), SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { def _IMM_gfx10 : SMEM_Real_gfx10<op, immPs> { - let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc); + let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc); } def _SGPR_gfx10 : SMEM_Real_gfx10<op, sgprPs> { let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc); @@ -873,7 +912,7 @@ multiclass SM_Real_Stores_gfx10<bits<8> op, string ps, // FIXME: The operand name $offset is inconsistent with $soff used // in the pseudo def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs> { - let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc); + let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc); } def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs> { @@ -1020,3 +1059,12 @@ defm S_DCACHE_DISCARD : SM_Real_Discard_gfx10 <0x28, "S_DCACHE_DISCARD">; defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_gfx10 <0x29, "S_DCACHE_DISCARD_X2">; } // End SubtargetPredicate = HasScalarAtomics + +def SMInfoTable : GenericTable { + let FilterClass = "SM_Real"; + let CppTypeName = "SMInfo"; + let Fields = ["Opcode", "is_buffer"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getSMEMOpcodeHelper"; +} diff --git 
a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 73ba2ae367f7b..9d7b25d552170 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1,4 +1,4 @@ -//===-- SOPInstructions.td - SOP Instruction Defintions -------------------===// +//===-- SOPInstructions.td - SOP Instruction Definitions ------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -97,6 +97,17 @@ class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo < let has_sdst = 0; } +// Special case for movreld where sdst is treated as a use operand. +class SOP1_32_movreld <string opName, list<dag> pattern=[]> : SOP1_Pseudo < + opName, (outs), (ins SReg_32:$sdst, SSrc_b32:$src0), + "$sdst, $src0", pattern>; + +// Special case for movreld where sdst is treated as a use operand. +class SOP1_64_movreld <string opName, list<dag> pattern=[]> : SOP1_Pseudo < + opName, (outs), (ins SReg_64:$sdst, SSrc_b64:$src0), + "$sdst, $src0", pattern +>; + class SOP1_0_32R <string opName, list<dag> pattern = []> : SOP1_Pseudo < opName, (outs), (ins SReg_32:$src0), "$src0", pattern> { @@ -199,7 +210,9 @@ def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64", def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">; def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">; -def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">; +def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64", + [(set i32:$sdst, (AMDGPUffbl_b32 i64:$src0))] +>; def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32", [(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))] @@ -209,7 +222,9 @@ def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32", [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))] >; -def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64">; +def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64", + [(set i32:$sdst, (AMDGPUffbh_u32 i64:$src0))] +>; def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32", [(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))] >; @@ -267,8 +282,8 @@ def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64">; let Uses = [M0] in { def S_MOVRELS_B32 : SOP1_32R <"s_movrels_b32">; def S_MOVRELS_B64 : SOP1_64R <"s_movrels_b64">; -def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">; -def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">; +def S_MOVRELD_B32 : SOP1_32_movreld <"s_movreld_b32">; +def S_MOVRELD_B64 : SOP1_64_movreld <"s_movreld_b64">; } // End Uses = [M0] let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in { @@ -283,8 +298,8 @@ def S_MOV_FED_B32 : SOP1_32 <"s_mov_fed_b32">; let SubtargetPredicate = HasVGPRIndexMode in { def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> { - let Uses = [M0]; - let Defs = [M0]; + let Uses = [M0, MODE]; + let Defs = [M0, MODE]; } } @@ -401,8 +416,14 @@ class UniformUnaryFrag<SDPatternOperator Op> : PatFrag < class UniformBinFrag<SDPatternOperator Op> : PatFrag < (ops node:$src0, node:$src1), (Op $src0, $src1), - [{ return !N->isDivergent(); }] ->; + [{ return !N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. 
+ let GISelPredicateCode = [{return true;}]; +} let Defs = [SCC] in { // Carry out goes to SCC let isCommutable = 1 in { @@ -444,9 +465,19 @@ def S_MAX_U32 : SOP2_32 <"s_max_u32", } // End isCommutable = 1 } // End Defs = [SCC] +class SelectPat<SDPatternOperator select> : PatFrag < + (ops node:$src1, node:$src2), + (select SCC, $src1, $src2), + [{ return N->getOperand(0)->hasOneUse() && !N->isDivergent(); }] +>; let Uses = [SCC] in { - def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32">; + let AddedComplexity = 20 in { + def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32", + [(set i32:$sdst, (SelectPat<select> i32:$src0, i32:$src1))] + >; + } + def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">; } // End Uses = [SCC] @@ -524,22 +555,22 @@ let AddedComplexity = 1 in { let Defs = [SCC] in { // TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3 def S_LSHL_B32 : SOP2_32 <"s_lshl_b32", - [(set SReg_32:$sdst, (shl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))] + [(set SReg_32:$sdst, (UniformBinFrag<shl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))] >; def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64", - [(set SReg_64:$sdst, (shl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))] + [(set SReg_64:$sdst, (UniformBinFrag<shl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))] >; def S_LSHR_B32 : SOP2_32 <"s_lshr_b32", - [(set SReg_32:$sdst, (srl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))] + [(set SReg_32:$sdst, (UniformBinFrag<srl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))] >; def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64", - [(set SReg_64:$sdst, (srl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))] + [(set SReg_64:$sdst, (UniformBinFrag<srl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))] >; def S_ASHR_I32 : SOP2_32 <"s_ashr_i32", - [(set SReg_32:$sdst, (sra (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))] + [(set SReg_32:$sdst, (UniformBinFrag<sra> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))] >; def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64", - [(set SReg_64:$sdst, (sra (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))] + [(set SReg_64:$sdst, (UniformBinFrag<sra> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))] >; } // End Defs = [SCC] @@ -592,14 +623,26 @@ let SubtargetPredicate = isGFX9Plus in { def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">; let Defs = [SCC] in { - def S_LSHL1_ADD_U32 : SOP2_32<"s_lshl1_add_u32">; - def S_LSHL2_ADD_U32 : SOP2_32<"s_lshl2_add_u32">; - def S_LSHL3_ADD_U32 : SOP2_32<"s_lshl3_add_u32">; - def S_LSHL4_ADD_U32 : SOP2_32<"s_lshl4_add_u32">; + def S_LSHL1_ADD_U32 : SOP2_32<"s_lshl1_add_u32", + [(set i32:$sdst, (shl1_add SSrc_b32:$src0, SSrc_b32:$src1))] + >; + def S_LSHL2_ADD_U32 : SOP2_32<"s_lshl2_add_u32", + [(set i32:$sdst, (shl2_add SSrc_b32:$src0, SSrc_b32:$src1))] + >; + def S_LSHL3_ADD_U32 : SOP2_32<"s_lshl3_add_u32", + [(set i32:$sdst, (shl3_add SSrc_b32:$src0, SSrc_b32:$src1))] + >; + def S_LSHL4_ADD_U32 : SOP2_32<"s_lshl4_add_u32", + [(set i32:$sdst, (shl4_add SSrc_b32:$src0, SSrc_b32:$src1))] + >; } // End Defs = [SCC] - def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32">; - def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32">; + let isCommutable = 1 in { + def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32", + [(set i32:$sdst, (UniformBinFrag<mulhu> SSrc_b32:$src0, SSrc_b32:$src1))]>; + def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32", + [(set i32:$sdst, (UniformBinFrag<mulhs> SSrc_b32:$src0, SSrc_b32:$src1))]>; + } } // End SubtargetPredicate = isGFX9Plus //===----------------------------------------------------------------------===// @@ -760,7 +803,11 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo < "$sdst, $simm16" >; +let 
hasSideEffects = 1 in { + let mayLoad = 1 in { +// s_getreg_b32 should use hasSideEffects = 1 for tablegen to allow +// its use in the readcyclecounter selection. def S_GETREG_B32 : SOPK_Pseudo < "s_getreg_b32", (outs SReg_32:$sdst), (ins hwreg:$simm16), @@ -768,14 +815,20 @@ def S_GETREG_B32 : SOPK_Pseudo < >; } -let hasSideEffects = 1 in { +let mayLoad = 0, mayStore =0 in { def S_SETREG_B32 : SOPK_Pseudo < "s_setreg_b32", (outs), (ins SReg_32:$sdst, hwreg:$simm16), "$simm16, $sdst", - [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))] ->; + [(int_amdgcn_s_setreg (i32 timm:$simm16), i32:$sdst)]> { + + // Use custom inserter to optimize some cases to + // S_DENORM_MODE/S_ROUND_MODE. + let usesCustomInserter = 1; + let Defs = [MODE]; + let Uses = [MODE]; +} // FIXME: Not on SI? //def S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32">; @@ -786,8 +839,11 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo < "$simm16, $imm"> { let Size = 8; // Unlike every other SOPK instruction. let has_sdst = 0; + let Defs = [MODE]; + let Uses = [MODE]; } +} } // End hasSideEffects = 1 class SOPK_WAITCNT<string opName, list<dag> pat=[]> : @@ -920,12 +976,16 @@ def S_CMP_LG_U64 : SOPC_CMP_64 <0x13, "s_cmp_lg_u64", COND_NE>; } // End SubtargetPredicate = isGFX8Plus let SubtargetPredicate = HasVGPRIndexMode in { +// Setting the GPR index mode is really writing the fields in the mode +// register. We don't want to add mode register uses to every +// instruction, and it's too complicated to deal with anyway. This is +// modeled just as a side effect. def S_SET_GPR_IDX_ON : SOPC <0x11, (outs), (ins SSrc_b32:$src0, GPRIdxMode:$src1), "s_set_gpr_idx_on $src0,$src1"> { - let Defs = [M0]; // No scc def - let Uses = [M0]; // Other bits of m0 unmodified. + let Defs = [M0, MODE]; // No scc def + let Uses = [M0, MODE]; // Other bits of mode, m0 unmodified. 
let hasSideEffects = 1; // Sets mode.gpr_idx_en let FixedSize = 1; } @@ -1099,7 +1159,7 @@ def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> { let mayStore = 1; } -let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16", [(int_amdgcn_s_waitcnt timm:$simm16)]>; def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; @@ -1112,8 +1172,8 @@ def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">; def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16), "s_sleep $simm16", [(int_amdgcn_s_sleep timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 1; - let mayStore = 1; + let mayLoad = 0; + let mayStore = 0; } def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">; @@ -1138,14 +1198,14 @@ def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { def S_INCPERFLEVEL : SOPP <0x00000014, (ins i32imm:$simm16), "s_incperflevel $simm16", [(int_amdgcn_s_incperflevel timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 1; - let mayStore = 1; + let mayLoad = 0; + let mayStore = 0; } def S_DECPERFLEVEL : SOPP <0x00000015, (ins i32imm:$simm16), "s_decperflevel $simm16", [(int_amdgcn_s_decperflevel timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 1; - let mayStore = 1; + let mayLoad = 0; + let mayStore = 0; } def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { let simm16 = 0; @@ -1154,6 +1214,8 @@ def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { let SubtargetPredicate = HasVGPRIndexMode in { def S_SET_GPR_IDX_OFF : SOPP<0x1c, (ins), "s_set_gpr_idx_off"> { let simm16 = 0; + let Defs = [MODE]; + let Uses = [MODE]; } } } // End hasSideEffects @@ -1161,7 +1223,8 @@ def S_SET_GPR_IDX_OFF : SOPP<0x1c, (ins), "s_set_gpr_idx_off"> { let SubtargetPredicate = HasVGPRIndexMode in { def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16), "s_set_gpr_idx_mode$simm16"> { - let Defs = [M0]; + let Defs = [M0, MODE]; + let Uses = [MODE]; } } @@ -1176,13 +1239,15 @@ let SubtargetPredicate = isGFX10Plus in { } def S_WAITCNT_DEPCTR : SOPP <0x023, (ins s16imm:$simm16), "s_waitcnt_depctr $simm16">; - def S_ROUND_MODE : - SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">; - def S_DENORM_MODE : - SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16", - [(SIdenorm_mode (i32 timm:$simm16))]> { - let hasSideEffects = 1; - } + + let hasSideEffects = 0, Uses = [MODE], Defs = [MODE] in { + def S_ROUND_MODE : + SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">; + def S_DENORM_MODE : + SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16", + [(SIdenorm_mode (i32 timm:$simm16))]>; + } + def S_TTRACEDATA_IMM : SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">; } // End SubtargetPredicate = isGFX10Plus @@ -1223,7 +1288,7 @@ def : GCNPat < // Same as a 32-bit inreg def : GCNPat< - (i32 (sext i16:$src)), + (i32 (UniformUnaryFrag<sext> i16:$src)), (S_SEXT_I32_I16 $src) >; @@ -1250,7 +1315,7 @@ def : GCNPat< >; def : GCNPat < - (i64 (sext i16:$src)), + (i64 (UniformUnaryFrag<sext> i16:$src)), (REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0, (i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1) >; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 075e08986c0c0..5819a621f55d6 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ 
b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -78,7 +78,11 @@ const char* const IdSymbolic[] = { "HW_REG_XNACK_MASK", nullptr, // HW_ID1, no predictable values nullptr, // HW_ID2, no predictable values - "HW_REG_POPS_PACKER" + "HW_REG_POPS_PACKER", + nullptr, + nullptr, + nullptr, + "HW_REG_SHADER_CYCLES" }; } // namespace Hwreg diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 5271bc3aacc65..00e6d517bde58 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -108,6 +108,7 @@ namespace AMDGPU { #define GET_MIMGInfoTable_IMPL #define GET_MIMGLZMappingTable_IMPL #define GET_MIMGMIPMappingTable_IMPL +#define GET_MIMGG16MappingTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -148,10 +149,17 @@ struct MTBUFInfo { bool has_soffset; }; +struct SMInfo { + uint16_t Opcode; + bool IsBuffer; +}; + #define GET_MTBUFInfoTable_DECL #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL #define GET_MUBUFInfoTable_IMPL +#define GET_SMInfoTable_DECL +#define GET_SMInfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMTBUFBaseOpcode(unsigned Opc) { @@ -214,6 +222,11 @@ bool getMUBUFHasSoffset(unsigned Opc) { return Info ? Info->has_soffset : false; } +bool getSMEMIsBuffer(unsigned Opc) { + const SMInfo *Info = getSMEMOpcodeHelper(Opc); + return Info ? Info->IsBuffer : false; +} + // Wrapper for Tablegen'd function. enum Subtarget is not defined in any // header files, so we need to wrap it in a function that takes unsigned // instead. @@ -268,6 +281,13 @@ unsigned getLocalMemorySize(const MCSubtargetInfo *STI) { } unsigned getEUsPerCU(const MCSubtargetInfo *STI) { + // "Per CU" really means "per whatever functional block the waves of a + // workgroup must share". For gfx10 in CU mode this is the CU, which contains + // two SIMDs. + if (isGFX10(*STI) && STI->getFeatureBits().test(FeatureCuMode)) + return 2; + // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains + // two CUs, so a total of four SIMDs. return 4; } @@ -283,15 +303,6 @@ unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI, return std::min(N, 16u); } -unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) { - return getMaxWavesPerEU(STI) * getEUsPerCU(STI); -} - -unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI, - unsigned FlatWorkGroupSize) { - return getWavesPerWorkGroup(STI, FlatWorkGroupSize); -} - unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) { return 1; } @@ -300,13 +311,13 @@ unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) { // FIXME: Need to take scratch memory into account. if (!isGFX10(*STI)) return 10; - return 20; + return hasGFX10_3Insts(*STI) ? 
16 : 20; } -unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI, - unsigned FlatWorkGroupSize) { - return alignTo(getMaxWavesPerCU(STI, FlatWorkGroupSize), - getEUsPerCU(STI)) / getEUsPerCU(STI); +unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI, + unsigned FlatWorkGroupSize) { + return divideCeil(getWavesPerWorkGroup(STI, FlatWorkGroupSize), + getEUsPerCU(STI)); } unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI) { @@ -320,8 +331,7 @@ unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI) { unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize) { - return alignTo(FlatWorkGroupSize, getWavefrontSize(STI)) / - getWavefrontSize(STI); + return divideCeil(FlatWorkGroupSize, getWavefrontSize(STI)); } unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI) { @@ -431,12 +441,21 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, bool IsWave32 = EnableWavefrontSize32 ? *EnableWavefrontSize32 : STI->getFeatureBits().test(FeatureWavefrontSize32); + + if (hasGFX10_3Insts(*STI)) + return IsWave32 ? 16 : 8; + return IsWave32 ? 8 : 4; } unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, Optional<bool> EnableWavefrontSize32) { - return getVGPRAllocGranule(STI, EnableWavefrontSize32); + + bool IsWave32 = EnableWavefrontSize32 ? + *EnableWavefrontSize32 : + STI->getFeatureBits().test(FeatureWavefrontSize32); + + return IsWave32 ? 8 : 4; } unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) { @@ -559,7 +578,7 @@ bool isReadOnlySegment(const GlobalValue *GV) { } bool shouldEmitConstantsToTextSection(const Triple &TT) { - return TT.getOS() == Triple::AMDPAL; + return TT.getOS() == Triple::AMDPAL || TT.getArch() == Triple::r600; } int getIntegerAttribute(const Function &F, StringRef Name, int Default) { @@ -722,13 +741,16 @@ static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) { return ID_SYMBOLIC_FIRST_GFX9_; else if (isGFX9(STI)) return ID_SYMBOLIC_FIRST_GFX10_; + else if (isGFX10(STI) && !isGFX10_BEncoding(STI)) + return ID_SYMBOLIC_FIRST_GFX1030_; else return ID_SYMBOLIC_LAST_; } bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) { - return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) && - IdSymbolic[Id]; + return + ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) && + IdSymbolic[Id] && (Id != ID_XNACK_MASK || !AMDGPU::isGFX10_BEncoding(STI)); } bool isValidHwreg(int64_t Id) { @@ -927,7 +949,15 @@ bool hasSRAMECC(const MCSubtargetInfo &STI) { } bool hasMIMG_R128(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128]; + return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128] && !STI.getFeatureBits()[AMDGPU::FeatureR128A16]; +} + +bool hasGFX10A16(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX10A16]; +} + +bool hasG16(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureG16]; } bool hasPackedD16(const MCSubtargetInfo &STI) { @@ -958,9 +988,17 @@ bool isGCN3Encoding(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]; } +bool isGFX10_BEncoding(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]; +} + +bool hasGFX10_3Insts(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts]; +} + bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); - const unsigned FirstSubReg = TRI->getSubReg(Reg, 1); + const unsigned FirstSubReg = 
TRI->getSubReg(Reg, AMDGPU::sub0); return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) || Reg == AMDGPU::SCC; } @@ -1082,6 +1120,11 @@ bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) { // (move from MC* level to Target* level). Return size in bits. unsigned getRegBitWidth(unsigned RCID) { switch (RCID) { + case AMDGPU::VGPR_LO16RegClassID: + case AMDGPU::VGPR_HI16RegClassID: + case AMDGPU::SGPR_LO16RegClassID: + case AMDGPU::AGPR_LO16RegClassID: + return 16; case AMDGPU::SGPR_32RegClassID: case AMDGPU::VGPR_32RegClassID: case AMDGPU::VRegOrLds_32RegClassID: @@ -1103,6 +1146,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::SGPR_96RegClassID: case AMDGPU::SReg_96RegClassID: case AMDGPU::VReg_96RegClassID: + case AMDGPU::AReg_96RegClassID: return 96; case AMDGPU::SGPR_128RegClassID: case AMDGPU::SReg_128RegClassID: @@ -1112,14 +1156,24 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::SGPR_160RegClassID: case AMDGPU::SReg_160RegClassID: case AMDGPU::VReg_160RegClassID: + case AMDGPU::AReg_160RegClassID: return 160; + case AMDGPU::SGPR_192RegClassID: + case AMDGPU::SReg_192RegClassID: + case AMDGPU::VReg_192RegClassID: + case AMDGPU::AReg_192RegClassID: + return 192; + case AMDGPU::SGPR_256RegClassID: case AMDGPU::SReg_256RegClassID: case AMDGPU::VReg_256RegClassID: + case AMDGPU::AReg_256RegClassID: return 256; + case AMDGPU::SGPR_512RegClassID: case AMDGPU::SReg_512RegClassID: case AMDGPU::VReg_512RegClassID: case AMDGPU::AReg_512RegClassID: return 512; + case AMDGPU::SGPR_1024RegClassID: case AMDGPU::SReg_1024RegClassID: case AMDGPU::VReg_1024RegClassID: case AMDGPU::AReg_1024RegClassID: @@ -1141,7 +1195,7 @@ unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, } bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) { - if (Literal >= -16 && Literal <= 64) + if (isInlinableIntLiteral(Literal)) return true; uint64_t Val = static_cast<uint64_t>(Literal); @@ -1158,7 +1212,7 @@ bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) { } bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) { - if (Literal >= -16 && Literal <= 64) + if (isInlinableIntLiteral(Literal)) return true; // The actual type of the operand does not seem to matter as long @@ -1187,7 +1241,7 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { if (!HasInv2Pi) return false; - if (Literal >= -16 && Literal <= 64) + if (isInlinableIntLiteral(Literal)) return true; uint16_t Val = static_cast<uint16_t>(Literal); @@ -1217,6 +1271,17 @@ bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); } +bool isInlinableIntLiteralV216(int32_t Literal) { + int16_t Lo16 = static_cast<int16_t>(Literal); + if (isInt<16>(Literal) || isUInt<16>(Literal)) + return isInlinableIntLiteral(Lo16); + + int16_t Hi16 = static_cast<int16_t>(Literal >> 16); + if (!(Literal & 0xffff)) + return isInlinableIntLiteral(Hi16); + return Lo16 == Hi16 && isInlinableIntLiteral(Lo16); +} + bool isArgPassedInSGPR(const Argument *A) { const Function *F = A->getParent(); @@ -1247,16 +1312,61 @@ static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) { return isGCN3Encoding(ST) || isGFX10(ST); } -int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { +static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) { + return isGFX9(ST) || isGFX10(ST); +} + +bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, + int64_t EncodedOffset) { + return hasSMEMByteOffset(ST) ? 
isUInt<20>(EncodedOffset) + : isUInt<8>(EncodedOffset); +} + +bool isLegalSMRDEncodedSignedOffset(const MCSubtargetInfo &ST, + int64_t EncodedOffset, + bool IsBuffer) { + return !IsBuffer && + hasSMRDSignedImmOffset(ST) && + isInt<21>(EncodedOffset); +} + +static bool isDwordAligned(uint64_t ByteOffset) { + return (ByteOffset & 3) == 0; +} + +uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, + uint64_t ByteOffset) { if (hasSMEMByteOffset(ST)) return ByteOffset; + + assert(isDwordAligned(ByteOffset)); return ByteOffset >> 2; } -bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { - int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset); - return (hasSMEMByteOffset(ST)) ? - isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset); +Optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST, + int64_t ByteOffset, bool IsBuffer) { + // The signed version is always a byte offset. + if (!IsBuffer && hasSMRDSignedImmOffset(ST)) { + assert(hasSMEMByteOffset(ST)); + return isInt<20>(ByteOffset) ? Optional<int64_t>(ByteOffset) : None; + } + + if (!isDwordAligned(ByteOffset) && !hasSMEMByteOffset(ST)) + return None; + + int64_t EncodedOffset = convertSMRDOffsetUnits(ST, ByteOffset); + return isLegalSMRDEncodedUnsignedOffset(ST, EncodedOffset) + ? Optional<int64_t>(EncodedOffset) + : None; +} + +Optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, + int64_t ByteOffset) { + if (!isCI(ST) || !isDwordAligned(ByteOffset)) + return None; + + int64_t EncodedOffset = convertSMRDOffsetUnits(ST, ByteOffset); + return isUInt<32>(EncodedOffset) ? Optional<int64_t>(EncodedOffset) : None; } // Given Imm, split it into the values to put into the SOffset and ImmOffset @@ -1267,8 +1377,8 @@ bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { // aligned if they are aligned to begin with. It also ensures that additional // offsets within the given alignment can be added to the resulting ImmOffset. bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, - const GCNSubtarget *Subtarget, uint32_t Align) { - const uint32_t MaxImm = alignDown(4095, Align); + const GCNSubtarget *Subtarget, Align Alignment) { + const uint32_t MaxImm = alignDown(4095, Alignment.value()); uint32_t Overflow = 0; if (Imm > MaxImm) { @@ -1286,10 +1396,10 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, // // Atomic operations fail to work correctly when individual address // components are unaligned, even if their sum is aligned. 
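
Editor's note: the reworked SMRD helpers above fold the old "encode, then check" pair into a single query that returns None when an offset cannot be encoded. A minimal standalone sketch of that rule, assuming std::optional in place of llvm::Optional and plain booleans in place of the subtarget queries; names and limits are restated from the hunk above, not a definitive reimplementation:

#include <cstdint>
#include <optional>

// Pre-GCN3 subtargets encode SMRD offsets in dwords; GCN3+ encode bytes.
// GFX9/GFX10 S_LOAD offsets are signed; S_BUFFER offsets stay unsigned.
std::optional<int64_t> encodeSMRDOffsetSketch(bool HasByteOffset,
                                              bool HasSignedOffset,
                                              bool IsBuffer,
                                              int64_t ByteOffset) {
  if (!IsBuffer && HasSignedOffset) {
    // Signed offsets are always byte offsets; isInt<20> in the hunk above.
    if (ByteOffset >= -(int64_t(1) << 19) && ByteOffset < (int64_t(1) << 19))
      return ByteOffset;
    return std::nullopt;
  }

  // Dword-offset subtargets can only encode dword-aligned byte offsets.
  if (!HasByteOffset && (ByteOffset & 3))
    return std::nullopt;

  int64_t Encoded = HasByteOffset ? ByteOffset : ByteOffset >> 2;
  int64_t Limit = HasByteOffset ? (int64_t(1) << 20)   // isUInt<20>
                                : (int64_t(1) << 8);   // isUInt<8>
  if (Encoded >= 0 && Encoded < Limit)
    return Encoded;
  return std::nullopt;
}
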
- uint32_t High = (Imm + Align) & ~4095; - uint32_t Low = (Imm + Align) & 4095; + uint32_t High = (Imm + Alignment.value()) & ~4095; + uint32_t Low = (Imm + Alignment.value()) & 4095; Imm = Low; - Overflow = High - Align; + Overflow = High - Alignment.value(); } } @@ -1305,8 +1415,7 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, return true; } -SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F, - const GCNSubtarget &ST) { +SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) { *this = getDefaultForCallingConv(F.getCallingConv()); StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString(); @@ -1318,8 +1427,25 @@ SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F, if (!DX10ClampAttr.empty()) DX10Clamp = DX10ClampAttr == "true"; - FP32Denormals = ST.hasFP32Denormals(F); - FP64FP16Denormals = ST.hasFP64FP16Denormals(F); + StringRef DenormF32Attr = F.getFnAttribute("denormal-fp-math-f32").getValueAsString(); + if (!DenormF32Attr.empty()) { + DenormalMode DenormMode = parseDenormalFPAttribute(DenormF32Attr); + FP32InputDenormals = DenormMode.Input == DenormalMode::IEEE; + FP32OutputDenormals = DenormMode.Output == DenormalMode::IEEE; + } + + StringRef DenormAttr = F.getFnAttribute("denormal-fp-math").getValueAsString(); + if (!DenormAttr.empty()) { + DenormalMode DenormMode = parseDenormalFPAttribute(DenormAttr); + + if (DenormF32Attr.empty()) { + FP32InputDenormals = DenormMode.Input == DenormalMode::IEEE; + FP32OutputDenormals = DenormMode.Output == DenormalMode::IEEE; + } + + FP64FP16InputDenormals = DenormMode.Input == DenormalMode::IEEE; + FP64FP16OutputDenormals = DenormMode.Output == DenormalMode::IEEE; + } } namespace { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index a5bada2890d2c..e71554575f6af 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -12,10 +12,10 @@ #include "AMDGPU.h" #include "AMDKernelCodeT.h" #include "SIDefines.h" -#include "llvm/ADT/StringRef.h" #include "llvm/IR/CallingConv.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetParser.h" @@ -26,17 +26,13 @@ namespace llvm { class Argument; -class AMDGPUSubtarget; -class FeatureBitset; class Function; class GCNSubtarget; class GlobalValue; -class MCContext; class MCRegisterClass; class MCRegisterInfo; -class MCSection; class MCSubtargetInfo; -class MachineMemOperand; +class StringRef; class Triple; namespace AMDGPU { @@ -87,15 +83,6 @@ unsigned getEUsPerCU(const MCSubtargetInfo *STI); unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize); -/// \returns Maximum number of waves per compute unit for given subtarget \p -/// STI without any kind of limitation. -unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI); - -/// \returns Maximum number of waves per compute unit for given subtarget \p -/// STI and limited by given \p FlatWorkGroupSize. -unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI, - unsigned FlatWorkGroupSize); - /// \returns Minimum number of waves per execution unit for given subtarget \p /// STI. unsigned getMinWavesPerEU(const MCSubtargetInfo *STI); @@ -104,10 +91,10 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI); /// STI without any kind of limitation. 
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI); -/// \returns Maximum number of waves per execution unit for given subtarget \p -/// STI and limited by given \p FlatWorkGroupSize. -unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI, - unsigned FlatWorkGroupSize); +/// \returns Number of waves per execution unit required to support the given \p +/// FlatWorkGroupSize. +unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI, + unsigned FlatWorkGroupSize); /// \returns Minimum flat work group size for given subtarget \p STI. unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI); @@ -116,7 +103,7 @@ unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI); unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI); /// \returns Number of waves per work group for given subtarget \p STI and -/// limited by given \p FlatWorkGroupSize. +/// \p FlatWorkGroupSize. unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize); @@ -211,6 +198,7 @@ struct MIMGBaseOpcodeInfo { uint8_t NumExtraArgs; bool Gradients; + bool G16; bool Coordinates; bool LodOrClampOrMip; bool HasD16; @@ -247,11 +235,19 @@ struct MIMGMIPMappingInfo { MIMGBaseOpcode NONMIP; }; +struct MIMGG16MappingInfo { + MIMGBaseOpcode G; + MIMGBaseOpcode G16; +}; + LLVM_READONLY const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L); LLVM_READONLY -const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned L); +const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP); + +LLVM_READONLY +const MIMGG16MappingInfo *getMIMGG16MappingInfo(unsigned G); LLVM_READONLY int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -308,6 +304,9 @@ LLVM_READONLY bool getMUBUFHasSoffset(unsigned Opc); LLVM_READONLY +bool getSMEMIsBuffer(unsigned Opc); + +LLVM_READONLY const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, uint8_t NumFormat, @@ -551,6 +550,8 @@ inline bool isKernel(CallingConv::ID CC) { bool hasXNACK(const MCSubtargetInfo &STI); bool hasSRAMECC(const MCSubtargetInfo &STI); bool hasMIMG_R128(const MCSubtargetInfo &STI); +bool hasGFX10A16(const MCSubtargetInfo &STI); +bool hasG16(const MCSubtargetInfo &STI); bool hasPackedD16(const MCSubtargetInfo &STI); bool isSI(const MCSubtargetInfo &STI); @@ -558,6 +559,9 @@ bool isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); bool isGFX9(const MCSubtargetInfo &STI); bool isGFX10(const MCSubtargetInfo &STI); +bool isGCN3Encoding(const MCSubtargetInfo &STI); +bool isGFX10_BEncoding(const MCSubtargetInfo &STI); +bool hasGFX10_3Insts(const MCSubtargetInfo &STI); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); @@ -633,6 +637,13 @@ inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) { return getOperandSize(Desc.OpInfo[OpNo]); } +/// Is this literal inlinable, and not one of the values intended for floating +/// point values. +LLVM_READNONE +inline bool isInlinableIntLiteral(int64_t Literal) { + return Literal >= -16 && Literal <= 64; +} + /// Is this literal inlinable LLVM_READNONE bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi); @@ -646,11 +657,35 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi); LLVM_READNONE bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); +LLVM_READNONE +bool isInlinableIntLiteralV216(int32_t Literal); + bool isArgPassedInSGPR(const Argument *Arg); -/// \returns The encoding that will be used for \p ByteOffset in the SMRD -/// offset field. 
-int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); +LLVM_READONLY +bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, + int64_t EncodedOffset); + +LLVM_READONLY +bool isLegalSMRDEncodedSignedOffset(const MCSubtargetInfo &ST, + int64_t EncodedOffset, + bool IsBuffer); + +/// Convert \p ByteOffset to dwords if the subtarget uses dword SMRD immediate +/// offsets. +uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset); + +/// \returns The encoding that will be used for \p ByteOffset in the +/// SMRD offset field, or None if it won't fit. On GFX9 and GFX10 +/// S_LOAD instructions have a signed offset, on other subtargets it is +/// unsigned. S_BUFFER has an unsigned offset for all subtargets. +Optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST, + int64_t ByteOffset, bool IsBuffer); + +/// \return The encoding that can be used for a 32-bit literal offset in an SMRD +/// instruction. This is only useful on CI.s +Optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, + int64_t ByteOffset); /// \returns true if this offset is small enough to fit in the SMRD /// offset field. \p ByteOffset should be the offset in bytes and @@ -658,7 +693,8 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, - const GCNSubtarget *Subtarget, uint32_t Align = 4); + const GCNSubtarget *Subtarget, + Align Alignment = Align(4)); /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); @@ -677,45 +713,76 @@ struct SIModeRegisterDefaults { /// If this is set, neither input or output denormals are flushed for most f32 /// instructions. - /// - /// TODO: Split into separate input and output fields if necessary like the - /// control bits really provide? - bool FP32Denormals : 1; + bool FP32InputDenormals : 1; + bool FP32OutputDenormals : 1; /// If this is set, neither input or output denormals are flushed for both f64 /// and f16/v2f16 instructions. - bool FP64FP16Denormals : 1; + bool FP64FP16InputDenormals : 1; + bool FP64FP16OutputDenormals : 1; SIModeRegisterDefaults() : IEEE(true), DX10Clamp(true), - FP32Denormals(true), - FP64FP16Denormals(true) {} + FP32InputDenormals(true), + FP32OutputDenormals(true), + FP64FP16InputDenormals(true), + FP64FP16OutputDenormals(true) {} - // FIXME: Should not depend on the subtarget - SIModeRegisterDefaults(const Function &F, const GCNSubtarget &ST); + SIModeRegisterDefaults(const Function &F); static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) { const bool IsCompute = AMDGPU::isCompute(CC); SIModeRegisterDefaults Mode; - Mode.DX10Clamp = true; Mode.IEEE = IsCompute; - Mode.FP32Denormals = false; // FIXME: Should be on by default. 
- Mode.FP64FP16Denormals = true; return Mode; } bool operator ==(const SIModeRegisterDefaults Other) const { return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp && - FP32Denormals == Other.FP32Denormals && - FP64FP16Denormals == Other.FP64FP16Denormals; + FP32InputDenormals == Other.FP32InputDenormals && + FP32OutputDenormals == Other.FP32OutputDenormals && + FP64FP16InputDenormals == Other.FP64FP16InputDenormals && + FP64FP16OutputDenormals == Other.FP64FP16OutputDenormals; + } + + bool allFP32Denormals() const { + return FP32InputDenormals && FP32OutputDenormals; + } + + bool allFP64FP16Denormals() const { + return FP64FP16InputDenormals && FP64FP16OutputDenormals; + } + + /// Get the encoding value for the FP_DENORM bits of the mode register for the + /// FP32 denormal mode. + uint32_t fpDenormModeSPValue() const { + if (FP32InputDenormals && FP32OutputDenormals) + return FP_DENORM_FLUSH_NONE; + if (FP32InputDenormals) + return FP_DENORM_FLUSH_OUT; + if (FP32OutputDenormals) + return FP_DENORM_FLUSH_IN; + return FP_DENORM_FLUSH_IN_FLUSH_OUT; + } + + /// Get the encoding value for the FP_DENORM bits of the mode register for the + /// FP64/FP16 denormal mode. + uint32_t fpDenormModeDPValue() const { + if (FP64FP16InputDenormals && FP64FP16OutputDenormals) + return FP_DENORM_FLUSH_NONE; + if (FP64FP16InputDenormals) + return FP_DENORM_FLUSH_OUT; + if (FP64FP16OutputDenormals) + return FP_DENORM_FLUSH_IN; + return FP_DENORM_FLUSH_IN_FLUSH_OUT; } /// Returns true if a flag is compatible if it's enabled in the callee, but /// disabled in the caller. static bool oneWayCompatible(bool CallerMode, bool CalleeMode) { - return CallerMode == CalleeMode || (CallerMode && !CalleeMode); + return CallerMode == CalleeMode || (!CallerMode && CalleeMode); } // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should @@ -727,8 +794,10 @@ struct SIModeRegisterDefaults { return false; // Allow inlining denormals enabled into denormals flushed functions. 
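
Editor's note: fpDenormModeSPValue()/fpDenormModeDPValue() above collapse the independent input/output flags into the two-bit FP_DENORM field of the MODE register. A compact sketch of that mapping, assuming the usual SIDefines.h encodings (0 = flush both, 1 = flush outputs only, 2 = flush inputs only, 3 = IEEE/flush none); the enum values are restated here only for illustration:

#include <cstdint>

enum : uint32_t {
  FP_DENORM_FLUSH_IN_FLUSH_OUT = 0, // flush input and output denormals
  FP_DENORM_FLUSH_OUT          = 1, // keep input denormals, flush outputs
  FP_DENORM_FLUSH_IN           = 2, // flush input denormals, keep outputs
  FP_DENORM_FLUSH_NONE         = 3  // IEEE: keep both
};

// InputDenormals/OutputDenormals == true means "do not flush".
uint32_t fpDenormFieldSketch(bool InputDenormals, bool OutputDenormals) {
  if (InputDenormals && OutputDenormals)
    return FP_DENORM_FLUSH_NONE;
  if (InputDenormals)
    return FP_DENORM_FLUSH_OUT;
  if (OutputDenormals)
    return FP_DENORM_FLUSH_IN;
  return FP_DENORM_FLUSH_IN_FLUSH_OUT;
}
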
- return oneWayCompatible(FP64FP16Denormals, CalleeMode.FP64FP16Denormals) && - oneWayCompatible(FP32Denormals, CalleeMode.FP32Denormals); + return oneWayCompatible(FP64FP16InputDenormals, CalleeMode.FP64FP16InputDenormals) && + oneWayCompatible(FP64FP16OutputDenormals, CalleeMode.FP64FP16OutputDenormals) && + oneWayCompatible(FP32InputDenormals, CalleeMode.FP32InputDenormals) && + oneWayCompatible(FP32OutputDenormals, CalleeMode.FP32OutputDenormals); } }; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index 207e4232e8298..ef010a7ac1576 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -397,6 +397,39 @@ static const char *getRegisterName(unsigned RegNum) { {0x2c6a, "SPI_SHADER_USER_DATA_VS_30"}, {0x2c6b, "SPI_SHADER_USER_DATA_VS_31"}, + {0x2c8c, "SPI_SHADER_USER_DATA_GS_0"}, + {0x2c8d, "SPI_SHADER_USER_DATA_GS_1"}, + {0x2c8e, "SPI_SHADER_USER_DATA_GS_2"}, + {0x2c8f, "SPI_SHADER_USER_DATA_GS_3"}, + {0x2c90, "SPI_SHADER_USER_DATA_GS_4"}, + {0x2c91, "SPI_SHADER_USER_DATA_GS_5"}, + {0x2c92, "SPI_SHADER_USER_DATA_GS_6"}, + {0x2c93, "SPI_SHADER_USER_DATA_GS_7"}, + {0x2c94, "SPI_SHADER_USER_DATA_GS_8"}, + {0x2c95, "SPI_SHADER_USER_DATA_GS_9"}, + {0x2c96, "SPI_SHADER_USER_DATA_GS_10"}, + {0x2c97, "SPI_SHADER_USER_DATA_GS_11"}, + {0x2c98, "SPI_SHADER_USER_DATA_GS_12"}, + {0x2c99, "SPI_SHADER_USER_DATA_GS_13"}, + {0x2c9a, "SPI_SHADER_USER_DATA_GS_14"}, + {0x2c9b, "SPI_SHADER_USER_DATA_GS_15"}, + {0x2c9c, "SPI_SHADER_USER_DATA_GS_16"}, + {0x2c9d, "SPI_SHADER_USER_DATA_GS_17"}, + {0x2c9e, "SPI_SHADER_USER_DATA_GS_18"}, + {0x2c9f, "SPI_SHADER_USER_DATA_GS_19"}, + {0x2ca0, "SPI_SHADER_USER_DATA_GS_20"}, + {0x2ca1, "SPI_SHADER_USER_DATA_GS_21"}, + {0x2ca2, "SPI_SHADER_USER_DATA_GS_22"}, + {0x2ca3, "SPI_SHADER_USER_DATA_GS_23"}, + {0x2ca4, "SPI_SHADER_USER_DATA_GS_24"}, + {0x2ca5, "SPI_SHADER_USER_DATA_GS_25"}, + {0x2ca6, "SPI_SHADER_USER_DATA_GS_26"}, + {0x2ca7, "SPI_SHADER_USER_DATA_GS_27"}, + {0x2ca8, "SPI_SHADER_USER_DATA_GS_28"}, + {0x2ca9, "SPI_SHADER_USER_DATA_GS_29"}, + {0x2caa, "SPI_SHADER_USER_DATA_GS_30"}, + {0x2cab, "SPI_SHADER_USER_DATA_GS_31"}, + {0x2ccc, "SPI_SHADER_USER_DATA_ES_0"}, {0x2ccd, "SPI_SHADER_USER_DATA_ES_1"}, {0x2cce, "SPI_SHADER_USER_DATA_ES_2"}, @@ -491,38 +524,55 @@ static const char *getRegisterName(unsigned RegNum) { {0xa310, "PA_SC_SHADER_CONTROL"}, {0xa313, "PA_SC_CONSERVATIVE_RASTERIZATION_CNTL"}, - {0x2d0c, "SPI_SHADER_USER_DATA_LS_0"}, - {0x2d0d, "SPI_SHADER_USER_DATA_LS_1"}, - {0x2d0e, "SPI_SHADER_USER_DATA_LS_2"}, - {0x2d0f, "SPI_SHADER_USER_DATA_LS_3"}, - {0x2d10, "SPI_SHADER_USER_DATA_LS_4"}, - {0x2d11, "SPI_SHADER_USER_DATA_LS_5"}, - {0x2d12, "SPI_SHADER_USER_DATA_LS_6"}, - {0x2d13, "SPI_SHADER_USER_DATA_LS_7"}, - {0x2d14, "SPI_SHADER_USER_DATA_LS_8"}, - {0x2d15, "SPI_SHADER_USER_DATA_LS_9"}, - {0x2d16, "SPI_SHADER_USER_DATA_LS_10"}, - {0x2d17, "SPI_SHADER_USER_DATA_LS_11"}, - {0x2d18, "SPI_SHADER_USER_DATA_LS_12"}, - {0x2d19, "SPI_SHADER_USER_DATA_LS_13"}, - {0x2d1a, "SPI_SHADER_USER_DATA_LS_14"}, - {0x2d1b, "SPI_SHADER_USER_DATA_LS_15"}, - {0x2d1c, "SPI_SHADER_USER_DATA_LS_16"}, - {0x2d1d, "SPI_SHADER_USER_DATA_LS_17"}, - {0x2d1e, "SPI_SHADER_USER_DATA_LS_18"}, - {0x2d1f, "SPI_SHADER_USER_DATA_LS_19"}, - {0x2d20, "SPI_SHADER_USER_DATA_LS_20"}, - {0x2d21, "SPI_SHADER_USER_DATA_LS_21"}, - {0x2d22, "SPI_SHADER_USER_DATA_LS_22"}, - {0x2d23, "SPI_SHADER_USER_DATA_LS_23"}, - {0x2d24, "SPI_SHADER_USER_DATA_LS_24"}, - {0x2d25, 
"SPI_SHADER_USER_DATA_LS_25"}, - {0x2d26, "SPI_SHADER_USER_DATA_LS_26"}, - {0x2d27, "SPI_SHADER_USER_DATA_LS_27"}, - {0x2d28, "SPI_SHADER_USER_DATA_LS_28"}, - {0x2d29, "SPI_SHADER_USER_DATA_LS_29"}, - {0x2d2a, "SPI_SHADER_USER_DATA_LS_30"}, - {0x2d2b, "SPI_SHADER_USER_DATA_LS_31"}, + {0x2d0c, "SPI_SHADER_USER_DATA_HS_0"}, + {0x2d0d, "SPI_SHADER_USER_DATA_HS_1"}, + {0x2d0e, "SPI_SHADER_USER_DATA_HS_2"}, + {0x2d0f, "SPI_SHADER_USER_DATA_HS_3"}, + {0x2d10, "SPI_SHADER_USER_DATA_HS_4"}, + {0x2d11, "SPI_SHADER_USER_DATA_HS_5"}, + {0x2d12, "SPI_SHADER_USER_DATA_HS_6"}, + {0x2d13, "SPI_SHADER_USER_DATA_HS_7"}, + {0x2d14, "SPI_SHADER_USER_DATA_HS_8"}, + {0x2d15, "SPI_SHADER_USER_DATA_HS_9"}, + {0x2d16, "SPI_SHADER_USER_DATA_HS_10"}, + {0x2d17, "SPI_SHADER_USER_DATA_HS_11"}, + {0x2d18, "SPI_SHADER_USER_DATA_HS_12"}, + {0x2d19, "SPI_SHADER_USER_DATA_HS_13"}, + {0x2d1a, "SPI_SHADER_USER_DATA_HS_14"}, + {0x2d1b, "SPI_SHADER_USER_DATA_HS_15"}, + {0x2d1c, "SPI_SHADER_USER_DATA_HS_16"}, + {0x2d1d, "SPI_SHADER_USER_DATA_HS_17"}, + {0x2d1e, "SPI_SHADER_USER_DATA_HS_18"}, + {0x2d1f, "SPI_SHADER_USER_DATA_HS_19"}, + {0x2d20, "SPI_SHADER_USER_DATA_HS_20"}, + {0x2d21, "SPI_SHADER_USER_DATA_HS_21"}, + {0x2d22, "SPI_SHADER_USER_DATA_HS_22"}, + {0x2d23, "SPI_SHADER_USER_DATA_HS_23"}, + {0x2d24, "SPI_SHADER_USER_DATA_HS_24"}, + {0x2d25, "SPI_SHADER_USER_DATA_HS_25"}, + {0x2d26, "SPI_SHADER_USER_DATA_HS_26"}, + {0x2d27, "SPI_SHADER_USER_DATA_HS_27"}, + {0x2d28, "SPI_SHADER_USER_DATA_HS_28"}, + {0x2d29, "SPI_SHADER_USER_DATA_HS_29"}, + {0x2d2a, "SPI_SHADER_USER_DATA_HS_30"}, + {0x2d2b, "SPI_SHADER_USER_DATA_HS_31"}, + + {0x2d4c, "SPI_SHADER_USER_DATA_LS_0"}, + {0x2d4d, "SPI_SHADER_USER_DATA_LS_1"}, + {0x2d4e, "SPI_SHADER_USER_DATA_LS_2"}, + {0x2d4f, "SPI_SHADER_USER_DATA_LS_3"}, + {0x2d50, "SPI_SHADER_USER_DATA_LS_4"}, + {0x2d51, "SPI_SHADER_USER_DATA_LS_5"}, + {0x2d52, "SPI_SHADER_USER_DATA_LS_6"}, + {0x2d53, "SPI_SHADER_USER_DATA_LS_7"}, + {0x2d54, "SPI_SHADER_USER_DATA_LS_8"}, + {0x2d55, "SPI_SHADER_USER_DATA_LS_9"}, + {0x2d56, "SPI_SHADER_USER_DATA_LS_10"}, + {0x2d57, "SPI_SHADER_USER_DATA_LS_11"}, + {0x2d58, "SPI_SHADER_USER_DATA_LS_12"}, + {0x2d59, "SPI_SHADER_USER_DATA_LS_13"}, + {0x2d5a, "SPI_SHADER_USER_DATA_LS_14"}, + {0x2d5b, "SPI_SHADER_USER_DATA_LS_15"}, {0xa2aa, "IA_MULTI_VGT_PARAM"}, {0xa2a5, "VGT_GS_MAX_PRIMS_PER_SUBGROUP"}, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h index 0f17c157b2062..544ab669d9ae2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -14,16 +14,12 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H -#include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/MsgPackDocument.h" -#include <map> namespace llvm { -class AMDGPUTargetStreamer; -class formatted_raw_ostream; -class MCStreamer; class Module; +class StringRef; class AMDGPUPALMetadata { unsigned BlobType = 0; diff --git a/llvm/lib/Target/AMDGPU/VIInstructions.td b/llvm/lib/Target/AMDGPU/VIInstructions.td deleted file mode 100644 index ec7d8875a746e..0000000000000 --- a/llvm/lib/Target/AMDGPU/VIInstructions.td +++ /dev/null @@ -1,13 +0,0 @@ -//===-- VIInstructions.td - VI Instruction Defintions ---------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// Instruction definitions for VI and newer. -//===----------------------------------------------------------------------===// - -FIXME: Deleting this file broke buildbots that don't do full rebuilds. This -file is no longer used by the backend, so it can be deleted once all -the buildbots update there dependencies. diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index c7aed0985540a..17f334f62a30b 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1,4 +1,4 @@ -//===-- VOP1Instructions.td - Vector Instruction Defintions ---------------===// +//===-- VOP1Instructions.td - Vector Instruction Definitions --------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -48,9 +48,13 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On let mayStore = 0; let hasSideEffects = 0; + let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret); + + let mayRaiseFPException = ReadsModeReg; + let VOP1 = 1; let VALU = 1; - let Uses = [EXEC]; + let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]); let AsmVariantName = AMDGPUAsmVariants.Default; } @@ -89,9 +93,7 @@ class VOP1_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, - [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, - i32:$src0_modifiers, - i1:$clamp, i32:$omod))))], + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods P.Src0VT:$src0, i32:$src0_modifiers))))], !if(P.HasOMod, [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0, i1:$clamp, i32:$omod))))], @@ -102,8 +104,13 @@ class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { multiclass VOP1Inst <string opName, VOPProfile P, SDPatternOperator node = null_frag> { - def _e32 : VOP1_Pseudo <opName, P>; - def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>; + // We only want to set this on the basic, non-SDWA or DPP forms. 
+ defvar should_mov_imm = !eq(opName, "v_mov_b32"); + + let isMoveImm = should_mov_imm in { + def _e32 : VOP1_Pseudo <opName, P>; + def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>; + } foreach _ = BoolToList<P.HasExtSDWA>.ret in def _sdwa : VOP1_SDWA_Pseudo <opName, P>; @@ -146,7 +153,7 @@ let VOPAsmPrefer32Bit = 1 in { defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>; } -let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>; } // End isMoveImm = 1 @@ -183,31 +190,51 @@ def V_READFIRSTLANE_B32 : let SchedRW = [WriteDoubleCvt] in { defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>; + +let mayRaiseFPException = 0 in { defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>; +} + defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>; defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>; defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>; + +let mayRaiseFPException = 0 in { defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>; +} + } // End SchedRW = [WriteDoubleCvt] -let SchedRW = [WriteQuarterRate32] in { +let SchedRW = [WriteFloatCvt] in { + +// XXX: Does this really not raise exceptions? The manual claims the +// 16-bit ones can. +let mayRaiseFPException = 0 in { defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>; defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>; +} + defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>; defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>; let FPDPRounding = 1 in { defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>; } // End FPDPRounding = 1 + defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>; + +let ReadsModeReg = 0, mayRaiseFPException = 0 in { defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>; defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32>; -} // End SchedRW = [WriteQuarterRate32] +} // End ReadsModeReg = 0, mayRaiseFPException = 0 +} // End SchedRW = [WriteFloatCvt] +let ReadsModeReg = 0, mayRaiseFPException = 0 in { defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP1_F32_I32, AMDGPUcvt_f32_ubyte0>; defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP1_F32_I32, AMDGPUcvt_f32_ubyte1>; defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP1_F32_I32, AMDGPUcvt_f32_ubyte2>; defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f32_ubyte3>; +} // ReadsModeReg = 0, mayRaiseFPException = 0 defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>; defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>; @@ -215,33 +242,30 @@ defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>; defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>; defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>; -let SchedRW = [WriteQuarterRate32] in { +let SchedRW = [WriteTrans32] in { defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>; defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>; defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>; defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>; defm V_RSQ_F32 : VOP1Inst 
<"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>; -defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>; -} // End SchedRW = [WriteQuarterRate32] +defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, any_amdgcn_sqrt>; +} // End SchedRW = [WriteTrans32] -let SchedRW = [WriteDouble] in { +let SchedRW = [WriteTrans64] in { defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>; defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>; -} // End SchedRW = [WriteDouble]; - -let SchedRW = [WriteDouble] in { -defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>; -} // End SchedRW = [WriteDouble] +defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, any_amdgcn_sqrt>; +} // End SchedRW = [WriteTrans64] -let SchedRW = [WriteQuarterRate32] in { +let SchedRW = [WriteTrans32] in { defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>; defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>; -} // End SchedRW = [WriteQuarterRate32] +} // End SchedRW = [WriteTrans32] defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, bitreverse>; defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>; -defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>; +defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>; defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>; let SchedRW = [WriteDoubleAdd] in { @@ -317,7 +341,7 @@ defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_MOVRELSD>; defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>; let SubtargetPredicate = isGFX6GFX7 in { - let SchedRW = [WriteQuarterRate32] in { + let SchedRW = [WriteTrans32] in { defm V_LOG_CLAMP_F32 : VOP1Inst<"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>; defm V_RCP_CLAMP_F32 : @@ -327,8 +351,8 @@ let SubtargetPredicate = isGFX6GFX7 in { defm V_RSQ_CLAMP_F32 : VOP1Inst<"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>; defm V_RSQ_LEGACY_F32 : - VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>; - } // End SchedRW = [WriteQuarterRate32] + VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, int_amdgcn_rsq_legacy>; + } // End SchedRW = [WriteTrans32] let SchedRW = [WriteDouble] in { defm V_RCP_CLAMP_F64 : @@ -339,10 +363,10 @@ let SubtargetPredicate = isGFX6GFX7 in { } // End SubtargetPredicate = isGFX6GFX7 let SubtargetPredicate = isGFX7GFX8GFX9 in { - let SchedRW = [WriteQuarterRate32] in { + let SchedRW = [WriteTrans32] in { defm V_LOG_LEGACY_F32 : VOP1Inst<"v_log_legacy_f32", VOP_F32_F32>; defm V_EXP_LEGACY_F32 : VOP1Inst<"v_exp_legacy_f32", VOP_F32_F32>; - } // End SchedRW = [WriteQuarterRate32] + } // End SchedRW = [WriteTrans32] } // End SubtargetPredicate = isGFX7GFX8GFX9 let SubtargetPredicate = isGFX7Plus in { @@ -362,15 +386,15 @@ defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>; } // End FPDPRounding = 1 defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>; defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>; -let SchedRW = [WriteQuarterRate32] in { +let SchedRW = [WriteTrans32] in { defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; -defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>; +defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>; defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>; defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>; defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>; defm V_SIN_F16 : VOP1Inst 
<"v_sin_f16", VOP_F16_F16, AMDGPUsin>; defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; -} // End SchedRW = [WriteQuarterRate32] +} // End SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>; defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>; @@ -414,8 +438,11 @@ let SubtargetPredicate = isGFX9Plus in { } defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>; - defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>; - defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>; + + let mayRaiseFPException = 0 in { + defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>; + defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>; + } // End mayRaiseFPException = 0 } // End SubtargetPredicate = isGFX9Plus let SubtargetPredicate = isGFX9Only in { @@ -458,7 +485,7 @@ class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP1 class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl> : VOP1_DPP<op, ps, p, 1>, SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10> { - let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst); + let AssemblerPredicate = HasDPP16; let SubtargetPredicate = HasDPP16; } @@ -475,7 +502,7 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0); let Inst{31-25} = 0x3f; - let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst); + let AssemblerPredicate = HasDPP8; let SubtargetPredicate = HasDPP8; } @@ -812,42 +839,23 @@ def V_MOV_B32_indirect : VPseudoInstSI<(outs), let SubtargetPredicate = isGFX8GFX9; } -// This is a pseudo variant of the v_movreld_b32 instruction in which the -// vector operand appears only twice, once as def and once as use. Using this -// pseudo avoids problems with the Two Address instructions pass. 
-class V_MOVRELD_B32_pseudo<RegisterClass rc> : VPseudoInstSI < - (outs rc:$vdst), - (ins rc:$vsrc, VSrc_b32:$val, i32imm:$offset)> { - let VOP1 = 1; - - let Constraints = "$vsrc = $vdst"; - let Uses = [M0, EXEC]; - - let SubtargetPredicate = HasMovrel; -} - -def V_MOVRELD_B32_V1 : V_MOVRELD_B32_pseudo<VGPR_32>; -def V_MOVRELD_B32_V2 : V_MOVRELD_B32_pseudo<VReg_64>; -def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>; -def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>; -def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>; - let OtherPredicates = [isGFX8Plus] in { def : GCNPat < - (i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, - timm:$bound_ctrl)), - (V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl), - (as_i32imm $row_mask), (as_i32imm $bank_mask), - (as_i1imm $bound_ctrl)) + (i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask, + timm:$bank_mask, timm:$bound_ctrl)), + (V_MOV_B32_dpp VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp_ctrl), + (as_i32timm $row_mask), (as_i32timm $bank_mask), + (as_i1timm $bound_ctrl)) >; def : GCNPat < - (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl, timm:$row_mask, - timm:$bank_mask, timm:$bound_ctrl)), - (V_MOV_B32_dpp $old, $src, (as_i32imm $dpp_ctrl), - (as_i32imm $row_mask), (as_i32imm $bank_mask), - (as_i1imm $bound_ctrl)) + (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl, + timm:$row_mask, timm:$bank_mask, + timm:$bound_ctrl)), + (V_MOV_B32_dpp VGPR_32:$old, VGPR_32:$src, (as_i32timm $dpp_ctrl), + (as_i32timm $row_mask), (as_i32timm $bank_mask), + (as_i1timm $bound_ctrl)) >; } // End OtherPredicates = [isGFX8Plus] @@ -907,6 +915,7 @@ defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; let OtherPredicates = [isGFX10Plus] in { def : GCNPat < (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)), - (V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8), (i32 DPP8Mode.FI_0)) + (V_MOV_B32_dpp8_gfx10 VGPR_32:$src, VGPR_32:$src, + (as_i32timm $dpp8), (i32 DPP8Mode.FI_0)) >; } // End OtherPredicates = [isGFX10Plus] diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index aaadc3dbc7215..aa37dbf1418f9 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1,4 +1,4 @@ -//===-- VOP2Instructions.td - Vector Instruction Defintions ---------------===// +//===-- VOP2Instructions.td - Vector Instruction Definitions --------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -69,9 +69,13 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf let mayStore = 0; let hasSideEffects = 0; + let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret); + + let mayRaiseFPException = ReadsModeReg; + let VOP2 = 1; let VALU = 1; - let Uses = [EXEC]; + let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]); let AsmVariantName = AMDGPUAsmVariants.Default; } @@ -459,17 +463,18 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { //===----------------------------------------------------------------------===// defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>; +let SubtargetPredicate = HasMadMacF32Insts in def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>; let isCommutable = 1 in { -defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>; +defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, any_fadd>; defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, fsub>; defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">; defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>; -defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, fmul>; -defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24>; +defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, any_fmul>; +defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32_ARITH, AMDGPUmul_i24>; defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_i24>; -defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24>; +defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32_ARITH, AMDGPUmul_u24>; defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_u24>; defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>; defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>; @@ -484,12 +489,16 @@ defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>; defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>; defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>; +let mayRaiseFPException = 0 in { +let SubtargetPredicate = HasMadMacF32Insts in { let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1 in { defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>; } def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>; +} // End SubtargetPredicate = HasMadMacF32Insts +} // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. 
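
Editor's note: V_MUL_I32_I24 / V_MUL_U32_U24 above are re-profiled (VOP_I32_I32_I32_ARITH) but keep their 24-bit multiply semantics, which are easy to misread from the names alone. A reference model as an aside, not LLVM code; the helper names are made up:

#include <cstdint>

// v_mul_u32_u24: multiply the low 24 bits of each operand, zero-extended.
uint32_t mul_u24(uint32_t A, uint32_t B) {
  return (A & 0xffffff) * (B & 0xffffff); // result is the low 32 bits of the product
}

// v_mul_i32_i24: multiply the low 24 bits of each operand, sign-extended.
uint32_t mul_i24(uint32_t A, uint32_t B) {
  auto Sext24 = [](uint32_t V) {
    return int64_t(V & 0xffffff) - ((V & 0x800000) ? int64_t(1) << 24 : 0);
  };
  return uint32_t(Sext24(A) * Sext24(B)); // instruction writes the low 32 bits
}
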
@@ -529,8 +538,12 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32 defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>; defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>; defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst" + +let ReadsModeReg = 0, mayRaiseFPException = 0 in { defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_i16_f32>; defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_u16_f32>; +} + defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_V2F16_F32_F32>, AMDGPUpkrtz_f16_f32>; defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_u16_u32>; defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_i16_i32>; @@ -541,14 +554,18 @@ defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmi defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>; } // End SubtargetPredicate = isGFX6GFX7 -let SubtargetPredicate = isGFX6GFX7GFX10 in { let isCommutable = 1 in { +let SubtargetPredicate = isGFX6GFX7GFX10 in { +let OtherPredicates = [HasMadMacF32Insts] in defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>; -defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32, srl>; -defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32, sra>; -defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32, shl>; -} // End isCommutable = 1 } // End SubtargetPredicate = isGFX6GFX7GFX10 +let SubtargetPredicate = isGFX6GFX7 in { +defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>; +defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>; +defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>; +} // End SubtargetPredicate = isGFX6GFX7 +} // End isCommutable = 1 + class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> : GCNPat< @@ -617,15 +634,19 @@ defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, ashr_rev>; let isCommutable = 1 in { let FPDPRounding = 1 in { -defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>; +defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, any_fadd>; defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>; defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">; -defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; +defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, any_fmul>; + +let mayRaiseFPException = 0 in { def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; +} + } // End FPDPRounding = 1 -defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16, add>; -defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16, sub>; -defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">; +defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>; +defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>; +defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">; defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>; defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>; defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, 
fminnum_like>; @@ -770,16 +791,16 @@ let Predicates = [Has16BitInsts] in { // an inline immediate than -c. // TODO: Also do for 64-bit. def : GCNPat< - (add i16:$src0, (i16 NegSubInlineConst16:$src1)), - (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineConst16:$src1) + (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)), + (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1) >; let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { def : GCNPat< - (i32 (zext (add i16:$src0, (i16 NegSubInlineConst16:$src1)))), - (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineConst16:$src1) + (i32 (zext (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)))), + (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1) >; defm : Arithmetic_i16_0Hi_Pats<add, V_ADD_U16_e64>; @@ -831,7 +852,7 @@ class VOP2_DPP<bits<6> op, VOP2_DPP_Pseudo ps, class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, string opName = ps.OpName, VOPProfile p = ps.Pfl> : VOP2_DPP<op, ps, opName, p, 1> { - let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst); + let AssemblerPredicate = HasDPP16; let SubtargetPredicate = HasDPP16; } @@ -857,7 +878,7 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps, let Inst{30-25} = op; let Inst{31} = 0x0; - let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst); + let AssemblerPredicate = HasDPP8; let SubtargetPredicate = HasDPP8; } @@ -1250,9 +1271,9 @@ defm V_SUBBREV_U32 : VOP2be_Real_gfx6_gfx7<0x02a>; defm V_READLANE_B32 : VOP2Only_Real_gfx6_gfx7<0x001>; -let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in { +let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { defm V_WRITELANE_B32 : VOP2Only_Real_gfx6_gfx7<0x002>; -} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) +} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) let SubtargetPredicate = isGFX6GFX7 in { defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx6_gfx7>; @@ -1261,6 +1282,7 @@ let SubtargetPredicate = isGFX6GFX7 in { defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x003>; defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x004>; defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x005>; +let OtherPredicates = [HasMadMacF32Insts] in defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>; defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>; defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x008>; @@ -1593,3 +1615,9 @@ let SubtargetPredicate = HasDot3Insts in { let SubtargetPredicate = HasPkFmacF16Inst in { defm V_PK_FMAC_F16 : VOP2_Real_e32_vi<0x3c>; } // End SubtargetPredicate = HasPkFmacF16Inst + +let SubtargetPredicate = HasDot3Insts in { + // NB: Opcode conflicts with V_DOT2C_F32_F16 + let DecoderNamespace = "GFX10_B" in + defm V_DOT8C_I32_I4 : VOP2_Real_DOT_ACC_gfx10<0x02>; +} diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 67c8b926302d5..169949f2171ae 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1,4 +1,4 @@ -//===-- VOP3Instructions.td - Vector Instruction Defintions ---------------===// +//===-- VOP3Instructions.td - Vector Instruction Definitions --------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -32,20 +32,26 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { ret1)); } -class getVOP3PModPat<VOPProfile P, SDPatternOperator node> { +class getVOP3PModPat<VOPProfile P, SDPatternOperator node, bit HasExplicitClamp> { + dag src0_dag = (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)); + dag src1_dag = (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)); + dag src2_dag = (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers)); + dag clamp_dag = (i1 timm:$clamp); + list<dag> ret3 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), - (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))), - (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))]; + !if(HasExplicitClamp, + (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag, clamp_dag), + (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag)))]; list<dag> ret2 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), - (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))), - (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))]; + !if(HasExplicitClamp, + (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, clamp_dag), + (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag)))]; list<dag> ret1 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))]; + !if(HasExplicitClamp, + (DivergentFragOrOp<node, P>.ret src0_dag, clamp_dag), + (DivergentFragOrOp<node, P>.ret src0_dag)))]; list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, @@ -54,18 +60,16 @@ class getVOP3PModPat<VOPProfile P, SDPatternOperator node> { class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> { list<dag> ret3 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), - (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))), + (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)), (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))]; list<dag> ret2 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), - (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))), - (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))]; + (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)), + (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))]; list<dag> ret1 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))]; + (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))))]; list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, @@ -74,18 +78,18 @@ class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> { class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> { list<dag> ret3 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), + (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods P.Src0VT:$src0, 
i32:$src0_modifiers), (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))), (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))]; list<dag> ret2 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), + (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers)), (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))), (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))]; list<dag> ret1 = [(set P.DstVT:$vdst, - (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))]; + (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))))]; list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, @@ -224,12 +228,13 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> : VOP3_Pseudo<OpName, P, pattern> { let AsmMatchConverter = "cvtVOP3Interp"; + let mayRaiseFPException = 0; } def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> { let Ins64 = (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, Attr:$attr, AttrChan:$attrchan, - clampmod:$clamp, omod:$omod); + clampmod0:$clamp, omod0:$omod); let Asm64 = "$vdst, $src0_modifiers, $attr$attrchan$clamp$omod"; } @@ -237,7 +242,7 @@ def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> { def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> { let Ins64 = (ins InterpSlot:$src0, Attr:$attr, AttrChan:$attrchan, - clampmod:$clamp, omod:$omod); + clampmod0:$clamp, omod0:$omod); let Asm64 = "$vdst, $src0, $attr$attrchan$clamp$omod"; @@ -286,17 +291,25 @@ class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> { let isCommutable = 1 in { +let mayRaiseFPException = 0 in { +let SubtargetPredicate = HasMadMacF32Insts in { def V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>; +} // End SubtargetPredicate = HasMadMacInsts + +let SubtargetPredicate = HasNoMadMacF32Insts in +def V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; +} + def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; -def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>; +def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>; def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; let SchedRW = [WriteDoubleAdd] in { let FPDPRounding = 1 in { -def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>; -def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>; +def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, any_fma>; +def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fadd, 1>; def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>; } // End FPDPRounding = 1 def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>; @@ -310,7 +323,7 @@ def V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>; def V_MUL_HI_I32 : VOP3Inst 
<"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>; } // End SchedRW = [WriteQuarterRate32] -let Uses = [VCC, EXEC] in { +let Uses = [MODE, VCC, EXEC] in { // v_div_fmas_f32: // result = src0 * src1 + src2 // if (vcc) @@ -332,15 +345,20 @@ def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, []> } // End isCommutable = 1 +let mayRaiseFPException = 0 in { def V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>; def V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>; def V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>; def V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>; +} // End mayRaiseFPException + def V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>; def V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>; def V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>; -def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbit>; +def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, fshr>; def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>; + +let mayRaiseFPException = 0 in { // XXX - Seems suspect but manual doesn't say it does def V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>; def V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>; def V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>; @@ -350,6 +368,8 @@ def V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDG def V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>; def V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>; def V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>; +} // End mayRaiseFPException = 0 + def V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; def V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; def V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; @@ -362,6 +382,8 @@ def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_ def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>; } // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1 + +let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it does. 
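
Editor's note: V_ALIGNBIT_B32 above is now selected from the generic fshr (funnel-shift-right) node instead of int_amdgcn_alignbit. For readers unfamiliar with fshr, a small reference model of the 32-bit operation; this is an illustration, not the backend's lowering code:

#include <cstdint>

// fshr(Hi, Lo, Shift): concatenate {Hi, Lo} into 64 bits, shift right by
// Shift mod 32, and return the low 32 bits -- the same value v_alignbit_b32
// produces with src0 = Hi, src1 = Lo, src2 = Shift.
uint32_t fshr32(uint32_t Hi, uint32_t Lo, uint32_t Shift) {
  Shift &= 31;
  uint64_t Concat = (uint64_t(Hi) << 32) | Lo;
  return uint32_t(Concat >> Shift);
}
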
def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> { let SchedRW = [WriteFloatFMA, WriteSALU]; let AsmMatchConverter = ""; @@ -373,6 +395,7 @@ def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, let AsmMatchConverter = ""; let FPDPRounding = 1; } +} // End mayRaiseFPException = 0 def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; @@ -380,17 +403,16 @@ let Constraints = "@earlyclobber $vdst" in { def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>; } // End Constraints = "@earlyclobber $vdst" -def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> { +def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, int_amdgcn_trig_preop> { let SchedRW = [WriteDouble]; } let SchedRW = [Write64Bit] in { -let SubtargetPredicate = isGFX6GFX7GFX10 in { +let SubtargetPredicate = isGFX6GFX7 in { def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, shl>; def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, srl>; def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, sra>; -def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; -} // End SubtargetPredicate = isGFX6GFX7GFX10 +} // End SubtargetPredicate = isGFX6GFX7 let SubtargetPredicate = isGFX8Plus in { def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshl_rev>; @@ -399,6 +421,23 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, as } // End SubtargetPredicate = isGFX8Plus } // End SchedRW = [Write64Bit] +def : GCNPat< + (i64 (getDivergentFrag<sext>.ret i16:$src)), + (REG_SEQUENCE VReg_64, + (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))), sub0, + (i32 (COPY_TO_REGCLASS + (V_ASHRREV_I32_e32 (S_MOV_B32 (i32 0x1f)), (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))) + ), VGPR_32)), sub1) +>; + +def : GCNPat< + (i32 (getDivergentFrag<sext>.ret i16:$src)), + (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))) +>; + +let SubtargetPredicate = isGFX6GFX7GFX10 in { +def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; +} // End SubtargetPredicate = isGFX6GFX7GFX10 let SchedRW = [Write32Bit] in { let SubtargetPredicate = isGFX8Plus in { @@ -417,7 +456,7 @@ let isCommutable = 1 in { let SchedRW = [WriteQuarterRate32, WriteSALU] in { def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; -} // End SchedRW = [WriteDouble, WriteSALU] +} // End SchedRW = [WriteQuarterRate32, WriteSALU] } // End isCommutable = 1 } // End SubtargetPredicate = isGFX7Plus @@ -434,11 +473,11 @@ def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", let FPDPRounding = 1; } -def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma> { +def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma> { let Predicates = [Has16BitInsts, isGFX8Only]; let FPDPRounding = 1; } -def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, fma> { +def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, any_fma> { let renamedInGFX9 = 1; let Predicates = [Has16BitInsts, isGFX9Plus]; let FPDPRounding = 1; @@ -451,7 +490,7 @@ def V_MAD_U16 : VOP3Inst 
<"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CL def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>; let FPDPRounding = 1 in { def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>; -let Uses = [M0, EXEC] in { +let Uses = [MODE, M0, EXEC] in { // For some reason the intrinsic operands are in a different order // from the instruction operands. def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>, @@ -462,7 +501,7 @@ def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i3 (i32 timm:$attr), (i1 timm:$high), M0))]>; -} // End Uses = [M0, EXEC] +} // End Uses = [M0, MODE, EXEC] } // End FPDPRounding = 1 } // End renamedInGFX9 = 1 @@ -478,32 +517,29 @@ def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_ def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>; } // End SubtargetPredicate = isGFX9Plus -let Uses = [M0, EXEC], FPDPRounding = 1 in { +let Uses = [MODE, M0, EXEC], FPDPRounding = 1 in { def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>, - [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 timm:$attrchan), - (i32 timm:$attr), - (i32 timm:$src0_modifiers), - (i1 timm:$high), - (i1 timm:$clamp), - (i32 timm:$omod)))]>; -def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>, - [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 timm:$attrchan), - (i32 timm:$attr), - (i32 timm:$src0_modifiers), - (f32 VRegSrc_32:$src2), - (i32 timm:$src2_modifiers), - (i1 timm:$high), - (i1 timm:$clamp), - (i32 timm:$omod)))]>; -} // End Uses = [M0, EXEC], FPDPRounding = 1 + [(set f32:$vdst, (int_amdgcn_interp_p1_f16 (VOP3Mods f32:$src0, i32:$src0_modifiers), + (i32 timm:$attrchan), + (i32 timm:$attr), + (i1 timm:$high), M0))]> { + // This predicate should only apply to the selection pattern. The + // instruction still exists and should decode on subtargets with + // other bank counts. + let OtherPredicates = [has32BankLDS]; +} + + +def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>; +} // End Uses = [MODE, M0, EXEC], FPDPRounding = 1 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 -let SubtargetPredicate = isGFX8Plus, Uses = [M0, EXEC] in { +let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC] in { def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>; def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>; def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>; -} // End SubtargetPredicate = isGFX8Plus, Uses = [M0, EXEC] +} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC] let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in { @@ -565,9 +601,20 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag< } return true; - }] -> { + }]> { let PredicateCodeUsesOperands = 1; + + // The divergence predicate is irrelevant in GlobalISel, as we have + // proper register bank checks. We also force all VOP instruction + // operands to VGPR, so we should not need to check the constant bus + // restriction. + // + // FIXME: With unlucky SGPR operands, we could penalize code by + // blocking folding SGPR->VGPR copies later. 
+ // FIXME: There's no register bank verifier + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. + let GISelPredicateCode = [{return true;}]; } let SubtargetPredicate = isGFX9Plus in { @@ -602,14 +649,14 @@ def V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, def V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>; def V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>; -def V_ADD_I32_gfx9 : VOP3Inst <"v_add_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>; -def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>; +def V_ADD_I32_gfx9 : VOP3Inst <"v_add_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32_ARITH>>; +def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32_ARITH>>; class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat < // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions. (ThreeOpFrag<op1, op2> i32:$src0, i32:$src1, i32:$src2), - (inst i32:$src0, i32:$src1, i32:$src2) + (inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) >; def : ThreeOp_i32_Pats<shl, add, V_LSHL_ADD_U32>; @@ -634,6 +681,40 @@ def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3 let HasOMod = 0; } +class PermlanePat<SDPatternOperator permlane, + Instruction inst> : GCNPat< + (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, + timm:$fi, timm:$bc), + (inst (as_i1timm $fi), VGPR_32:$src0, (as_i1timm $bc), + SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in) +>; + +// Permlane intrinsic that has either fetch invalid or bound control +// fields enabled. +class BoundControlOrFetchInvalidPermlane<SDPatternOperator permlane> : + PatFrag<(ops node:$vdst_in, node:$src0, node:$src1, node:$src2, + node:$fi, node:$bc), + (permlane node:$vdst_in, node:$src0, node: + $src1, node:$src2, node:$fi, node:$bc)> { + let PredicateCode = [{ return N->getConstantOperandVal(5) != 0 || + N->getConstantOperandVal(6) != 0; }]; + let GISelPredicateCode = [{ + return MI.getOperand(6).getImm() != 0 || + MI.getOperand(7).getImm() != 0; + }]; +} + +// Drop the input value if it won't be read. 
+class PermlaneDiscardVDstIn<SDPatternOperator permlane, + Instruction inst> : GCNPat< + (permlane srcvalue, i32:$src0, i32:$src1, i32:$src2, + timm:$fi, timm:$bc), + (inst (as_i1timm $fi), VGPR_32:$src0, (as_i1timm $bc), + SCSrc_b32:$src1, 0, SCSrc_b32:$src2, + (IMPLICIT_DEF)) +>; + + let SubtargetPredicate = isGFX10Plus in { def V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32>; @@ -643,16 +724,35 @@ let SubtargetPredicate = isGFX10Plus in { def V_PERMLANEX16_B32 : VOP3Inst <"v_permlanex16_b32", VOP3_PERMLANE_Profile>; } // End $vdst = $vdst_in, DisableEncoding $vdst_in - def : GCNPat< - (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc), - (V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in) - >; - def : GCNPat< - (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc), - (V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in) - >; + def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32>; + def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32>; + + def : PermlaneDiscardVDstIn< + BoundControlOrFetchInvalidPermlane<int_amdgcn_permlane16>, + V_PERMLANE16_B32>; + def : PermlaneDiscardVDstIn< + BoundControlOrFetchInvalidPermlane<int_amdgcn_permlanex16>, + V_PERMLANEX16_B32>; } // End SubtargetPredicate = isGFX10Plus +class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat< + (AMDGPUdiv_fmas (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), + (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), + (vt (VOP3Mods vt:$src2, i32:$src2_modifiers)), + (i1 CondReg)), + (inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2) +>; + +let WaveSizePredicate = isWave64 in { +def : DivFmasPat<f32, V_DIV_FMAS_F32, VCC>; +def : DivFmasPat<f64, V_DIV_FMAS_F64, VCC>; +} + +let WaveSizePredicate = isWave32 in { +def : DivFmasPat<f32, V_DIV_FMAS_F32, VCC_LO>; +def : DivFmasPat<f64, V_DIV_FMAS_F64, VCC_LO>; +} + //===----------------------------------------------------------------------===// // Integer Clamp Patterns //===----------------------------------------------------------------------===// @@ -745,9 +845,9 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in { defm V_READLANE_B32 : VOP3_Real_gfx10<0x360>; -let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in { +let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { defm V_WRITELANE_B32 : VOP3_Real_gfx10<0x361>; -} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) +} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) defm V_XOR3_B32 : VOP3_Real_gfx10<0x178>; defm V_LSHLREV_B64 : VOP3_Real_gfx10<0x2ff>; @@ -925,6 +1025,10 @@ defm V_TRIG_PREOP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x174>; defm V_DIV_SCALE_F32 : VOP3be_Real_gfx6_gfx7_gfx10<0x16d>; defm V_DIV_SCALE_F64 : VOP3be_Real_gfx6_gfx7_gfx10<0x16e>; +// NB: Same opcode as v_mad_legacy_f32 +let DecoderNamespace = "GFX10_B" in +defm V_FMA_LEGACY_F32 : VOP3_Real_gfx10<0x140>; + //===----------------------------------------------------------------------===// // GFX8, GFX9 (VI). 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 933acc2278fd8..fc457ad212d48 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1,4 +1,4 @@ -//===-- VOP3PInstructions.td - Vector Instruction Defintions --------------===// +//===-- VOP3PInstructions.td - Vector Instruction Definitions -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,9 +10,11 @@ // VOP3P Classes //===----------------------------------------------------------------------===// -class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> : +class VOP3PInst<string OpName, VOPProfile P, + SDPatternOperator node = null_frag, + bit HasExplicitClamp = 0> : VOP3P_Pseudo<OpName, P, - !if(P.HasModifiers, getVOP3PModPat<P, node>.ret, getVOP3Pat<P, node>.ret) + !if(P.HasModifiers, getVOP3PModPat<P, node, HasExplicitClamp>.ret, getVOP3Pat<P, node>.ret) >; // Non-packed instructions that use the VOP3P encoding. @@ -29,9 +31,14 @@ class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0, !con( (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0, FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2, - clampmod:$clamp), - !if(UseTiedOutput, (ins VGPR_32:$vdst_in), (ins))), + FP16InputMods:$src2_modifiers, VCSrc_f16:$src2), + // FIXME: clampmod0 misbehaves with the non-default vdst_in + // following it. For now workaround this by requiring clamp + // in tied patterns. This should use undef_tied_input, but it + // seems underdeveloped and doesn't apply the right register + // class constraints. + !if(UseTiedOutput, (ins clampmod:$clamp, VGPR_32:$vdst_in), + (ins clampmod0:$clamp))), (ins op_sel:$op_sel, op_sel_hi:$op_sel_hi)); let Constraints = !if(UseTiedOutput, "$vdst = $vdst_in", ""); @@ -45,9 +52,9 @@ def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_ def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; let FPDPRounding = 1 in { -def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>; -def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>; -def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>; +def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>; +def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>; +def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>; } // End FPDPRounding = 1 def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>; def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>; @@ -75,8 +82,8 @@ def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I1 // The constant will be emitted as a mov, and folded later. 
// TODO: We could directly encode the immediate now def : GCNPat< - (add (v2i16 (VOP3PMods0 v2i16:$src0, i32:$src0_modifiers, i1:$clamp)), NegSubInlineConstV216:$src1), - (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1, $clamp) + (add (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), NegSubInlineConstV216:$src1), + (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1) >; multiclass MadFmaMixPats<SDPatternOperator fma_like, @@ -142,10 +149,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, } let SubtargetPredicate = HasMadMixInsts in { + // These are VOP3a-like opcodes which accept no omod. // Size of src arguments (16/32) is controlled by op_sel. // For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi. -let isCommutable = 1 in { +let isCommutable = 1, mayRaiseFPException = 0 in { def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>; let FPDPRounding = 1 in { @@ -203,7 +211,7 @@ foreach Type = ["I", "U"] in foreach Index = 0-3 in { // Defines patterns that extract each Index'ed 8bit from an unsigned // 32bit scalar value; - def #Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !if (!eq (Type, "U"), 1, 0)>; + def Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !if (!eq (Type, "U"), 1, 0)>; // Defines multiplication patterns where the multiplication is happening on each // Index'ed 8bit of a 32bit scalar value. @@ -211,8 +219,8 @@ foreach Type = ["I", "U"] in def Mul#Type#_Elt#Index : PatFrag< (ops node:$src0, node:$src1), (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), AMDGPUmul_i24_oneuse, AMDGPUmul_u24_oneuse)) - (!cast<Extract>(#Type#Index#"_8bit") node:$src0), - (!cast<Extract>(#Type#Index#"_8bit") node:$src1))>; + (!cast<Extract>(Type#Index#"_8bit") node:$src0), + (!cast<Extract>(Type#Index#"_8bit") node:$src1))>; } // Different variants of dot8 patterns cause a huge increase in the compile time. @@ -231,15 +239,15 @@ foreach Type = ["I", "U"] in foreach Index = 0-7 in { // Defines patterns that extract each Index'ed 4bit from an unsigned // 32bit scalar value; - def #Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !if (!eq (Type, "U"), 1, 0)>; + def Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !if (!eq (Type, "U"), 1, 0)>; // Defines multiplication patterns where the multiplication is happening on each // Index'ed 8bit of a 32bit scalar value. 
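The Extract and Mul patterns above pull the dot-product operands apart lane by lane (shift by Index*8 with mask 255 for bytes, shift by Index*4 with mask 15 for nibbles) and multiply matching lanes. A reference model of the unsigned dot products they feed (v_dot4_u32_u8 and v_dot8_u32_u4, whose defs follow in the next hunk), assuming the usual accumulate-into-src2 semantics and ignoring clamping and the signed variants:

// Minimal reference model for the unsigned dot products described by the
// 8-bit and 4-bit Extract/Mul patterns: each lane of a and b is extracted,
// lane products are summed into the accumulator c. Clamp is omitted.
#include <cstdint>

static uint32_t udot4(uint32_t a, uint32_t b, uint32_t c) {
  uint32_t acc = c;
  for (int i = 0; i < 4; ++i)
    acc += ((a >> (8 * i)) & 255) * ((b >> (8 * i)) & 255);  // byte lanes
  return acc;
}

static uint32_t udot8(uint32_t a, uint32_t b, uint32_t c) {
  uint32_t acc = c;
  for (int i = 0; i < 8; ++i)
    acc += ((a >> (4 * i)) & 15) * ((b >> (4 * i)) & 15);    // nibble lanes
  return acc;
}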
def Mul#Type#Index#"_4bit" : PatFrag< (ops node:$src0, node:$src1), (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), NonACAMDGPUmul_i24_oneuse, NonACAMDGPUmul_u24_oneuse)) - (!cast<Extract>(#Type#Index#"_4bit") node:$src0), - (!cast<Extract>(#Type#Index#"_4bit") node:$src1))>; + (!cast<Extract>(Type#Index#"_4bit") node:$src0), + (!cast<Extract>(Type#Index#"_4bit") node:$src1))>; } class UDot2Pat<Instruction Inst> : GCNPat < @@ -264,40 +272,30 @@ class SDot2Pat<Instruction Inst> : GCNPat < let IsDOT = 1 in { let SubtargetPredicate = HasDot2Insts in { -def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>; -def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>; -def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>; -def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>; -def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>; +def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", + VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, + AMDGPUfdot2, 1/*ExplicitClamp*/>; +def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", + VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>; +def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", + VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>; +def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", + VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>; +def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", + VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>; } // End SubtargetPredicate = HasDot2Insts let SubtargetPredicate = HasDot1Insts in { -def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>; -def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>; +def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", + VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>; +def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", + VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>; } // End SubtargetPredicate = HasDot1Insts } // End let IsDOT = 1 -multiclass DotPats<SDPatternOperator dot_op, - VOP3PInst dot_inst> { - let SubtargetPredicate = dot_inst.SubtargetPredicate in - def : GCNPat < - (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)), - (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)), - (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), i1:$clamp), - (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1imm $clamp))>; -} - -defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>; -defm : DotPats<int_amdgcn_sdot2, V_DOT2_I32_I16>; -defm : DotPats<int_amdgcn_udot2, V_DOT2_U32_U16>; -defm : DotPats<int_amdgcn_sdot4, V_DOT4_I32_I8>; -defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>; -defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>; -defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>; - def : UDot2Pat<V_DOT2_U32_U16>; def : SDot2Pat<V_DOT2_I32_I16>; @@ -368,12 +366,16 @@ def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, A def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>; let Predicates = [HasMAIInsts] in { + +let isAsCheapAsAMove = 1, isReMaterializable = 1 in { def V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>; def V_ACCVGPR_WRITE_B32 : 
VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite> { let isMoveImm = 1; } +} -let isConvergent = 1 in { +// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. +let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { def V_MFMA_F32_4X4X1F32 : VOP3Inst<"v_mfma_f32_4x4x1f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_4x4x1f32>; def V_MFMA_F32_4X4X4F16 : VOP3Inst<"v_mfma_f32_4x4x4f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_4x4x4f16>; def V_MFMA_I32_4X4X4I8 : VOP3Inst<"v_mfma_i32_4x4x4i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_4x4x4i8>; @@ -394,7 +396,7 @@ def V_MFMA_I32_32X32X4I8 : VOP3Inst<"v_mfma_i32_32x32x4i8", VOPProfileMAI_I3 def V_MFMA_I32_32X32X8I8 : VOP3Inst<"v_mfma_i32_32x32x8i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_32x32x8i8>; def V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>; def V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>; -} // End isConvergent = 1 +} // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 } // End SubtargetPredicate = HasMAIInsts diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 39d18794f947b..aa2fa260e7b52 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1,4 +1,4 @@ -//===-- VOPCInstructions.td - Vector Instruction Defintions ---------------===// +//===-- VOPCInstructions.td - Vector Instruction Definitions --------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -92,9 +92,11 @@ class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[], let mayStore = 0; let hasSideEffects = 0; + let ReadsModeReg = isFloatType<P.Src0VT>.ret; + let VALU = 1; let VOPC = 1; - let Uses = [EXEC]; + let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]); let Defs = !if(DefVcc, [VCC], []); VOPProfile Pfl = P; @@ -738,6 +740,9 @@ multiclass VOPC_CLASS_F64 <string opName> : multiclass VOPCX_CLASS_F64 <string opName> : VOPCX_Class_Pseudos <opName, VOPC_I1_F64_I32, VOPC_F64_I32>; +// cmp_class ignores the FP mode and faithfully reports the unmodified +// source value. +let ReadsModeReg = 0, mayRaiseFPException = 0 in { defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">; defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">; defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">; @@ -747,6 +752,7 @@ let SubtargetPredicate = Has16BitInsts in { defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">; defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; } +} // End ReadsModeReg = 0, mayRaiseFPException = 0 //===----------------------------------------------------------------------===// // V_ICMPIntrinsic Pattern. diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index f208a1134a5a4..f8a83e5f74c0b 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1,4 +1,4 @@ -//===-- VOPInstructions.td - Vector Instruction Defintions ----------------===// +//===-- VOPInstructions.td - Vector Instruction Definitions ---------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -8,6 +8,8 @@ // dummies for outer let class LetDummies { + bit ReadsModeReg; + bit mayRaiseFPException; bit isCommutable; bit isConvertibleToThreeAddress; bit isMoveImm; @@ -35,7 +37,7 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : let hasSideEffects = 0; let UseNamedOperandTable = 1; let VALU = 1; - let Uses = [EXEC]; + let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]); } class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins, @@ -118,7 +120,10 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [], let ClampLo = P.HasClampLo; let ClampHi = P.HasClampHi; - let Uses = [EXEC]; + let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret); + + let mayRaiseFPException = ReadsModeReg; + let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]); let AsmVariantName = AMDGPUAsmVariants.VOP3; let AsmMatchConverter = @@ -160,7 +165,7 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> : VOPProfile Pfl = ps.Pfl; } -// XXX - Is there any reason to distingusih this from regular VOP3 +// XXX - Is there any reason to distinguish this from regular VOP3 // here? class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily> : VOP3_Real<ps, EncodingFamily>; @@ -490,10 +495,14 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : let VALU = 1; let SDWA = 1; - let Uses = [EXEC]; - let SubtargetPredicate = !if(P.HasExtSDWA, HasSDWA, DisableInst); - let AssemblerPredicate = !if(P.HasExtSDWA, HasSDWA, DisableInst); + let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret); + + let mayRaiseFPException = ReadsModeReg; + let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]); + + let SubtargetPredicate = HasSDWA; + let AssemblerPredicate = HasSDWA; let AsmVariantName = !if(P.HasExtSDWA, AMDGPUAsmVariants.SDWA, AMDGPUAsmVariants.Disable); let DecoderNamespace = "SDWA"; @@ -542,8 +551,8 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; - let SubtargetPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA9, DisableInst); - let AssemblerPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA9, DisableInst); + let SubtargetPredicate = HasSDWA9; + let AssemblerPredicate = HasSDWA9; let AsmVariantName = !if(ps.Pfl.HasExtSDWA9, AMDGPUAsmVariants.SDWA9, AMDGPUAsmVariants.Disable); let DecoderNamespace = "SDWA9"; @@ -561,8 +570,8 @@ class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9>; class Base_VOP_SDWA10_Real<VOP_SDWA_Pseudo ps> : Base_VOP_SDWA9_Real<ps> { - let SubtargetPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA10, DisableInst); - let AssemblerPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA10, DisableInst); + let SubtargetPredicate = HasSDWA10; + let AssemblerPredicate = HasSDWA10; let DecoderNamespace = "SDWA10"; } @@ -607,7 +616,11 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let VALU = 1; let DPP = 1; let Size = 8; - let Uses = [EXEC]; + + let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret); + + let mayRaiseFPException = ReadsModeReg; + let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]); let isConvergent = 1; string Mnemonic = OpName; @@ -615,7 +628,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", ""); let SubtargetPredicate = HasDPP; - let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst); + let AssemblerPredicate = HasDPP; let AsmVariantName = 
!if(P.HasExtDPP, AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); @@ -670,7 +683,7 @@ class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16, let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", ""); let SubtargetPredicate = HasDPP; - let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst); + let AssemblerPredicate = HasDPP; let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); @@ -702,7 +715,7 @@ class VOP_DPP8<string OpName, VOPProfile P> : let AsmMatchConverter = "cvtDPP8"; let SubtargetPredicate = HasDPP8; - let AssemblerPredicate = !if(P.HasExt, HasDPP8, DisableInst); + let AssemblerPredicate = HasDPP8; let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
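The practical effect of the MODE modelling threaded through this change can be illustrated by the contrast the diff's own comments draw: arithmetic VOP classes now read MODE because denormal flushing changes what they compute, while cmp_class explicitly does not, since it "faithfully reports the unmodified source value". A hedged sketch, with flush-to-zero modelled by hand because host C++ arithmetic does not flush on its own.

// Illustration of the comments in this diff, not AMD ISA reference code.
#include <cassert>
#include <cmath>

static float flushDenorm(float x, bool ftzEnabled) {
  // What an FP32 op sees for a denormal input when MODE requests flushing.
  if (ftzEnabled && std::fpclassify(x) == FP_SUBNORMAL)
    return std::copysign(0.0f, x);
  return x;
}

static bool cmpClassIsSubnormal(float x) {
  // cmp_class-style test: classifies the unmodified source value,
  // independent of the MODE register.
  return std::fpclassify(x) == FP_SUBNORMAL;
}

int main() {
  float denorm = 1e-40f;                         // subnormal in IEEE binary32
  assert(flushDenorm(denorm, /*ftzEnabled=*/true) == 0.0f);
  assert(cmpClassIsSubnormal(denorm));           // still reported as denormal
  return 0;
}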